From d30e51ae44d4bc745458fa2469868af68faf13ee Mon Sep 17 00:00:00 2001 From: jialongzeng Date: Mon, 30 Sep 2024 01:46:21 +0000 Subject: [PATCH] deploy: 80572b1a5a128abc54e4a4a84d2128330a19d26a --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 27061 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 27456 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..50d9125 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-09-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.18110v1","updated":"2024-09-26T17:52:57Z","published":"2024-09-26T17:52:57Z","title":"Open-World Evaluation for Retrieving Diverse Perspectives","summary":" We study retrieving a set of documents that covers various perspectives on a\ncomplex and contentious question (e.g., will ChatGPT do more harm than good?).\nWe curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS),\nwhere each example consists of a question and diverse perspectives associated\nwith the question, sourced from survey questions and debate websites. On this\ndata, retrievers paired with a corpus are evaluated to surface a document set\nthat contains diverse perspectives. Our framing diverges from most retrieval\ntasks in that document relevancy cannot be decided by simple string matches to\nreferences. Instead, we build a language model based automatic evaluator that\ndecides whether each retrieved document contains a perspective. This allows us\nto evaluate the performance of three different types of corpus (Wikipedia, web\nsnapshot, and corpus constructed on the fly with retrieved pages from the\nsearch engine) paired with retrievers. Retrieving diverse documents remains\nchallenging, with the outputs from existing retrievers covering all\nperspectives on only 33.74% of the examples. We further study the impact of\nquery expansion and diversity-focused reranking approaches and analyze\nretriever sycophancy. Together, we lay the foundation for future studies in\nretrieval diversity handling complex queries.\n","authors":["Hung-Ting Chen","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2409.18110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12822v3","updated":"2024-09-26T17:39:44Z","published":"2024-06-18T17:43:47Z","title":"Is It Good Data for Multilingual Instruction Tuning or Just Bad\n Multilingual Evaluation for Large Language Models?","summary":" Multilingual large language models are designed, claimed, and expected to\ncater to speakers of varied languages. We hypothesise that the current\npractices of fine-tuning and evaluating these models may not perfectly align\nwith this objective owing to a heavy reliance on translation, which cannot\ncover language-specific knowledge but can introduce translation defects. It\nremains unknown whether the nature of the instruction data has an impact on the\nmodel output; conversely, it is questionable whether translated test sets can\ncapture such nuances. Due to the often coupled practices of using translated\ndata in both stages, such imperfections could have been overlooked. This work\ninvestigates these issues using controlled native or translated data during the\ninstruction tuning and evaluation stages. We show that native or generation\nbenchmarks reveal a notable difference between native and translated\ninstruction data especially when model performance is high, whereas other types\nof test sets cannot. The comparison between round-trip and single-pass\ntranslations reflects the importance of knowledge from language-native\nresources. Finally, we demonstrate that regularization is beneficial to\nbridging this gap on structured but not generative tasks.\n","authors":["Pinzhen Chen","Simon Yu","Zhicheng Guo","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2406.12822v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18044v1","updated":"2024-09-26T16:46:46Z","published":"2024-09-26T16:46:46Z","title":"Unveiling the Role of Pretraining in Direct Speech Translation","summary":" Direct speech-to-text translation systems encounter an important drawback in\ndata scarcity. A common solution consists on pretraining the encoder on\nautomatic speech recognition, hence losing efficiency in the training process.\nIn this study, we compare the training dynamics of a system using a pretrained\nencoder, the conventional approach, and one trained from scratch. We observe\nthat, throughout the training, the randomly initialized model struggles to\nincorporate information from the speech inputs for its predictions. Hence, we\nhypothesize that this issue stems from the difficulty of effectively training\nan encoder for direct speech translation. While a model trained from scratch\nneeds to learn acoustic and semantic modeling simultaneously, a pretrained one\ncan just focus on the latter. Based on these findings, we propose a subtle\nchange in the decoder cross-attention to integrate source information from\nearlier steps in training. We show that with this change, the model trained\nfrom scratch can achieve comparable performance to the pretrained one, while\nreducing the training time.\n","authors":["Belen Alastruey","Gerard I. Gállego","Marta R. Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2409.18044v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18042v1","updated":"2024-09-26T16:44:02Z","published":"2024-09-26T16:44:02Z","title":"EMOVA: Empowering Language Models to See, Hear and Speak with Vivid\n Emotions","summary":" GPT-4o, an omni-modal model that enables vocal conversations with diverse\nemotions and tones, marks a milestone for omni-modal foundation models.\nHowever, empowering Large Language Models to perceive and generate images,\ntexts, and speeches end-to-end with publicly available data remains challenging\nin the open-source community. Existing vision-language models rely on external\ntools for the speech processing, while speech-language models still suffer from\nlimited or even without vision-understanding abilities. To address this gap, we\npropose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large\nLanguage Models with end-to-end speech capabilities while maintaining the\nleading vision-language performance. With a semantic-acoustic disentangled\nspeech tokenizer, we notice surprisingly that omni-modal alignment can further\nenhance vision-language and speech abilities compared with the corresponding\nbi-modal aligned counterparts. Moreover, a lightweight style module is proposed\nfor flexible speech style controls (e.g., emotions and pitches). For the first\ntime, EMOVA achieves state-of-the-art performance on both the vision-language\nand speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue\nwith vivid emotions.\n","authors":["Kai Chen","Yunhao Gou","Runhui Huang","Zhili Liu","Daxin Tan","Jing Xu","Chunwei Wang","Yi Zhu","Yihan Zeng","Kuo Yang","Dingdong Wang","Kun Xiang","Haoyuan Li","Haoli Bai","Jianhua Han","Xiaohui Li","Weike Jin","Nian Xie","Yu Zhang","James T. Kwok","Hengshuang Zhao","Xiaodan Liang","Dit-Yan Yeung","Xiao Chen","Zhenguo Li","Wei Zhang","Qun Liu","Lanqing Hong","Lu Hou","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18042v1.pdf","comment":"Project Page: https://emova-ollm.github.io/"},{"id":"http://arxiv.org/abs/2409.18033v1","updated":"2024-09-26T16:38:56Z","published":"2024-09-26T16:38:56Z","title":"Automated Detection and Analysis of Power Words in Persuasive Text Using\n Natural Language Processing","summary":" Power words are terms that evoke strong emotional responses and significantly\ninfluence readers' behavior, playing a crucial role in fields like marketing,\npolitics, and motivational writing. This study proposes a methodology for the\nautomated detection and analysis of power words in persuasive text using a\ncustom lexicon and the TextBlob library in Python. By identifying the presence\nand frequency of power words within a given text, we aim to classify and\nanalyze their impact on sentiment and reader engagement. This research examines\ndiverse datasets across various domains to provide insights into the\neffectiveness of power words, offering practical applications for content\ncreators, advertisers, and policymakers.\n","authors":["Sahil Garje"],"pdf_url":"https://arxiv.org/pdf/2409.18033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13731v3","updated":"2024-09-26T16:34:35Z","published":"2024-09-10T02:00:28Z","title":"KAG: Boosting LLMs in Professional Domains via Knowledge Augmented\n Generation","summary":" The recently developed retrieval-augmented generation (RAG) technology has\nenabled the efficient construction of domain-specific applications. However, it\nalso has limitations, including the gap between vector similarity and the\nrelevance of knowledge reasoning, as well as insensitivity to knowledge logic,\nsuch as numerical values, temporal relations, expert rules, and others, which\nhinder the effectiveness of professional knowledge services. In this work, we\nintroduce a professional domain knowledge service framework called Knowledge\nAugmented Generation (KAG). KAG is designed to address the aforementioned\nchallenges with the motivation of making full use of the advantages of\nknowledge graph(KG) and vector retrieval, and to improve generation and\nreasoning performance by bidirectionally enhancing large language models (LLMs)\nand KGs through five key aspects: (1) LLM-friendly knowledge representation,\n(2) mutual-indexing between knowledge graphs and original chunks, (3)\nlogical-form-guided hybrid reasoning engine, (4) knowledge alignment with\nsemantic reasoning, and (5) model capability enhancement for KAG. We compared\nKAG with existing RAG methods in multihop question answering and found that it\nsignificantly outperforms state-of-theart methods, achieving a relative\nimprovement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We\nhave successfully applied KAG to two professional knowledge Q&A tasks of Ant\nGroup, including E-Government Q&A and E-Health Q&A, achieving significant\nimprovement in professionalism compared to RAG methods.\n","authors":["Lei Liang","Mengshu Sun","Zhengke Gui","Zhongshu Zhu","Zhouyu Jiang","Ling Zhong","Yuan Qu","Peilong Zhao","Zhongpu Bo","Jin Yang","Huaidong Xiong","Lin Yuan","Jun Xu","Zaoyang Wang","Zhiqiang Zhang","Wen Zhang","Huajun Chen","Wenguang Chen","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.13731v3.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2409.18028v1","updated":"2024-09-26T16:34:35Z","published":"2024-09-26T16:34:35Z","title":"Compositional Hardness of Code in Large Language Models -- A\n Probabilistic Perspective","summary":" A common practice in large language model (LLM) usage for complex analytical\ntasks such as code generation, is to sample a solution for the entire task\nwithin the model's context window. Previous works have shown that subtask\ndecomposition within the model's context (chain of thought), is beneficial for\nsolving such tasks. In this work, we point a limitation of LLMs' ability to\nperform several sub-tasks within the same context window - an in-context\nhardness of composition, pointing to an advantage for distributing a decomposed\nproblem in a multi-agent system of LLMs. The hardness of composition is\nquantified by a generation complexity metric, i.e., the number of LLM\ngenerations required to sample at least one correct solution. We find a gap\nbetween the generation complexity of solving a compositional problem within the\nsame context relative to distributing it among multiple agents, that increases\nexponentially with the solution's length. We prove our results theoretically\nand demonstrate them empirically.\n","authors":["Yotam Wolf","Binyamin Rothberg","Dorin Shteyman","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2409.18028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18023v1","updated":"2024-09-26T16:31:50Z","published":"2024-09-26T16:31:50Z","title":"DARE: Diverse Visual Question Answering with Robustness Evaluation","summary":" Vision Language Models (VLMs) extend remarkable capabilities of text-only\nlarge language models and vision-only models, and are able to learn from and\nprocess multi-modal vision-text input. While modern VLMs perform well on a\nnumber of standard image classification and image-text matching tasks, they\nstill struggle with a number of crucial vision-language (VL) reasoning\nabilities such as counting and spatial reasoning. Moreover, while they might be\nvery brittle to small variations in instructions and/or evaluation protocols,\nexisting benchmarks fail to evaluate their robustness (or rather the lack of\nit). In order to couple challenging VL scenarios with comprehensive robustness\nevaluation, we introduce DARE, Diverse Visual Question Answering with\nRobustness Evaluation, a carefully created and curated multiple-choice VQA\nbenchmark. DARE evaluates VLM performance on five diverse categories and\nincludes four robustness-oriented evaluations based on the variations of:\nprompts, the subsets of answer options, the output format and the number of\ncorrect answers. Among a spectrum of other findings, we report that\nstate-of-the-art VLMs still struggle with questions in most categories and are\nunable to consistently deliver their peak performance across the tested\nrobustness evaluations. The worst case performance across the subsets of\noptions is up to 34% below the performance in the standard case. The robustness\nof the open-source VLMs such as LLaVA 1.6 and Idefics2 cannot match the\nclosed-source models such as GPT-4 and Gemini, but even the latter remain very\nbrittle to different variations.\n","authors":["Hannah Sterz","Jonas Pfeiffer","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2409.18023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18006v1","updated":"2024-09-26T16:15:14Z","published":"2024-09-26T16:15:14Z","title":"Multilingual Evaluation of Long Context Retrieval and Reasoning","summary":" Recent large language models (LLMs) demonstrate impressive capabilities in\nhandling long contexts, some exhibiting near-perfect recall on synthetic\nretrieval tasks. However, these evaluations have mainly focused on English text\nand involved a single target sentence within lengthy contexts. Our work\ninvestigates how LLM performance generalizes to multilingual settings with\nmultiple hidden target sentences. We comprehensively evaluate several\nlong-context LLMs on retrieval and reasoning tasks across five languages:\nEnglish, Vietnamese, Indonesian, Swahili, and Somali. These languages share the\nLatin script but belong to distinct language families and resource levels. Our\nanalysis reveals a significant performance gap between languages. The\nbest-performing models such as Gemini-1.5 and GPT-4o, achieve around 96%\naccuracy in English to around 36% in Somali with a single target sentence.\nHowever, this accuracy drops to 40% in English and 0% in Somali when dealing\nwith three target sentences. Our findings highlight the challenges long-context\nLLMs face when processing longer contexts, an increase in the number of target\nsentences, or languages of lower resource levels.\n","authors":["Ameeta Agrawal","Andy Dang","Sina Bagheri Nezhad","Rhitabrat Pokharel","Russell Scheinberg"],"pdf_url":"https://arxiv.org/pdf/2409.18006v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.17990v1","updated":"2024-09-26T16:02:00Z","published":"2024-09-26T16:02:00Z","title":"Extracting Affect Aggregates from Longitudinal Social Media Data with\n Temporal Adapters for Large Language Models","summary":" This paper proposes temporally aligned Large Language Models (LLMs) as a tool\nfor longitudinal analysis of social media data. We fine-tune Temporal Adapters\nfor Llama 3 8B on full timelines from a panel of British Twitter users, and\nextract longitudinal aggregates of emotions and attitudes with established\nquestionnaires. We validate our estimates against representative British survey\ndata and find strong positive, significant correlations for several collective\nemotions. The obtained estimates are robust across multiple training seeds and\nprompt formulations, and in line with collective emotions extracted using a\ntraditional classification model trained on labeled data. To the best of our\nknowledge, this is the first work to extend the analysis of affect in LLMs to a\nlongitudinal setting through Temporal Adapters. Our work enables new approaches\ntowards the longitudinal analysis of social media data.\n","authors":["Georg Ahnert","Max Pellert","David Garcia","Markus Strohmaier"],"pdf_url":"https://arxiv.org/pdf/2409.17990v1.pdf","comment":"Code available at https://github.com/dess-mannheim/temporal-adapters"},{"id":"http://arxiv.org/abs/2409.17972v1","updated":"2024-09-26T15:47:42Z","published":"2024-09-26T15:47:42Z","title":"BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and\n Adaptive Disambiguate based Efficient Tree Search","summary":" Large Language Models (LLMs) have exhibited exceptional performance across a\nbroad range of tasks and domains. However, they still encounter difficulties in\nsolving mathematical problems due to the rigorous and logical nature of\nmathematics. Previous studies have employed techniques such as supervised\nfine-tuning (SFT), prompt engineering, and search-based methods to improve the\nmathematical problem-solving abilities of LLMs. Despite these efforts, their\nperformance remains suboptimal and demands substantial computational resources.\nTo address this issue, we propose a novel approach, BEATS, to enhance\nmathematical problem-solving abilities. Our method leverages newly designed\nprompts that guide the model to iteratively rewrite, advance by one step, and\ngenerate answers based on previous steps. Additionally, we introduce a new\nback-verification technique that uses LLMs to validate the correctness of the\ngenerated answers. Furthermore, we employ a pruning tree search to optimize\nsearch time while achieving strong performance. Notably, our method improves\nQwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the\nMATH benchmark.\n","authors":["Linzhuang Sun","Hao Liang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17958v1","updated":"2024-09-26T15:36:10Z","published":"2024-09-26T15:36:10Z","title":"The Hard Positive Truth about Vision-Language Compositionality","summary":" Several benchmarks have concluded that our best vision-language models (e.g.,\nCLIP) are lacking in compositionality. Given an image, these benchmarks probe a\nmodel's ability to identify its associated caption amongst a set of\ncompositional distractors. In response, a surge of recent proposals show\nimprovements by finetuning CLIP with distractors as hard negatives. Our\ninvestigations reveal that these improvements have, in fact, been significantly\noverstated -- because existing benchmarks do not probe whether finetuned\nvision-language models remain invariant to hard positives. By curating an\nevaluation dataset with 112,382 hard negatives and hard positives, we uncover\nthat including hard positives decreases CLIP's performance by 12.9%, while\nhumans perform effortlessly at 99%. CLIP finetuned with hard negatives results\nin an even larger decrease, up to 38.7%. With this finding, we then produce a\n1,775,259 image-text training set with both hard negative and hard positive\ncaptions. By training with both, we see improvements on existing benchmarks\nwhile simultaneously improving performance on hard positives, indicating a more\nrobust improvement in compositionality. Our work suggests the need for future\nresearch to rigorously test and improve CLIP's understanding of semantic\nrelationships between related \"positive\" concepts.\n","authors":["Amita Kamath","Cheng-Yu Hsieh","Kai-Wei Chang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2409.17958v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2305.11231v2","updated":"2024-09-26T15:32:47Z","published":"2023-05-18T18:00:44Z","title":"Recent Trends in Unsupervised Summarization","summary":" Unsupervised summarization is a powerful technique that enables training\nsummarizing models without requiring labeled datasets. This survey covers\ndifferent recent techniques and models used for unsupervised summarization. We\ncover extractive, abstractive, and hybrid models and strategies used to achieve\nunsupervised summarization. While the main focus of this survey is on recent\nresearch, we also cover some of the important previous research. We\nadditionally introduce a taxonomy, classifying different research based on\ntheir approach to unsupervised training. Finally, we discuss the current\napproaches and mention some datasets and evaluation methods.\n","authors":["Mohammad Khosravani","Amine Trabelsi"],"pdf_url":"https://arxiv.org/pdf/2305.11231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17946v1","updated":"2024-09-26T15:20:37Z","published":"2024-09-26T15:20:37Z","title":"Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge\n Distillation","summary":" Despite being widely applied due to their exceptional capabilities, Large\nLanguage Models (LLMs) have been proven to be vulnerable to backdoor attacks.\nThese attacks introduce targeted vulnerabilities into LLMs by poisoning\ntraining samples and full-parameter fine-tuning. However, this kind of backdoor\nattack is limited since they require significant computational resources,\nespecially as the size of LLMs increases. Besides, parameter-efficient\nfine-tuning (PEFT) offers an alternative but the restricted parameter updating\nmay impede the alignment of triggers with target labels. In this study, we\nfirst verify that backdoor attacks with PEFT may encounter challenges in\nachieving feasible performance. To address these issues and improve the\neffectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack\nalgorithm from weak to strong based on contrastive knowledge distillation\n(W2SAttack). Specifically, we poison small-scale language models through\nfull-parameter fine-tuning to serve as the teacher model. The teacher model\nthen covertly transfers the backdoor to the large-scale student model through\ncontrastive knowledge distillation, which employs PEFT. Theoretical analysis\nreveals that W2SAttack has the potential to augment the effectiveness of\nbackdoor attacks. We demonstrate the superior performance of W2SAttack on\nclassification tasks across four language models, four backdoor attack\nalgorithms, and two different architectures of teacher models. Experimental\nresults indicate success rates close to 100% for backdoor attacks targeting\nPEFT.\n","authors":["Shuai Zhao","Leilei Gan","Zhongliang Guo","Xiaobao Wu","Luwei Xiao","Xiaoyu Xu","Cong-Duy Nguyen","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2409.17946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17929v1","updated":"2024-09-26T15:08:17Z","published":"2024-09-26T15:08:17Z","title":"The Lou Dataset -- Exploring the Impact of Gender-Fair Language in\n German Text Classification","summary":" Gender-fair language, an evolving German linguistic variation, fosters\ninclusion by addressing all genders or using neutral forms. Nevertheless, there\nis a significant lack of resources to assess the impact of this linguistic\nshift on classification using language models (LMs), which are probably not\ntrained on such variations. To address this gap, we present Lou, the first\ndataset featuring high-quality reformulations for German text classification\ncovering seven tasks, like stance detection and toxicity classification.\nEvaluating 16 mono- and multi-lingual LMs on Lou shows that gender-fair\nlanguage substantially impacts predictions by flipping labels, reducing\ncertainty, and altering attention patterns. However, existing evaluations\nremain valid, as LM rankings of original and reformulated instances do not\nsignificantly differ. While we offer initial insights on the effect on German\ntext classification, the findings likely apply to other languages, as\nconsistent patterns were observed in multi-lingual and English LMs.\n","authors":["Andreas Waldis","Joel Birrer","Anne Lauscher","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2409.17929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17928v1","updated":"2024-09-26T15:07:30Z","published":"2024-09-26T15:07:30Z","title":"Pioneering Reliable Assessment in Text-to-Image Knowledge Editing:\n Leveraging a Fine-Grained Dataset and an Innovative Criterion","summary":" During pre-training, the Text-to-Image (T2I) diffusion models encode factual\nknowledge into their parameters. These parameterized facts enable realistic\nimage generation, but they may become obsolete over time, thereby\nmisrepresenting the current state of the world. Knowledge editing techniques\naim to update model knowledge in a targeted way. However, facing the dual\nchallenges posed by inadequate editing datasets and unreliable evaluation\ncriterion, the development of T2I knowledge editing encounter difficulties in\neffectively generalizing injected knowledge. In this work, we design a T2I\nknowledge editing framework by comprehensively spanning on three phases: First,\nwe curate a dataset \\textbf{CAKE}, comprising paraphrase and multi-object test,\nto enable more fine-grained assessment on knowledge generalization. Second, we\npropose a novel criterion, \\textbf{adaptive CLIP threshold}, to effectively\nfilter out false successful images under the current criterion and achieve\nreliable editing evaluation. Finally, we introduce \\textbf{MPE}, a simple but\neffective approach for T2I knowledge editing. Instead of tuning parameters, MPE\nprecisely recognizes and edits the outdated part of the conditioning\ntext-prompt to accommodate the up-to-date knowledge. A straightforward\nimplementation of MPE (Based on in-context learning) exhibits better overall\nperformance than previous model editors. We hope these efforts can further\npromote faithful evaluation of T2I knowledge editing methods.\n","authors":["Hengrui Gu","Kaixiong Zhou","Yili Wang","Ruobing Wang","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17928v1.pdf","comment":"EMNLP24 Findings"},{"id":"http://arxiv.org/abs/2409.17912v1","updated":"2024-09-26T14:56:38Z","published":"2024-09-26T14:56:38Z","title":"Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan\n Arabic Dialect","summary":" We introduce Atlas-Chat, the first-ever collection of large language models\nspecifically developed for dialectal Arabic. Focusing on Moroccan Arabic, also\nknown as Darija, we construct our instruction dataset by consolidating existing\nDarija language resources, creating novel datasets both manually and\nsynthetically, and translating English instructions with stringent quality\ncontrol. Atlas-Chat-9B and 2B models, fine-tuned on the dataset, exhibit\nsuperior ability in following Darija instructions and performing standard NLP\ntasks. Notably, our models outperform both state-of-the-art and\nArabic-specialized LLMs like LLaMa, Jais, and AceGPT, e.g., achieving a 13%\nperformance boost over a larger 13B model on DarijaMMLU, in our newly\nintroduced evaluation suite for Darija covering both discriminative and\ngenerative tasks. Furthermore, we perform an experimental analysis of various\nfine-tuning strategies and base model choices to determine optimal\nconfigurations. All our resources are publicly accessible, and we believe our\nwork offers comprehensive design methodologies of instruction-tuning for\nlow-resource language variants, which are often neglected in favor of data-rich\nlanguages by contemporary LLMs.\n","authors":["Guokan Shang","Hadi Abdine","Yousef Khoubrane","Amr Mohamed","Yassine Abbahaddou","Sofiane Ennadir","Imane Momayiz","Xuguang Ren","Eric Moulines","Preslav Nakov","Michalis Vazirgiannis","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2409.17912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18789v2","updated":"2024-09-26T14:48:42Z","published":"2024-07-26T14:52:37Z","title":"Granularity is crucial when applying differential privacy to text: An\n investigation for neural machine translation","summary":" Applying differential privacy (DP) by means of the DP-SGD algorithm to\nprotect individual data points during training is becoming increasingly popular\nin NLP. However, the choice of granularity at which DP is applied is often\nneglected. For example, neural machine translation (NMT) typically operates on\nthe sentence-level granularity. From the perspective of DP, this setup assumes\nthat each sentence belongs to a single person and any two sentences in the\ntraining dataset are independent. This assumption is however violated in many\nreal-world NMT datasets, e.g., those including dialogues. For proper\napplication of DP we thus must shift from sentences to entire documents. In\nthis paper, we investigate NMT at both the sentence and document levels,\nanalyzing the privacy/utility trade-off for both scenarios, and evaluating the\nrisks of not using the appropriate privacy granularity in terms of leaking\npersonally identifiable information (PII). Our findings indicate that the\ndocument-level NMT system is more resistant to membership inference attacks,\nemphasizing the significance of using the appropriate granularity when working\nwith DP.\n","authors":["Doan Nam Long Vu","Timour Igamberdiev","Ivan Habernal"],"pdf_url":"https://arxiv.org/pdf/2407.18789v2.pdf","comment":"Accepted at EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2409.17892v1","updated":"2024-09-26T14:40:45Z","published":"2024-09-26T14:40:45Z","title":"EMMA-500: Enhancing Massively Multilingual Adaptation of Large Language\n Models","summary":" In this work, we introduce EMMA-500, a large-scale multilingual language\nmodel continue-trained on texts across 546 languages designed for enhanced\nmultilingual performance, focusing on improving language coverage for\nlow-resource languages. To facilitate continual pre-training, we compile the\nMaLA corpus, a comprehensive multilingual dataset enriched with curated\ndatasets across diverse domains. Leveraging this corpus, we conduct extensive\ncontinual pre-training of the Llama 2 7B model, resulting in EMMA-500, which\ndemonstrates robust performance across a wide collection of benchmarks,\nincluding a comprehensive set of multilingual tasks and PolyWrite, an\nopen-ended generation benchmark developed in this study. Our results highlight\nthe effectiveness of continual pre-training in expanding large language models'\nlanguage capacity, particularly for underrepresented languages, demonstrating\nsignificant gains in cross-lingual transfer, task generalization, and language\nadaptability.\n","authors":["Shaoxiong Ji","Zihao Li","Indraneil Paul","Jaakko Paavola","Peiqin Lin","Pinzhen Chen","Dayyán O'Brien","Hengyu Luo","Hinrich Schütze","Jörg Tiedemann","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2409.17892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09577v2","updated":"2024-09-26T14:34:53Z","published":"2024-04-15T08:38:43Z","title":"Transformers, Contextualism, and Polysemy","summary":" The transformer architecture, introduced by Vaswani et al. (2017), is at the\nheart of the remarkable recent progress in the development of language models,\nincluding widely-used chatbots such as Chat-GPT and Claude. In this paper, I\nargue that we can extract from the way the transformer architecture works a\ntheory of the relationship between context and meaning. I call this the\ntransformer theory, and I argue that it is novel with regard to two related\nphilosophical debates: the contextualism debate regarding the extent of\ncontext-sensitivity across natural language, and the polysemy debate regarding\nhow polysemy should be captured within an account of word meaning.\n","authors":["Jumbly Grindrod"],"pdf_url":"https://arxiv.org/pdf/2404.09577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17834v1","updated":"2024-09-26T13:36:00Z","published":"2024-09-26T13:36:00Z","title":"PEDRO: Parameter-Efficient Fine-tuning with Prompt DEpenDent\n Representation MOdification","summary":" Due to their substantial sizes, large language models (LLMs) are typically\ndeployed within a single-backbone multi-tenant framework. In this setup, a\nsingle instance of an LLM backbone must cater to multiple users or tasks\nthrough the application of various parameter-efficient fine-tuning (PEFT)\nmodels. Despite the availability of numerous effective PEFT techniques such as\nLoRA, there remains a need for a PEFT approach that achieves both high\nefficiency during inference and competitive performance on downstream tasks. In\nthis research, we introduce a new and straightforward PEFT methodology named\n\\underline{P}rompt D\\underline{E}pen\\underline{D}ent \\underline{R}epresentation\nM\\underline{O}dification (PEDRO). The proposed method involves integrating a\nlightweight vector generator into each Transformer layer, which generates\nvectors contingent upon the input prompts. These vectors then modify the hidden\nrepresentations created by the LLM through a dot product operation, thereby\ninfluencing the semantic output and generated content of the model. Extensive\nexperimentation across a variety of tasks indicates that: (a) PEDRO surpasses\nrecent PEFT benchmarks when using a similar number of tunable parameters. (b)\nUnder the single-backbone multi-tenant deployment model, PEDRO exhibits\nsuperior efficiency compared to LoRA, indicating significant industrial\npotential.\n","authors":["Tianfang Xie","Tianjing Li","Wei Zhu","Wei Han","Yi Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.17834v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.18203"},{"id":"http://arxiv.org/abs/2409.17827v1","updated":"2024-09-26T13:26:46Z","published":"2024-09-26T13:26:46Z","title":"BeanCounter: A low-toxicity, large-scale, and open dataset of\n business-oriented text","summary":" Many of the recent breakthroughs in language modeling have resulted from\nscaling effectively the same model architecture to larger datasets. In this\nvein, recent work has highlighted performance gains from increasing training\ndataset size and quality, suggesting a need for novel sources of large-scale\ndatasets. In this work, we introduce BeanCounter, a public dataset consisting\nof more than 159B tokens extracted from businesses' disclosures. We show that\nthis data is indeed novel: less than 0.1% of BeanCounter appears in Common\nCrawl-based datasets and it is an order of magnitude larger than datasets\nrelying on similar sources. Given the data's provenance, we hypothesize that\nBeanCounter is comparatively more factual and less toxic than web-based\ndatasets. Exploring this hypothesis, we find that many demographic identities\noccur with similar prevalence in BeanCounter but with significantly less toxic\ncontext relative to other datasets. To demonstrate the utility of BeanCounter,\nwe evaluate and compare two LLMs continually pre-trained on BeanCounter with\ntheir base models. We find an 18-33% reduction in toxic generation and improved\nperformance within the finance domain for the continually pretrained models.\nCollectively, our work suggests that BeanCounter is a novel source of\nlow-toxicity and high-quality domain-specific data with sufficient scale to\ntrain multi-billion parameter LLMs.\n","authors":["Siyan Wang","Bradford Levy"],"pdf_url":"https://arxiv.org/pdf/2409.17827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16934v2","updated":"2024-09-26T13:22:37Z","published":"2024-09-25T13:45:23Z","title":"Investigating OCR-Sensitive Neurons to Improve Entity Recognition in\n Historical Documents","summary":" This paper investigates the presence of OCR-sensitive neurons within the\nTransformer architecture and their influence on named entity recognition (NER)\nperformance on historical documents. By analysing neuron activation patterns in\nresponse to clean and noisy text inputs, we identify and then neutralise\nOCR-sensitive neurons to improve model performance. Based on two open access\nlarge language models (Llama2 and Mistral), experiments demonstrate the\nexistence of OCR-sensitive regions and show improvements in NER performance on\nhistorical newspapers and classical commentaries, highlighting the potential of\ntargeted neuron modulation to improve models' performance on noisy text.\n","authors":["Emanuela Boros","Maud Ehrmann"],"pdf_url":"https://arxiv.org/pdf/2409.16934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17819v1","updated":"2024-09-26T13:15:18Z","published":"2024-09-26T13:15:18Z","title":"Inference-Time Language Model Alignment via Integrated Value Guidance","summary":" Large language models are typically fine-tuned to align with human\npreferences, but tuning large models is computationally intensive and complex.\nIn this work, we introduce $\\textit{Integrated Value Guidance}$ (IVG), a method\nthat uses implicit and explicit value functions to guide language model\ndecoding at token and chunk-level respectively, efficiently aligning large\nlanguage models purely at inference time. This approach circumvents the\ncomplexities of direct fine-tuning and outperforms traditional methods.\nEmpirically, we demonstrate the versatility of IVG across various tasks. In\ncontrolled sentiment generation and summarization tasks, our method\nsignificantly improves the alignment of large models using inference-time\nguidance from $\\texttt{gpt2}$-based value functions. Moreover, in a more\nchallenging instruction-following benchmark AlpacaEval 2.0, we show that both\nspecifically tuned and off-the-shelf value functions greatly improve the\nlength-controlled win rates of large models against $\\texttt{gpt-4-turbo}$\n(e.g., $19.51\\% \\rightarrow 26.51\\%$ for $\\texttt{Mistral-7B-Instruct-v0.2}$\nand $25.58\\% \\rightarrow 33.75\\%$ for $\\texttt{Mixtral-8x7B-Instruct-v0.1}$\nwith Tulu guidance).\n","authors":["Zhixuan Liu","Zhanhui Zhou","Yuanfu Wang","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2409.17819v1.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17791v1","updated":"2024-09-26T12:37:26Z","published":"2024-09-26T12:37:26Z","title":"Self-supervised Preference Optimization: Enhance Your Language Model\n with Preference Degree Awareness","summary":" Recently, there has been significant interest in replacing the reward model\nin Reinforcement Learning with Human Feedback (RLHF) methods for Large Language\nModels (LLMs), such as Direct Preference Optimization (DPO) and its variants.\nThese approaches commonly use a binary cross-entropy mechanism on pairwise\nsamples, i.e., minimizing and maximizing the loss based on preferred or\ndis-preferred responses, respectively. However, while this training strategy\nomits the reward model, it also overlooks the varying preference degrees within\ndifferent responses. We hypothesize that this is a key factor hindering LLMs\nfrom sufficiently understanding human preferences. To address this problem, we\npropose a novel Self-supervised Preference Optimization (SPO) framework, which\nconstructs a self-supervised preference degree loss combined with the alignment\nloss, thereby helping LLMs improve their ability to understand the degree of\npreference. Extensive experiments are conducted on two widely used datasets of\ndifferent tasks. The results demonstrate that SPO can be seamlessly integrated\nwith existing preference optimization methods and significantly boost their\nperformance to achieve state-of-the-art performance. We also conduct detailed\nanalyses to offer comprehensive insights into SPO, which verifies its\neffectiveness. The code is available at https://github.com/lijian16/SPO.\n","authors":["Jian Li","Haojing Huang","Yujia Zhang","Pengfei Xu","Xi Chen","Rui Song","Lida Shi","Jingwen Wang","Hao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17791v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2403.15676v4","updated":"2024-09-26T12:18:21Z","published":"2024-03-23T01:44:57Z","title":"AC4: Algebraic Computation Checker for Circuit Constraints in ZKPs","summary":" Zero-knowledge proof (ZKP) systems have surged attention and held a\nfundamental role in contemporary cryptography. Zero-knowledge succinct\nnon-interactive argument of knowledge (zk-SNARK) protocols dominate the ZKP\nusage, implemented through arithmetic circuit programming paradigm. However,\nunderconstrained or overconstrained circuits may lead to bugs. The former\nrefers to circuits that lack the necessary constraints, resulting in unexpected\nsolutions and causing the verifier to accept a bogus witness, and the latter\nrefers to circuits that are constrained excessively, resulting in lacking\nnecessary solutions and causing the verifier to accept no witness. This paper\nintroduces a novel approach for pinpointing two distinct types of bugs in ZKP\ncircuits. The method involves encoding the arithmetic circuit constraints to\npolynomial equation systems and solving them over finite fields by the computer\nalgebra system. The classification of verification results is refined, greatly\nenhancing the expressive power of the system. A tool, AC4, is proposed to\nrepresent the implementation of the method. Experiments show that AC4\ndemonstrates a increase in the checked ratio, showing a 29% improvement over\nPicus, a checker for Circom circuits, and a 10% improvement over\nhalo2-analyzer, a checker for halo2 circuits. Within a solvable range, the\nchecking time has also exhibited noticeable improvement, demonstrating a\nmagnitude increase compared to previous efforts.\n","authors":["Hao Chen","Guoqiang Li","Minyu Chen","Ruibang Liu","Sinka Gao"],"pdf_url":"https://arxiv.org/pdf/2403.15676v4.pdf","comment":"24 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17774v1","updated":"2024-09-26T12:11:28Z","published":"2024-09-26T12:11:28Z","title":"Faithfulness and the Notion of Adversarial Sensitivity in NLP\n Explanations","summary":" Faithfulness is arguably the most critical metric to assess the reliability\nof explainable AI. In NLP, current methods for faithfulness evaluation are\nfraught with discrepancies and biases, often failing to capture the true\nreasoning of models. We introduce Adversarial Sensitivity as a novel approach\nto faithfulness evaluation, focusing on the explainer's response when the model\nis under adversarial attack. Our method accounts for the faithfulness of\nexplainers by capturing sensitivity to adversarial input changes. This work\naddresses significant limitations in existing evaluation techniques, and\nfurthermore, quantifies faithfulness from a crucial yet underexplored paradigm.\n","authors":["Supriya Manna","Niladri Sett"],"pdf_url":"https://arxiv.org/pdf/2409.17774v1.pdf","comment":"Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP"},{"id":"http://arxiv.org/abs/2409.13832v2","updated":"2024-09-26T12:07:20Z","published":"2024-09-20T18:18:14Z","title":"GTSinger: A Global Multi-Technique Singing Corpus with Realistic Music\n Scores for All Singing Tasks","summary":" The scarcity of high-quality and multi-task singing datasets significantly\nhinders the development of diverse controllable and personalized singing tasks,\nas existing singing datasets suffer from low quality, limited diversity of\nlanguages and singers, absence of multi-technique information and realistic\nmusic scores, and poor task suitability. To tackle these problems, we present\nGTSinger, a large global, multi-technique, free-to-use, high-quality singing\ncorpus with realistic music scores, designed for all singing tasks, along with\nits benchmarks. Particularly, (1) we collect 80.59 hours of high-quality\nsinging voices, forming the largest recorded singing dataset; (2) 20\nprofessional singers across nine widely spoken languages offer diverse timbres\nand styles; (3) we provide controlled comparison and phoneme-level annotations\nof six commonly used singing techniques, helping technique modeling and\ncontrol; (4) GTSinger offers realistic music scores, assisting real-world\nmusical composition; (5) singing voices are accompanied by manual\nphoneme-to-audio alignments, global style labels, and 16.16 hours of paired\nspeech for various singing tasks. Moreover, to facilitate the use of GTSinger,\nwe conduct four benchmark experiments: technique-controllable singing voice\nsynthesis, technique recognition, style transfer, and speech-to-singing\nconversion. The corpus and demos can be found at http://gtsinger.github.io. We\nprovide the dataset and the code for processing data and conducting benchmarks\nat https://huggingface.co/datasets/GTSinger/GTSinger and\nhttps://github.com/GTSinger/GTSinger.\n","authors":["Yu Zhang","Changhao Pan","Wenxiang Guo","Ruiqi Li","Zhiyuan Zhu","Jialei Wang","Wenhao Xu","Jingyu Lu","Zhiqing Hong","Chuxin Wang","LiChao Zhang","Jinzheng He","Ziyue Jiang","Yuxin Chen","Chen Yang","Jiecheng Zhou","Xinyu Cheng","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.13832v2.pdf","comment":"Accepted by NeurIPS 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2409.17757v1","updated":"2024-09-26T11:46:58Z","published":"2024-09-26T11:46:58Z","title":"Integrating Hierarchical Semantic into Iterative Generation Model for\n Entailment Tree Explanation","summary":" Manifestly and logically displaying the line of reasoning from evidence to\nanswer is significant to explainable question answering (QA). The entailment\ntree exhibits the lines structurally, which is different from the\nself-explanation principle in large-scale language models. Existing methods\nrarely consider the semantic association of sentences between and within\nhierarchies within the tree structure, which is prone to apparent mistakes in\ncombinations. In this work, we propose an architecture of integrating the\nHierarchical Semantics of sentences under the framework of Controller-Generator\n(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between\nhypotheses and facts, discriminates the facts involved in tree constructions,\nand optimizes single-step entailments. To the best of our knowledge, We are the\nfirst to notice hierarchical semantics of sentences between the same layer and\nadjacent layers to yield improvements. The proposed method achieves comparable\nperformance on all three settings of the EntailmentBank dataset. The\ngeneralization results on two out-of-domain datasets also demonstrate the\neffectiveness of our method.\n","authors":["Qin Wang","Jianzhou Feng","Yiming Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04259v2","updated":"2024-09-26T11:42:35Z","published":"2024-08-08T06:57:49Z","title":"EfficientRAG: Efficient Retriever for Multi-Hop Question Answering","summary":" Retrieval-augmented generation (RAG) methods encounter difficulties when\naddressing complex questions like multi-hop queries. While iterative retrieval\nmethods improve performance by gathering additional information, current\napproaches often rely on multiple calls of large language models (LLMs). In\nthis paper, we introduce EfficientRAG, an efficient retriever for multi-hop\nquestion answering. EfficientRAG iteratively generates new queries without the\nneed for LLM calls at each iteration and filters out irrelevant information.\nExperimental results demonstrate that EfficientRAG surpasses existing RAG\nmethods on three open-domain multi-hop question-answering datasets.\n","authors":["Ziyuan Zhuang","Zhiyang Zhang","Sitao Cheng","Fangkai Yang","Jia Liu","Shujian Huang","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.04259v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17755v1","updated":"2024-09-26T11:40:07Z","published":"2024-09-26T11:40:07Z","title":"SECURE: Semantics-aware Embodied Conversation under Unawareness for\n Lifelong Robot Learning","summary":" This paper addresses a challenging interactive task learning scenario we call\nrearrangement under unawareness: to manipulate a rigid-body environment in a\ncontext where the robot is unaware of a concept that's key to solving the\ninstructed task. We propose SECURE, an interactive task learning framework\ndesigned to solve such problems by fixing a deficient domain model using\nembodied conversation. Through dialogue, the robot discovers and then learns to\nexploit unforeseen possibilities. Using SECURE, the robot not only learns from\nthe user's corrective feedback when it makes a mistake, but it also learns to\nmake strategic dialogue decisions for revealing useful evidence about novel\nconcepts for solving the instructed task. Together, these abilities allow the\nrobot to generalise to subsequent tasks using newly acquired knowledge. We\ndemonstrate that a robot that is semantics-aware -- that is, it exploits the\nlogical consequences of both sentence and discourse semantics in the learning\nand inference process -- learns to solve rearrangement under unawareness more\neffectively than a robot that lacks such capabilities.\n","authors":["Rimvydas Rubavicius","Peter David Fagan","Alex Lascarides","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.17755v1.pdf","comment":"10 pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.17750v1","updated":"2024-09-26T11:31:18Z","published":"2024-09-26T11:31:18Z","title":"Are Transformers in Pre-trained LM A Good ASR Encoder? An Empirical\n Study","summary":" In this study, we delve into the efficacy of transformers within pre-trained\nlanguage models (PLMs) when repurposed as encoders for Automatic Speech\nRecognition (ASR). Our underlying hypothesis posits that, despite being\ninitially trained on text-based corpora, these transformers possess a\nremarkable capacity to extract effective features from the input sequence. This\ninherent capability, we argue, is transferrable to speech data, thereby\naugmenting the acoustic modeling ability of ASR. Through rigorous empirical\nanalysis, our findings reveal a notable improvement in Character Error Rate\n(CER) and Word Error Rate (WER) across diverse ASR tasks when transformers from\npre-trained LMs are incorporated. Particularly, they serve as an advantageous\nstarting point for initializing ASR encoders. Furthermore, we uncover that\nthese transformers, when integrated into a well-established ASR encoder, can\nsignificantly boost performance, especially in scenarios where profound\nsemantic comprehension is pivotal. This underscores the potential of leveraging\nthe semantic prowess embedded within pre-trained transformers to advance ASR\nsystems' capabilities.\n","authors":["Keyu An","Shiliang Zhang","Zhijie Yan"],"pdf_url":"https://arxiv.org/pdf/2409.17750v1.pdf","comment":"8pages"},{"id":"http://arxiv.org/abs/2402.12844v2","updated":"2024-09-26T11:29:04Z","published":"2024-02-20T09:13:15Z","title":"ICON: Improving Inter-Report Consistency in Radiology Report Generation\n via Lesion-aware Mixup Augmentation","summary":" Previous research on radiology report generation has made significant\nprogress in terms of increasing the clinical accuracy of generated reports. In\nthis paper, we emphasize another crucial quality that it should possess, i.e.,\ninter-report consistency, which refers to the capability of generating\nconsistent reports for semantically equivalent radiographs. This quality is\neven of greater significance than the overall report accuracy in terms of\nensuring the system's credibility, as a system prone to providing conflicting\nresults would severely erode users' trust. Regrettably, existing approaches\nstruggle to maintain inter-report consistency, exhibiting biases towards common\npatterns and susceptibility to lesion variants. To address this issue, we\npropose ICON, which improves the inter-report consistency of radiology report\ngeneration. Aiming to enhance the system's ability to capture similarities in\nsemantically equivalent lesions, our approach first involves extracting lesions\nfrom input images and examining their characteristics. Then, we introduce a\nlesion-aware mixup technique to ensure that the representations of the\nsemantically equivalent lesions align with the same attributes, achieved\nthrough a linear combination during the training phase. Extensive experiments\non three publicly available chest X-ray datasets verify the effectiveness of\nour approach, both in terms of improving the consistency and accuracy of the\ngenerated reports.\n","authors":["Wenjun Hou","Yi Cheng","Kaishuai Xu","Yan Hu","Wenjie Li","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2402.10712v3","updated":"2024-09-26T11:15:14Z","published":"2024-02-16T14:15:15Z","title":"An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient\n Language Model Inference","summary":" The development of state-of-the-art generative large language models (LLMs)\ndisproportionately relies on English-centric tokenizers, vocabulary and\npre-training data. Despite the fact that some LLMs have multilingual\ncapabilities, recent studies have shown that their inference efficiency\ndeteriorates when generating text in languages other than English. This results\nin increased inference time and costs. Cross-lingual vocabulary adaptation\n(CVA) methods have been proposed for adapting models to a target language\naiming to improve downstream performance. However, the effectiveness of these\nmethods on increasing inference efficiency of generative LLMs has yet to be\nexplored. In this paper, we perform an empirical study of five CVA methods on\nfour generative LLMs (including monolingual and multilingual models) across\nfour typologically-diverse languages and four natural language understanding\ntasks. We find that CVA substantially contributes to LLM inference speedups of\nup to 271.5\\%. We also show that adapting LLMs that have been pre-trained on\nmore balanced multilingual data results in downstream performance comparable to\nthe original models.\n","authors":["Atsuki Yamaguchi","Aline Villavicencio","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2402.10712v3.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2406.12442v2","updated":"2024-09-26T11:15:14Z","published":"2024-06-18T09:46:44Z","title":"Abstraction-of-Thought Makes Language Models Better Reasoners","summary":" Abstract reasoning, the ability to reason from the abstract essence of a\nproblem, serves as a key to generalization in human reasoning. However,\neliciting language models to perform reasoning with abstraction remains\nunexplored. This paper seeks to bridge this gap by introducing a novel\nstructured reasoning format called Abstraction-of-Thought (AoT). The uniqueness\nof AoT lies in its explicit requirement for varying levels of abstraction\nwithin the reasoning process. This approach could elicit language models to\nfirst contemplate on the abstract level before incorporating concrete details,\nwhich is overlooked by the prevailing step-by-step Chain-of-Thought (CoT)\nmethod. To align models with the AoT format, we present AoT Collection, a\ngeneric finetuning dataset consisting of 348k high-quality samples with AoT\nreasoning processes, collected via an automated and scalable pipeline. We\nfinetune a wide range of language models with AoT Collection and conduct\nextensive evaluations on 23 unseen tasks from the challenging benchmark\nBig-Bench Hard. Experimental results indicate that models aligned to AoT\nreasoning format substantially outperform those aligned to CoT in many\nreasoning tasks.\n","authors":["Ruixin Hong","Hongming Zhang","Xiaoman Pan","Dong Yu","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12442v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2405.14722v2","updated":"2024-09-26T10:23:33Z","published":"2024-05-23T15:51:24Z","title":"DAPE: Data-Adaptive Positional Encoding for Length Extrapolation","summary":" Positional encoding plays a crucial role in transformers, significantly\nimpacting model performance and length generalization. Prior research has\nintroduced absolute positional encoding (APE) and relative positional encoding\n(RPE) to distinguish token positions in given sequences. However, both APE and\nRPE remain fixed after model training regardless of input data, limiting their\nadaptability and flexibility. Hence, we expect that the desired positional\nencoding should be data-adaptive and can be dynamically adjusted with the given\nattention. In this paper, we propose a Data-Adaptive Positional Encoding (DAPE)\nmethod, which dynamically and semantically adjusts based on input context and\nlearned fixed priors. Experimental validation on real-world datasets (Arxiv,\nBooks3, and CHE) demonstrates that DAPE enhances model performances in terms of\ntrained length and length generalization, where the improvements are\nstatistically significant. The model visualization suggests that our model can\nkeep both local and anti-local information. Finally, we successfully train the\nmodel on sequence length 128 and achieve better performance at evaluation\nsequence length 8192, compared with other static positional encoding methods,\nrevealing the benefit of the adaptive positional encoding method.\n","authors":["Chuanyang Zheng","Yihang Gao","Han Shi","Minbin Huang","Jingyao Li","Jing Xiong","Xiaozhe Ren","Michael Ng","Xin Jiang","Zhenguo Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2405.14722v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01432v3","updated":"2024-09-26T10:03:41Z","published":"2024-03-03T08:07:55Z","title":"Fine Tuning vs. Retrieval Augmented Generation for Less Popular\n Knowledge","summary":" Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting\nstrong performance across diverse tasks and domains. However, it has been\nobserved that the performance diminishes when dealing with less-popular or\nlow-frequency concepts and entities, for example in domain specific\napplications. The two prominent approaches to enhance the performance of LMs on\nlow-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning\n(FT) over synthetic data. This paper explores and evaluates the impact of RAG\nand FT on customizing LMs in handling low-frequency entities on question\nanswering tasks. We conduct extensive experiments on twelve LMs of varying size\nand type and different fine tuning, data augmentation, and retrieval models.\nOur findings indicate that while FT boosts the performance across entities of\nvarying popularity, RAG surpasses FT by a large margin particularly for least\npopular factual knowledge. Additionally, the success of both RAG and FT\napproaches is amplified by improving retrieval and data augmentation\ntechniques. Fine tuning, while beneficial for small LMs, requires extensive\nresources. To address this issue, we propose the new Stimulus RAG approach that\nsurpasses the effectiveness of fine tuning based approaches, thereby\neliminating the need for the costly data augmentation and fine tuning step for\nenriching LMs with less popular factual knowledge.\n","authors":["Heydar Soudani","Evangelos Kanoulas","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2403.01432v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14374v2","updated":"2024-09-26T10:01:17Z","published":"2024-09-22T09:33:54Z","title":"J2N -- Nominal Adjective Identification and its Application","summary":" This paper explores the challenges posed by nominal adjectives (NAs) in\nnatural language processing (NLP) tasks, particularly in part-of-speech (POS)\ntagging. We propose treating NAs as a distinct POS tag, \"JN,\" and investigate\nits impact on POS tagging, BIO chunking, and coreference resolution. Our study\nshows that reclassifying NAs can improve the accuracy of syntactic analysis and\nstructural understanding in NLP. We present experimental results using Hidden\nMarkov Models (HMMs), Maximum Entropy (MaxEnt) models, and Spacy, demonstrating\nthe feasibility and potential benefits of this approach. Additionally we\ntrained a bert model to identify the NA in untagged text.\n","authors":["Lemeng Qi","Yang Han","Zhuotong Xie"],"pdf_url":"https://arxiv.org/pdf/2409.14374v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2404.00459v2","updated":"2024-09-26T09:54:57Z","published":"2024-03-30T19:46:59Z","title":"NumeroLogic: Number Encoding for Enhanced LLMs' Numerical Reasoning","summary":" Language models struggle with handling numerical data and performing\narithmetic operations. We hypothesize that this limitation can be partially\nattributed to non-intuitive textual numbers representation. When a digit is\nread or generated by a causal language model it does not know its place value\n(e.g. thousands vs. hundreds) until the entire number is processed. To address\nthis issue, we propose a simple adjustment to how numbers are represented by\nincluding the count of digits before each number. For instance, instead of\n\"42\", we suggest using \"{2:42}\" as the new format. This approach, which we term\nNumeroLogic, offers an added advantage in number generation by serving as a\nChain of Thought (CoT). By requiring the model to consider the number of digits\nfirst, it enhances the reasoning process before generating the actual number.\nWe use arithmetic tasks to demonstrate the effectiveness of the NumeroLogic\nformatting. We further demonstrate NumeroLogic applicability to general natural\nlanguage modeling, improving language understanding performance in the MMLU\nbenchmark.\n","authors":["Eli Schwartz","Leshem Choshen","Joseph Shtok","Sivan Doveh","Leonid Karlinsky","Assaf Arbelle"],"pdf_url":"https://arxiv.org/pdf/2404.00459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06802v2","updated":"2024-09-26T09:52:20Z","published":"2024-05-10T20:29:25Z","title":"Leveraging summary of radiology reports with transformers","summary":" Two fundamental problems in health-care stem from patient handoff and triage.\nDoctors are often required to perform complex findings summarization to\nfacilitate efficient communication with specialists and decision making on the\nurgency of each case. To address these challenges, we present a state of the\nart radiology report summarization model utilizing adjusted bidirectional\nencoder representation from transformers BERTtoBERT encoder and decoder\narchitecture. We also provide a data processing pipeline for future models\ndeveloped on the the MIMIC CXR dataset. Our approach includes a novel method\nfor augmenting medical data and a comprehensive performance analysis. Our best\nperforming model achieved a recall oriented understudy for gisting evaluation L\nF1 score of 58.75/100, outperforming specialized checkpoints with more\nsophisticated attention mechanisms. We also provide a data processing pipeline\nfor future models developed on the MIMIC chest X-ray dataset. The model\nintroduced in this paper demonstrates significantly improved capacity in\nradiology report summarization, highlighting the potential for ensuring better\nclinical workflows and enhanced patient care.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17683v1","updated":"2024-09-26T09:49:27Z","published":"2024-09-26T09:49:27Z","title":"Zero- and Few-shot Named Entity Recognition and Text Expansion in\n Medication Prescriptions using ChatGPT","summary":" Introduction: Medication prescriptions are often in free text and include a\nmix of two languages, local brand names, and a wide range of idiosyncratic\nformats and abbreviations. Large language models (LLMs) have shown promising\nability to generate text in response to input prompts. We use ChatGPT 3.5 to\nautomatically structure and expand medication statements in discharge summaries\nand thus make them easier to interpret for people and machines. Methods:\nNamed-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and\nfew-shot setting with different prompt strategies. 100 medication statements\nwere manually annotated and curated. NER performance was measured by using\nstrict and partial matching. For the task EX, two experts interpreted the\nresults by assessing semantic equivalence between original and expanded\nstatements. The model performance was measured by precision, recall, and F1\nscore. Results: For NER, the best-performing prompt reached an average F1 score\nof 0.94 in the test set. For EX, the few-shot prompt showed superior\nperformance among other prompts, with an average F1 score of 0.87. Conclusion:\nOur study demonstrates good performance for NER and EX tasks in free-text\nmedication statements using ChatGPT. Compared to a zero-shot baseline, a\nfew-shot approach prevented the system from hallucinating, which would be\nunacceptable when processing safety-relevant medication data.\n","authors":["Natthanaphop Isaradech","Andrea Riedel","Wachiranun Sirikul","Markus Kreuzthaler","Stefan Schulz"],"pdf_url":"https://arxiv.org/pdf/2409.17683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13167v2","updated":"2024-09-26T09:42:48Z","published":"2024-06-19T02:46:18Z","title":"QRMeM: Unleash the Length Limitation through Question then Reflection\n Memory Mechanism","summary":" While large language models (LLMs) have made notable advancements in natural\nlanguage processing, they continue to struggle with processing extensive text.\nMemory mechanism offers a flexible solution for managing long contexts,\nutilizing techniques such as compression, summarization, and structuring to\nfacilitate nuanced and efficient handling of large volumes of text. However,\nexisting techniques face challenges with static knowledge integration, leading\nto insufficient adaptation to task-specific needs and missing\nmulti-segmentation relationships, which hinders the dynamic reorganization and\nlogical combination of relevant segments during the response process. To\naddress these issues, we introduce a novel strategy, Question then Reflection\nMemory Mechanism (QRMeM), incorporating a dual-structured memory pool. This\npool synergizes static textual content with structured graph guidance,\nfostering a reflective trial-and-error approach for navigating and identifying\nrelevant segments. Our evaluation across multiple-choice questions (MCQ) and\nmulti-document question answering (Multi-doc QA) benchmarks showcases QRMeM\nenhanced performance compared to existing approaches.\n","authors":["Bo Wang","Heyan Huang","Yixin Cao","Jiahao Ying","Wei Tang","Chong Feng"],"pdf_url":"https://arxiv.org/pdf/2406.13167v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17673v1","updated":"2024-09-26T09:32:12Z","published":"2024-09-26T09:32:12Z","title":"Cross-lingual Human-Preference Alignment for Neural Machine Translation\n with Direct Quality Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) and derivative techniques\nlike Direct Preference Optimization (DPO) are task-alignment algorithms used to\nrepurpose general, foundational models for specific tasks. We show that\napplying task-alignment to neural machine translation (NMT) addresses an\nexisting task--data mismatch in NMT, leading to improvements across all\nlanguages of a multilingual model, even when task-alignment is only applied to\na subset of those languages. We do so by introducing Direct Quality\nOptimization (DQO), a variant of DPO leveraging a pre-trained translation\nquality estimation model as a proxy for human preferences, and verify the\nimprovements with both automatic metrics and human evaluation.\n","authors":["Kaden Uhlig","Joern Wuebker","Raphael Reinauer","John DeNero"],"pdf_url":"https://arxiv.org/pdf/2409.17673v1.pdf","comment":"17 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.09486v2","updated":"2024-09-26T09:31:48Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Benchmarking Multimodal Large Language Models for Code\n Generation with Visually Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/likaixin2000/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Zhiyong Huang","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2407.16693v2","updated":"2024-09-26T09:27:30Z","published":"2024-07-23T17:56:32Z","title":"Explanation Regularisation through the Lens of Attributions","summary":" Explanation regularisation (ER) has been introduced as a way to guide text\nclassifiers to form their predictions relying on input tokens that humans\nconsider plausible. This is achieved by introducing an auxiliary explanation\nloss that measures how well the output of an input attribution technique for\nthe model agrees with human-annotated rationales. The guidance appears to\nbenefit performance in out-of-domain (OOD) settings, presumably due to an\nincreased reliance on \"plausible\" tokens. However, previous work has\nunder-explored the impact of guidance on that reliance, particularly when\nreliance is measured using attribution techniques different from those used to\nguide the model. In this work, we seek to close this gap, and also explore the\nrelationship between reliance on plausible features and OOD performance. We\nfind that the connection between ER and the ability of a classifier to rely on\nplausible features has been overstated and that a stronger reliance on\nplausible tokens does not seem to be the cause for OOD improvements.\n","authors":["Pedro Ferreira","Ivan Titov","Wilker Aziz"],"pdf_url":"https://arxiv.org/pdf/2407.16693v2.pdf","comment":"22 pages, 14 figures, 9 tables"},{"id":"http://arxiv.org/abs/2404.12753v2","updated":"2024-09-26T09:17:10Z","published":"2024-04-19T09:59:44Z","title":"AutoScraper: A Progressive Understanding Web Agent for Web Scraper\n Generation","summary":" Web scraping is a powerful technique that extracts data from websites,\nenabling automated data collection, enhancing data analysis capabilities, and\nminimizing manual data entry efforts. Existing methods, wrappers-based methods\nsuffer from limited adaptability and scalability when faced with a new website,\nwhile language agents, empowered by large language models (LLMs), exhibit poor\nreusability in diverse web environments. In this work, we introduce the\nparadigm of generating web scrapers with LLMs and propose AutoScraper, a\ntwo-stage framework that can handle diverse and changing web environments more\nefficiently. AutoScraper leverages the hierarchical structure of HTML and\nsimilarity across different web pages for generating web scrapers. Besides, we\npropose a new executability metric for better measuring the performance of web\nscraper generation tasks. We conduct comprehensive experiments with multiple\nLLMs and demonstrate the effectiveness of our framework. Resources of this\npaper can be found at \\url{https://github.com/EZ-hwh/AutoScraper}\n","authors":["Wenhao Huang","Zhouhong Gu","Chenghao Peng","Zhixu Li","Jiaqing Liang","Yanghua Xiao","Liqian Wen","Zulong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12753v2.pdf","comment":"19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17650v1","updated":"2024-09-26T08:56:54Z","published":"2024-09-26T08:56:54Z","title":"Digital Twin Ecosystem for Oncology Clinical Operations","summary":" Artificial Intelligence (AI) and Large Language Models (LLMs) hold\nsignificant promise in revolutionizing healthcare, especially in clinical\napplications. Simultaneously, Digital Twin technology, which models and\nsimulates complex systems, has gained traction in enhancing patient care.\nHowever, despite the advances in experimental clinical settings, the potential\nof AI and digital twins to streamline clinical operations remains largely\nuntapped. This paper introduces a novel digital twin framework specifically\ndesigned to enhance oncology clinical operations. We propose the integration of\nmultiple specialized digital twins, such as the Medical Necessity Twin, Care\nNavigator Twin, and Clinical History Twin, to enhance workflow efficiency and\npersonalize care for each patient based on their unique data. Furthermore, by\nsynthesizing multiple data sources and aligning them with the National\nComprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care\nPath, a continuously evolving knowledge base that enables these digital twins\nto provide precise, tailored clinical recommendations.\n","authors":["Himanshu Pandey","Akhil Amod"," Shivang","Kshitij Jaggi","Ruchi Garg","Abheet Jain","Vinayak Tantia"],"pdf_url":"https://arxiv.org/pdf/2409.17650v1.pdf","comment":"Pre Print"},{"id":"http://arxiv.org/abs/2409.17648v1","updated":"2024-09-26T08:55:21Z","published":"2024-09-26T08:55:21Z","title":"Efficient In-Domain Question Answering for Resource-Constrained\n Environments","summary":" Retrieval Augmented Generation (RAG) is a common method for integrating\nexternal knowledge into pretrained Large Language Models (LLMs) to enhance\naccuracy and relevancy in question answering (QA) tasks. However, prompt\nengineering and resource efficiency remain significant bottlenecks in\ndeveloping optimal and robust RAG solutions for real-world QA applications.\nRecent studies have shown success in using fine tuning to address these\nproblems; in particular, Retrieval Augmented Fine Tuning (RAFT) applied to\nsmaller 7B models has demonstrated superior performance compared to RAG setups\nwith much larger models such as GPT-3.5. The combination of RAFT with\nparameter-efficient fine tuning (PEFT) techniques, such as Low-Rank Adaptation\n(LoRA), promises an even more efficient solution, yet remains an unexplored\narea. In this work, we combine RAFT with LoRA to reduce fine tuning and storage\nrequirements and gain faster inference times while maintaining comparable RAG\nperformance. This results in a more compute-efficient RAFT, or CRAFT, which is\nparticularly useful for knowledge-intensive QA tasks in resource-constrained\nenvironments where internet access may be restricted and hardware resources\nlimited.\n","authors":["Isaac Chung","Phat Vo","Arman Kizilkale","Aaron Reite"],"pdf_url":"https://arxiv.org/pdf/2409.17648v1.pdf","comment":"6 pages, 2 tables"},{"id":"http://arxiv.org/abs/2405.16908v2","updated":"2024-09-26T08:53:01Z","published":"2024-05-27T07:56:23Z","title":"Can Large Language Models Faithfully Express Their Intrinsic Uncertainty\n in Words?","summary":" We posit that large language models (LLMs) should be capable of expressing\ntheir intrinsic uncertainty in natural language. For example, if the LLM is\nequally likely to output two contradicting answers to the same question, then\nits generated response should reflect this uncertainty by hedging its answer\n(e.g., \"I'm not sure, but I think...\"). We formalize faithful response\nuncertainty based on the gap between the model's intrinsic confidence in the\nassertions it makes and the decisiveness by which they are conveyed. This\nexample-level metric reliably indicates whether the model reflects its\nuncertainty, as it penalizes both excessive and insufficient hedging. We\nevaluate a variety of aligned LLMs at faithfully communicating uncertainty on\nseveral knowledge-intensive question answering tasks. Our results provide\nstrong evidence that modern LLMs are poor at faithfully conveying their\nuncertainty, and that better alignment is necessary to improve their\ntrustworthiness.\n","authors":["Gal Yona","Roee Aharoni","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2405.16908v2.pdf","comment":"To appear in EMNLP 2024 (main conference)"},{"id":"http://arxiv.org/abs/2408.10902v2","updated":"2024-09-26T08:47:36Z","published":"2024-08-20T14:45:23Z","title":"Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs","summary":" Although human evaluation remains the gold standard for open-domain dialogue\nevaluation, the growing popularity of automated evaluation using Large Language\nModels (LLMs) has also extended to dialogue. However, most frameworks leverage\nbenchmarks that assess older chatbots on aspects such as fluency and relevance,\nwhich are not reflective of the challenges associated with contemporary models.\nIn fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset,\nsuggests that current chatbots may exhibit several recurring issues related to\ncoherence and commonsense knowledge, but generally produce highly fluent and\nrelevant responses.\n Noting the aforementioned limitations, this paper introduces Soda-Eval, an\nannotated dataset based on Soda that covers over 120K turn-level assessments\nacross 10K dialogues, where the annotations were generated by GPT-4. Using\nSoda-Eval as a benchmark, we then study the performance of several open-access\ninstruction-tuned LLMs, finding that dialogue evaluation remains challenging.\nFine-tuning these models improves performance over few-shot inferences, both in\nterms of correlation and explanation.\n","authors":["John Mendonça","Isabel Trancoso","Alon Lavie"],"pdf_url":"https://arxiv.org/pdf/2408.10902v2.pdf","comment":"Accepted to EMNLP2024 (findings)"},{"id":"http://arxiv.org/abs/2409.17640v1","updated":"2024-09-26T08:44:38Z","published":"2024-09-26T08:44:38Z","title":"T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training\n on an Assistant Task for a Target Task","summary":" Long text summarization, gradually being essential for efficiently processing\nlarge volumes of information, stays challenging for Large Language Models\n(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced\ntraining datasets and the high requirement of contextual details dealing. To\naddress the issue, we design a novel zero-shot transfer learning framework,\nabbreviated as T3, to iteratively training a baseline LLM on an assistant task\nfor the target task, where the former should own richer data resources and\nshare structural or semantic similarity with the latter. In practice, T3 is\napproached to deal with the long text summarization task by utilizing question\nanswering as the assistant task, and further validated its effectiveness on the\nBBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14%\nimprovement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore\ncompared to three baseline LLMs, demonstrating its potential for more\nassistant-target task combinations.\n","authors":["Xindi Tong","Yujin Zhu","Shijian Fan","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09802v2","updated":"2024-09-26T08:15:50Z","published":"2023-11-16T11:26:21Z","title":"Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs","summary":" Two lines of approaches are adopted for complex reasoning with LLMs. One line\nof work prompts LLMs with various reasoning structures, while the structural\noutputs can be naturally regarded as intermediate reasoning steps. Another line\nof work adopt LLM-free declarative solvers to do the reasoning task, rendering\nhigher reasoning accuracy but lacking interpretability due to the black-box\nnature of the solvers. Aiming to resolve the trade-off between answer accuracy\nand interpretability, we present a simple extension to the latter line of work.\nSpecifically, we showcase that the intermediate search logs generated by Prolog\ninterpreters can be accessed and interpreted into human-readable reasoning\nproofs. As long as LLMs correctly translate problem descriptions into Prolog\nrepresentations, the corresponding reasoning proofs are ensured to be causal\nand reliable. On two logical reasoning and one arithmetic reasoning datasets,\nour framework obtains significant improvements in terms of both answer accuracy\nand reasoning proof accuracy. Our code is released at\nhttps://github.com/DAMO-NLP-SG/CaRing\n","authors":["Sen Yang","Xin Li","Leyang Cui","Lidong Bing","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2311.09802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17610v1","updated":"2024-09-26T07:55:57Z","published":"2024-09-26T07:55:57Z","title":"ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context\n Information in Multi-Turn Multimodal Medical Dialogue","summary":" The rocketing prosperity of large language models (LLMs) in recent years has\nboosted the prevalence of vision-language models (VLMs) in the medical sector.\nIn our online medical consultation scenario, a doctor responds to the texts and\nimages provided by a patient in multiple rounds to diagnose her/his health\ncondition, forming a multi-turn multimodal medical dialogue format. Unlike\nhigh-quality images captured by professional equipment in traditional medical\nvisual question answering (Med-VQA), the images in our case are taken by\npatients' mobile phones. These images have poor quality control, with issues\nsuch as excessive background elements and the lesion area being significantly\noff-center, leading to degradation of vision-language alignment in the model\ntraining phase. In this paper, we propose ZALM3, a Zero-shot strategy to\nimprove vision-language ALignment in Multi-turn Multimodal Medical dialogue.\nSince we observe that the preceding text conversations before an image can\ninfer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to\nsummarize the keywords from the preceding context and a visual grounding model\nto extract the RoIs. The updated images eliminate unnecessary background noise\nand provide more effective vision-language alignment. To better evaluate our\nproposed method, we design a new subjective assessment metric for multi-turn\nunimodal/multimodal medical dialogue to provide a fine-grained performance\ncomparison. Our experiments across three different clinical departments\nremarkably demonstrate the efficacy of ZALM3 with statistical significance.\n","authors":["Zhangpu Li","Changhong Zou","Suxue Ma","Zhicheng Yang","Chen Du","Youbao Tang","Zhenjie Cao","Ning Zhang","Jui-Hsin Lai","Ruei-Sung Lin","Yuan Ni","Xingzhi Sun","Jing Xiao","Kai Zhang","Mei Han"],"pdf_url":"https://arxiv.org/pdf/2409.17610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16341v2","updated":"2024-09-26T07:54:10Z","published":"2024-09-24T17:20:02Z","title":"Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs","summary":" Training large language models (LLMs) for external tool usage is a rapidly\nexpanding field, with recent research focusing on generating synthetic data to\naddress the shortage of available data. However, the absence of systematic data\nquality checks poses complications for properly training and testing models. To\nthat end, we propose two approaches for assessing the reliability of data for\ntraining LLMs to use external tools. The first approach uses intuitive,\nhuman-defined correctness criteria. The second approach uses a model-driven\nassessment with in-context evaluation. We conduct a thorough evaluation of data\nquality on two popular benchmarks, followed by an extrinsic evaluation that\nshowcases the impact of data quality on model performance. Our results\ndemonstrate that models trained on high-quality data outperform those trained\non unvalidated data, even when trained with a smaller quantity of data. These\nfindings empirically support the significance of assessing and ensuring the\nreliability of training data for tool-using LLMs.\n","authors":["Shadi Iskander","Nachshon Cohen","Zohar Karnin","Ori Shapira","Sofia Tolmach"],"pdf_url":"https://arxiv.org/pdf/2409.16341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17603v1","updated":"2024-09-26T07:40:03Z","published":"2024-09-26T07:40:03Z","title":"Deep CLAS: Deep Contextual Listen, Attend and Spell","summary":" Contextual-LAS (CLAS) has been shown effective in improving Automatic Speech\nRecognition (ASR) of rare words. It relies on phrase-level contextual modeling\nand attention-based relevance scoring without explicit contextual constraint\nwhich lead to insufficient use of contextual information. In this work, we\npropose deep CLAS to use contextual information better. We introduce bias loss\nforcing model to focus on contextual information. The query of bias attention\nis also enriched to improve the accuracy of the bias attention score. To get\nfine-grained contextual information, we replace phrase-level encoding with\ncharacter-level encoding and encode contextual information with conformer\nrather than LSTM. Moreover, we directly use the bias attention score to correct\nthe output probability distribution of the model. Experiments using the public\nAISHELL-1 and AISHELL-NER. On AISHELL-1, compared to CLAS baselines, deep CLAS\nobtains a 65.78% relative recall and a 53.49% relative F1-score increase in the\nnamed entity recognition scene.\n","authors":["Shifu Xiong","Mengzhi Wang","Genshun Wan","Hang Chen","Jianqing Gao","Lirong Dai"],"pdf_url":"https://arxiv.org/pdf/2409.17603v1.pdf","comment":"Accepted by NCMMSC 2022"},{"id":"http://arxiv.org/abs/2409.17588v1","updated":"2024-09-26T07:07:14Z","published":"2024-09-26T07:07:14Z","title":"DualCoTs: Dual Chain-of-Thoughts Prompting for Sentiment Lexicon\n Expansion of Idioms","summary":" Idioms represent a ubiquitous vehicle for conveying sentiments in the realm\nof everyday discourse, rendering the nuanced analysis of idiom sentiment\ncrucial for a comprehensive understanding of emotional expression within\nreal-world texts. Nevertheless, the existing corpora dedicated to idiom\nsentiment analysis considerably limit research in text sentiment analysis. In\nthis paper, we propose an innovative approach to automatically expand the\nsentiment lexicon for idioms, leveraging the capabilities of large language\nmodels through the application of Chain-of-Thought prompting. To demonstrate\nthe effectiveness of this approach, we integrate multiple existing resources\nand construct an emotional idiom lexicon expansion dataset (called EmoIdiomE),\nwhich encompasses a comprehensive repository of Chinese and English idioms.\nThen we designed the Dual Chain-of-Thoughts (DualCoTs) method, which combines\ninsights from linguistics and psycholinguistics, to demonstrate the\neffectiveness of using large models to automatically expand the sentiment\nlexicon for idioms. Experiments show that DualCoTs is effective in idioms\nsentiment lexicon expansion in both Chinese and English. For reproducibility,\nwe will release the data and code upon acceptance.\n","authors":["Fuqiang Niu","Minghuan Tan","Bowen Zhang","Min Yang","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10267v2","updated":"2024-09-26T06:57:27Z","published":"2024-06-11T09:24:18Z","title":"Unused information in token probability distribution of generative LLM:\n improving LLM reading comprehension through calculation of expected values","summary":" LLM text decoding is key component for perceived LLM quality. We demonstrate\ntwo experiments showing that decoding methods could be improved by manipulation\nof token probabilities. First, we test few LLM on SummEval summary scoring\ndataset, to measure reading comprehension. We compare scores from greedy\ndecoding to expected values over the next token distribution. We scale logits\nby large temperature to increase the entropy of scores. This allows strong\nimprovement of performance on SummEval (in terms of correlations to human\njudgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from\n20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part\nof the gain seems related to positional bias. Secondly, we use\nprobability-based tree sampling algorithm, to examine all most probable\ngenerations for given prompt.\n","authors":["Krystian Zawistowski"],"pdf_url":"https://arxiv.org/pdf/2406.10267v2.pdf","comment":"7 pages, 1 figure, presented at FEDCSIS 2024 conference,"},{"id":"http://arxiv.org/abs/2409.17577v1","updated":"2024-09-26T06:46:53Z","published":"2024-09-26T06:46:53Z","title":"Leveraging Annotator Disagreement for Text Classification","summary":" It is common practice in text classification to only use one majority label\nfor model training even if a dataset has been annotated by multiple annotators.\nDoing so can remove valuable nuances and diverse perspectives inherent in the\nannotators' assessments. This paper proposes and compares three different\nstrategies to leverage annotator disagreement for text classification: a\nprobability-based multi-label method, an ensemble system, and instruction\ntuning. All three approaches are evaluated on the tasks of hate speech and\nabusive conversation detection, which inherently entail a high degree of\nsubjectivity. Moreover, to evaluate the effectiveness of embracing annotation\ndisagreements for model training, we conduct an online survey that compares the\nperformance of the multi-label model against a baseline model, which is trained\nwith the majority label.\n The results show that in hate speech detection, the multi-label method\noutperforms the other two approaches, while in abusive conversation detection,\ninstruction tuning achieves the best performance. The results of the survey\nalso show that the outputs from the multi-label models are considered a better\nrepresentation of the texts than the single-label model.\n","authors":["Jin Xu","Mariët Theune","Daniel Braun"],"pdf_url":"https://arxiv.org/pdf/2409.17577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02436v2","updated":"2024-09-26T06:27:08Z","published":"2024-03-04T19:33:39Z","title":"How does Architecture Influence the Base Capabilities of Pre-trained\n Language Models? A Case Study Based on FFN-Wider and MoE Transformers","summary":" Pre-trained language models have been proven to possess strong base\ncapabilities, which not only excel in in-distribution language modeling but\nalso show powerful abilities in out-of-distribution language modeling, transfer\nlearning and few-shot learning. Unlike existing work focusing on the influence\nof scale on base capabilities, our work examines the influence of architecture\non those. Specifically, our concern is: How does architecture influence the\nbase capabilities of pre-trained language models? In this work, we attempt to\nexplain and reverse the decline in base capabilities caused by the architecture\nof FFN-Wider Transformers, seeking to provide some insights. Through analysis,\nwe found the contribution ratio of Multi-Head Attention (a combination\nfunction) to pre-trained language modeling is a key factor affecting base\ncapabilities. FFN-Wider Transformers reduce the contribution ratio of this\ncombination function, leading to a decline in base capabilities. We confirmed\nthis by experiments and proposed Combination Enhanced Architecture (CEA) to\naddress the decline in base capabilities of such models. Significantly, we\nextended our explanation and CEA to Mixture of Experts (MoE) Transformers. We\nsuccessfully achieved significant improvements in base capabilities on a 14B\nparameter MoE model, demonstrating the practical application value of our work.\nThis also indicates that our analysis has a certain guiding significance for\narchitecture analysis, architecture improvement and architecture design.\n","authors":["Xin Lu","Yanyan Zhao","Bing Qin","Liangyu Huo","Qing Yang","Dongliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.02436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05013v2","updated":"2024-09-26T06:19:34Z","published":"2024-06-07T15:23:53Z","title":"CHIQ: Contextual History Enhancement for Improving Query Rewriting in\n Conversational Search","summary":" In this paper, we study how open-source large language models (LLMs) can be\neffectively deployed for improving query rewriting in conversational search,\nespecially for ambiguous queries. We introduce CHIQ, a two-step method that\nleverages the capabilities of LLMs to resolve ambiguities in the conversation\nhistory before query rewriting. This approach contrasts with prior studies that\npredominantly use closed-source LLMs to directly generate search queries from\nconversation history. We demonstrate on five well-established benchmarks that\nCHIQ leads to state-of-the-art results across most settings, showing highly\ncompetitive performances with systems leveraging closed-source LLMs. Our study\nprovides a first step towards leveraging open-source LLMs in conversational\nsearch, as a competitive alternative to the prevailing reliance on commercial\nLLMs. Data, models, and source code will be publicly available upon acceptance\nat https://github.com/fengranMark/CHIQ.\n","authors":["Fengran Mo","Abbas Ghaddar","Kelong Mao","Mehdi Rezagholizadeh","Boxing Chen","Qun Liu","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2406.05013v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2406.17255v2","updated":"2024-09-26T06:18:44Z","published":"2024-06-25T03:45:28Z","title":"MPCODER: Multi-user Personalized Code Generator with Explicit and\n Implicit Style Representation Learning","summary":" Large Language Models (LLMs) have demonstrated great potential for assisting\ndevelopers in their daily development. However, most research focuses on\ngenerating correct code, how to use LLMs to generate personalized code has\nseldom been investigated. To bridge this gap, we proposed MPCoder (Multi-user\nPersonalized Code Generator) to generate personalized code for multiple users.\nTo better learn coding style features, we utilize explicit coding style\nresidual learning to capture the syntax code style standards and implicit style\nlearning to capture the semantic code style conventions. We train a multi-user\nstyle adapter to better differentiate the implicit feature representations of\ndifferent users through contrastive learning, ultimately enabling personalized\ncode generation for multiple users. We further propose a novel evaluation\nmetric for estimating similarities between codes of different coding styles.\nThe experimental results show the effectiveness of our approach for this novel\ntask.\n","authors":["Zhenlong Dai","Chang Yao","WenKang Han","Ying Yuan","Zhipeng Gao","Jingyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2406.17255v2.pdf","comment":"Accepted by ACL 2024, Main Conference"},{"id":"http://arxiv.org/abs/2409.15977v2","updated":"2024-09-26T05:26:50Z","published":"2024-09-24T11:18:09Z","title":"TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and\n Multi-Level Style Control","summary":" Zero-shot singing voice synthesis (SVS) with style transfer and style control\naims to generate high-quality singing voices with unseen timbres and styles\n(including singing method, emotion, rhythm, technique, and pronunciation) from\naudio and text prompts. However, the multifaceted nature of singing styles\nposes a significant challenge for effective modeling, transfer, and control.\nFurthermore, current SVS models often fail to generate singing voices rich in\nstylistic nuances for unseen singers. To address these challenges, we introduce\nTCSinger, the first zero-shot SVS model for style transfer across cross-lingual\nspeech and singing styles, along with multi-level style control. Specifically,\nTCSinger proposes three primary modules: 1) the clustering style encoder\nemploys a clustering vector quantization model to stably condense style\ninformation into a compact latent space; 2) the Style and Duration Language\nModel (S\\&D-LM) concurrently predicts style information and phoneme duration,\nwhich benefits both; 3) the style adaptive decoder uses a novel mel-style\nadaptive normalization method to generate singing voices with enhanced details.\nExperimental results show that TCSinger outperforms all baseline models in\nsynthesis quality, singer similarity, and style controllability across various\ntasks, including zero-shot style transfer, multi-level style control,\ncross-lingual style transfer, and speech-to-singing style transfer. Singing\nvoice samples can be accessed at https://tcsinger.github.io/.\n","authors":["Yu Zhang","Ziyue Jiang","Ruiqi Li","Changhao Pan","Jinzheng He","Rongjie Huang","Chuxin Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.15977v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17539v1","updated":"2024-09-26T04:59:45Z","published":"2024-09-26T04:59:45Z","title":"Logic-of-Thought: Injecting Logic into Contexts for Full Reasoning in\n Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities across\nvarious tasks but their performance in complex logical reasoning tasks remains\nunsatisfactory. Although some prompting methods, such as Chain-of-Thought, can\nimprove the reasoning ability of LLMs to some extent, they suffer from an\nunfaithful issue where derived conclusions may not align with the generated\nreasoning chain. To address this issue, some studies employ the approach of\npropositional logic to further enhance logical reasoning abilities of LLMs.\nHowever, the potential omissions in the extraction of logical expressions in\nthese methods can cause information loss in the logical reasoning process,\nthereby generating incorrect results. To this end, we propose Logic-of-Thought\n(LoT) prompting which employs propositional logic to generate expanded logical\ninformation from input context, and utilizes the generated logical information\nas an additional augmentation to the input prompts, thereby enhancing the\ncapability of logical reasoning. The LoT is orthogonal to existing prompting\nmethods and can be seamlessly integrated with them. Extensive experiments\ndemonstrate that LoT boosts the performance of various prompting methods with a\nstriking margin across five logical reasoning tasks. In particular, the LoT\nenhances Chain-of-Thought's performance on the ReClor dataset by +4.35%;\nmoreover, it improves Chain-of-Thought with Self-Consistency's performance on\nLogiQA by +5%; additionally, it boosts performance of Tree-of-Thoughts on\nProofWriter dataset by +8%.\n","authors":["Tongxuan Liu","Wenjiang Xu","Weizhe Huang","Xingyu Wang","Jiaxing Wang","Hailong Yang","Jing Li"],"pdf_url":"https://arxiv.org/pdf/2409.17539v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17536v1","updated":"2024-09-26T04:48:20Z","published":"2024-09-26T04:48:20Z","title":"MUSE: Integrating Multi-Knowledge for Knowledge Graph Completion","summary":" Knowledge Graph Completion (KGC) aims to predict the missing [relation] part\nof (head entity)--[relation]->(tail entity) triplet. Most existing KGC methods\nfocus on single features (e.g., relation types) or sub-graph aggregation.\nHowever, they do not fully explore the Knowledge Graph (KG) features and\nneglect the guidance of external semantic knowledge. To address these\nshortcomings, we propose a knowledge-aware reasoning model (MUSE), which\ndesigns a novel multi-knowledge representation learning mechanism for missing\nrelation prediction. Our model develops a tailored embedding space through\nthree parallel components: 1) Prior Knowledge Learning for enhancing the\ntriplets' semantic representation by fine-tuning BERT; 2) Context Message\nPassing for enhancing the context messages of KG; 3) Relational Path\nAggregation for enhancing the path representation from the head entity to the\ntail entity. The experimental results show that MUSE significantly outperforms\nother baselines on four public datasets, achieving over 5.50% H@1 improvement\nand 4.20% MRR improvement on the NELL995 dataset. The code and datasets will be\nreleased via https://github.com/SUSTech-TP/ADMA2024-MUSE.git.\n","authors":["Pengjie Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17536v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2408.05283"},{"id":"http://arxiv.org/abs/2409.17527v1","updated":"2024-09-26T04:30:32Z","published":"2024-09-26T04:30:32Z","title":"Data Proportion Detection for Optimized Data Management for Large\n Language Models","summary":" Large language models (LLMs) have demonstrated exceptional performance across\na wide range of tasks and domains, with data preparation playing a critical\nrole in achieving these results. Pre-training data typically combines\ninformation from multiple domains. To maximize performance when integrating\ndata from various domains, determining the optimal data proportion is\nessential. However, state-of-the-art (SOTA) LLMs rarely disclose details about\ntheir pre-training data, making it difficult for researchers to identify ideal\ndata proportions. In this paper, we introduce a new topic, \\textit{data\nproportion detection}, which enables the automatic estimation of pre-training\ndata proportions by analyzing the generated outputs of LLMs. We provide\nrigorous theoretical proofs, practical algorithms, and preliminary experimental\nresults for data proportion detection. Based on these findings, we offer\nvaluable insights into the challenges and future directions for effective data\nproportion detection and data management.\n","authors":["Hao Liang","Keshi Zhao","Yajie Yang","Bin Cui","Guosheng Dong","Zenan Zhou","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17525v1","updated":"2024-09-26T04:24:52Z","published":"2024-09-26T04:24:52Z","title":"When A Man Says He Is Pregnant: ERP Evidence for A Rational Account of\n Speaker-contextualized Language Comprehension","summary":" Spoken language is often, if not always, understood in a context that\nincludes the identities of speakers. For instance, we can easily make sense of\nan utterance such as \"I'm going to have a manicure this weekend\" or \"The first\ntime I got pregnant I had a hard time\" when the utterance is spoken by a woman,\nbut it would be harder to understand when it is spoken by a man. Previous\nevent-related potential (ERP) studies have shown mixed results regarding the\nneurophysiological responses to such speaker-mismatched utterances, with some\nreporting an N400 effect and others a P600 effect. In an experiment involving\n64 participants, we showed that these different ERP effects reflect distinct\ncognitive processes employed to resolve the speaker-message mismatch. When\npossible, the message is integrated with the speaker context to arrive at an\ninterpretation, as in the case of violations of social stereotypes (e.g., men\ngetting a manicure), resulting in an N400 effect. However, when such\nintegration is impossible due to violations of biological knowledge (e.g., men\ngetting pregnant), listeners engage in an error correction process to revise\neither the perceived utterance or the speaker context, resulting in a P600\neffect. Additionally, we found that the social N400 effect decreased as a\nfunction of the listener's personality trait of openness, while the biological\nP600 effect remained robust. Our findings help to reconcile the empirical\ninconsistencies in the literature and provide a rational account of\nspeaker-contextualized language comprehension.\n","authors":["Hanlin Wu","Zhenguang G. Cai"],"pdf_url":"https://arxiv.org/pdf/2409.17525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17513v1","updated":"2024-09-26T03:48:47Z","published":"2024-09-26T03:48:47Z","title":"Comparing Unidirectional, Bidirectional, and Word2vec Models for\n Discovering Vulnerabilities in Compiled Lifted Code","summary":" Ransomware and other forms of malware cause significant financial and\noperational damage to organizations by exploiting long-standing and often\ndifficult-to-detect software vulnerabilities. To detect vulnerabilities such as\nbuffer overflows in compiled code, this research investigates the application\nof unidirectional transformer-based embeddings, specifically GPT-2. Using a\ndataset of LLVM functions, we trained a GPT-2 model to generate embeddings,\nwhich were subsequently used to build LSTM neural networks to differentiate\nbetween vulnerable and non-vulnerable code. Our study reveals that embeddings\nfrom the GPT-2 model significantly outperform those from bidirectional models\nof BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%.\nLSTM neural networks were developed with both frozen and unfrozen embedding\nmodel layers. The model with the highest performance was achieved when the\nembedding layers were unfrozen. Further, the research finds that, in exploring\nthe impact of different optimizers within this domain, the SGD optimizer\ndemonstrates superior performance over Adam. Overall, these findings reveal\nimportant insights into the potential of unidirectional transformer-based\napproaches in enhancing cybersecurity defenses.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2409.17513v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17504v1","updated":"2024-09-26T03:22:09Z","published":"2024-09-26T03:22:09Z","title":"HaloScope: Harnessing Unlabeled LLM Generations for Hallucination\n Detection","summary":" The surge in applications of large language models (LLMs) has prompted\nconcerns about the generation of misleading or fabricated information, known as\nhallucinations. Therefore, detecting hallucinations has become critical to\nmaintaining trust in LLM-generated content. A primary challenge in learning a\ntruthfulness classifier is the lack of a large amount of labeled truthful and\nhallucinated data. To address the challenge, we introduce HaloScope, a novel\nlearning framework that leverages the unlabeled LLM generations in the wild for\nhallucination detection. Such unlabeled data arises freely upon deploying LLMs\nin the open world, and consists of both truthful and hallucinated information.\nTo harness the unlabeled data, we present an automated membership estimation\nscore for distinguishing between truthful and untruthful generations within\nunlabeled mixture data, thereby enabling the training of a binary truthfulness\nclassifier on top. Importantly, our framework does not require extra data\ncollection and human annotations, offering strong flexibility and practicality\nfor real-world applications. Extensive experiments show that HaloScope can\nachieve superior hallucination detection performance, outperforming the\ncompetitive rivals by a significant margin. Code is available at\nhttps://github.com/deeplearningwisc/haloscope.\n","authors":["Xuefeng Du","Chaowei Xiao","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.17504v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2402.10669v5","updated":"2024-09-26T03:16:52Z","published":"2024-02-16T13:21:06Z","title":"Humans or LLMs as the Judge? A Study on Judgement Biases","summary":" Adopting human and large language models (LLM) as judges (a.k.a human- and\nLLM-as-a-judge) for evaluating the performance of LLMs has recently gained\nattention. Nonetheless, this approach concurrently introduces potential biases\nfrom human and LLMs, questioning the reliability of the evaluation results. In\nthis paper, we propose a novel framework that is free from referencing\ngroundtruth annotations for investigating Misinformation Oversight Bias, Gender\nBias, Authority Bias and Beauty Bias on LLM and human judges. We curate a\ndataset referring to the revised Bloom's Taxonomy and conduct thousands of\nevaluations. Results show that human and LLM judges are vulnerable to\nperturbations to various degrees, and that even the cutting-edge judges possess\nconsiderable biases. We further exploit these biases to conduct attacks on LLM\njudges. We hope that our work can notify the community of the bias and\nvulnerability of human- and LLM-as-a-judge, as well as the urgency of\ndeveloping robust evaluation systems.\n","authors":["Guiming Hardy Chen","Shunian Chen","Ziche Liu","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2402.10669v5.pdf","comment":"EMNLP2024"},{"id":"http://arxiv.org/abs/2409.14509v3","updated":"2024-09-26T03:15:53Z","published":"2024-09-22T16:13:00Z","title":"Can AI writing be salvaged? Mitigating Idiosyncrasies and Improving\n Human-AI Alignment in the Writing Process through Edits","summary":" LLM-based applications are helping people write, and LLM-generated text is\nmaking its way into social media, journalism, and our classrooms. However, the\ndifferences between LLM-generated and human-written text remain unclear. To\nexplore this, we hired professional writers to edit paragraphs in several\ncreative domains. We first found these writers agree on undesirable\nidiosyncrasies in LLM-generated text, formalizing it into a seven-category\ntaxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP\ncorpus: 1,057 LLM-generated paragraphs edited by professional writers according\nto our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our\nstudy (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms\nof writing quality, revealing common limitations across model families. Third,\nwe explored automatic editing methods to improve LLM-generated text. A\nlarge-scale preference annotation confirms that although experts largely prefer\ntext edited by other experts, automatic editing methods show promise in\nimproving alignment between LLM-generated and human-written text.\n","authors":["Tuhin Chakrabarty","Philippe Laban","Chien-Sheng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.14509v3.pdf","comment":"NLP+HCI, Behavioral Science"},{"id":"http://arxiv.org/abs/2409.17481v1","updated":"2024-09-26T02:37:41Z","published":"2024-09-26T02:37:41Z","title":"MaskLLM: Learnable Semi-Structured Sparsity for Large Language Models","summary":" Large Language Models (LLMs) are distinguished by their massive parameter\ncounts, which typically result in significant redundancy. This work introduces\nMaskLLM, a learnable pruning method that establishes Semi-structured (or\n``N:M'') Sparsity in LLMs, aimed at reducing computational overhead during\ninference. Instead of developing a new importance criterion, MaskLLM explicitly\nmodels N:M patterns as a learnable distribution through Gumbel Softmax\nsampling. This approach facilitates end-to-end training on large-scale datasets\nand offers two notable advantages: 1) High-quality Masks - our method\neffectively scales to large datasets and learns accurate masks; 2)\nTransferability - the probabilistic modeling of mask distribution enables the\ntransfer learning of sparsity across domains or tasks. We assessed MaskLLM\nusing 2:4 sparsity on various LLMs, including LLaMA-2, Nemotron-4, and GPT-3,\nwith sizes ranging from 843M to 15B parameters, and our empirical results show\nsubstantial improvements over state-of-the-art methods. For instance, leading\napproaches achieve a perplexity (PPL) of 10 or greater on Wikitext compared to\nthe dense model's 5.12 PPL, but MaskLLM achieves a significantly lower 6.72 PPL\nsolely by learning the masks with frozen weights. Furthermore, MaskLLM's\nlearnable nature allows customized masks for lossless application of 2:4\nsparsity to downstream tasks or domains. Code is available at\n\\url{https://github.com/NVlabs/MaskLLM}.\n","authors":["Gongfan Fang","Hongxu Yin","Saurav Muralidharan","Greg Heinrich","Jeff Pool","Jan Kautz","Pavlo Molchanov","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17481v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17474v1","updated":"2024-09-26T02:19:13Z","published":"2024-09-26T02:19:13Z","title":"Reducing and Exploiting Data Augmentation Noise through Meta Reweighting\n Contrastive Learning for Text Classification","summary":" Data augmentation has shown its effectiveness in resolving the data-hungry\nproblem and improving model's generalization ability. However, the quality of\naugmented data can be varied, especially compared with the raw/original data.\nTo boost deep learning models' performance given augmented data/samples in text\nclassification tasks, we propose a novel framework, which leverages both meta\nlearning and contrastive learning techniques as parts of our design for\nreweighting the augmented samples and refining their feature representations\nbased on their quality. As part of the framework, we propose novel\nweight-dependent enqueue and dequeue algorithms to utilize augmented samples'\nweight/quality information effectively. Through experiments, we show that our\nframework can reasonably cooperate with existing deep learning models (e.g.,\nRoBERTa-base and Text-CNN) and augmentation techniques (e.g., Wordnet and\nEasydata) for specific supervised learning tasks. Experiment results show that\nour framework achieves an average of 1.6%, up to 4.3% absolute improvement on\nText-CNN encoders and an average of 1.4%, up to 4.4% absolute improvement on\nRoBERTa-base encoders on seven GLUE benchmark datasets compared with the best\nbaseline. We present an indepth analysis of our framework design, revealing the\nnon-trivial contributions of our network components. Our code is publicly\navailable for better reproducibility.\n","authors":["Guanyi Mou","Yichuan Li","Kyumin Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17474v1.pdf","comment":"IEEE BigData 2021"},{"id":"http://arxiv.org/abs/2409.17472v1","updated":"2024-09-26T02:16:48Z","published":"2024-09-26T02:16:48Z","title":"Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with\n Scoring-aware Multiple Rewards","summary":" Recent advances in automated essay scoring (AES) have shifted towards\nevaluating multiple traits to provide enriched feedback. Like typical AES\nsystems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure\nagreement with human raters, aligning closely with the rating schema; however,\nits non-differentiable nature prevents its direct use in neural network\ntraining. In this paper, we propose Scoring-aware Multi-reward Reinforcement\nLearning (SaMRL), which integrates actual evaluation schemes into the training\nprocess by designing QWK-based rewards with a mean-squared error penalty for\nmulti-trait AES. Existing reinforcement learning (RL) applications in AES are\nlimited to classification models despite associated performance degradation, as\nRL requires probability distributions; instead, we adopt an autoregressive\nscore generation framework to leverage token generation probabilities for\nrobust multi-trait score predictions. Empirical analyses demonstrate that SaMRL\nfacilitates model training, notably enhancing scoring of previously inferior\nprompts.\n","authors":["Heejin Do","Sangwon Ryu","Gary Geunbae Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17472v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.14552v2","updated":"2024-09-26T02:02:13Z","published":"2024-09-22T18:29:10Z","title":"Unleashing the Power of Emojis in Texts via Self-supervised Graph\n Pre-Training","summary":" Emojis have gained immense popularity on social platforms, serving as a\ncommon means to supplement or replace text. However, existing data mining\napproaches generally either completely ignore or simply treat emojis as\nordinary Unicode characters, which may limit the model's ability to grasp the\nrich semantic information in emojis and the interaction between emojis and\ntexts. Thus, it is necessary to release the emoji's power in social media data\nmining. To this end, we first construct a heterogeneous graph consisting of\nthree types of nodes, i.e. post, word and emoji nodes to improve the\nrepresentation of different elements in posts. The edges are also well-defined\nto model how these three elements interact with each other. To facilitate the\nsharing of information among post, word and emoji nodes, we propose a graph\npre-train framework for text and emoji co-modeling, which contains two graph\npre-training tasks: node-level graph contrastive learning and edge-level link\nreconstruction learning. Extensive experiments on the Xiaohongshu and Twitter\ndatasets with two types of downstream tasks demonstrate that our approach\nproves significant improvement over previous strong baseline methods.\n","authors":["Zhou Zhang","Dongzeng Tan","Jiaan Wang","Yilong Chen","Jiarong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.14552v2.pdf","comment":"Accepted by EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2409.17467v1","updated":"2024-09-26T01:57:27Z","published":"2024-09-26T01:57:27Z","title":"What is the social benefit of hate speech detection research? A\n Systematic Review","summary":" While NLP research into hate speech detection has grown exponentially in the\nlast three decades, there has been minimal uptake or engagement from policy\nmakers and non-profit organisations. We argue the absence of ethical frameworks\nhave contributed to this rift between current practice and best practice. By\nadopting appropriate ethical frameworks, NLP researchers may enable the social\nimpact potential of hate speech research. This position paper is informed by\nreviewing forty-eight hate speech detection systems associated with\nthirty-seven publications from different venues.\n","authors":["Sidney Gig-Jan Wong"],"pdf_url":"https://arxiv.org/pdf/2409.17467v1.pdf","comment":"Accepted to the 3rd Workshop on NLP for Positive Impact"},{"id":"http://arxiv.org/abs/2409.17458v1","updated":"2024-09-26T01:24:17Z","published":"2024-09-26T01:24:17Z","title":"RED QUEEN: Safeguarding Large Language Models against Concealed\n Multi-Turn Jailbreaking","summary":" The rapid progress of Large Language Models (LLMs) has opened up new\nopportunities across various domains and applications; yet it also presents\nchallenges related to potential misuse. To mitigate such risks, red teaming has\nbeen employed as a proactive security measure to probe language models for\nharmful outputs via jailbreak attacks. However, current jailbreak attack\napproaches are single-turn with explicit malicious queries that do not fully\ncapture the complexity of real-world interactions. In reality, users can engage\nin multi-turn interactions with LLM-based chat assistants, allowing them to\nconceal their true intentions in a more covert manner. To bridge this gap, we,\nfirst, propose a new jailbreak approach, RED QUEEN ATTACK. This method\nconstructs a multi-turn scenario, concealing the malicious intent under the\nguise of preventing harm. We craft 40 scenarios that vary in turns and select\n14 harmful categories to generate 56k multi-turn attack data points. We conduct\ncomprehensive experiments on the RED QUEEN ATTACK with four representative LLM\nfamilies of different sizes. Our experiments reveal that all LLMs are\nvulnerable to RED QUEEN ATTACK, reaching 87.62% attack success rate on GPT-4o\nand 75.4% on Llama3-70B. Further analysis reveals that larger models are more\nsusceptible to the RED QUEEN ATTACK, with multi-turn structures and concealment\nstrategies contributing to its success. To prioritize safety, we introduce a\nstraightforward mitigation strategy called RED QUEEN GUARD, which aligns LLMs\nto effectively counter adversarial attacks. This approach reduces the attack\nsuccess rate to below 1% while maintaining the model's performance across\nstandard benchmarks. Full implementation and dataset are publicly accessible at\nhttps://github.com/kriti-hippo/red_queen.\n","authors":["Yifan Jiang","Kriti Aggarwal","Tanmay Laud","Kashif Munir","Jay Pujara","Subhabrata Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2409.17458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17455v1","updated":"2024-09-26T01:17:42Z","published":"2024-09-26T01:17:42Z","title":"Navigating the Shortcut Maze: A Comprehensive Analysis of Shortcut\n Learning in Text Classification by Language Models","summary":" Language models (LMs), despite their advances, often depend on spurious\ncorrelations, undermining their accuracy and generalizability. This study\naddresses the overlooked impact of subtler, more complex shortcuts that\ncompromise model reliability beyond oversimplified shortcuts. We introduce a\ncomprehensive benchmark that categorizes shortcuts into occurrence, style, and\nconcept, aiming to explore the nuanced ways in which these shortcuts influence\nthe performance of LMs. Through extensive experiments across traditional LMs,\nlarge language models, and state-of-the-art robust models, our research\nsystematically investigates models' resilience and susceptibilities to\nsophisticated shortcuts. Our benchmark and code can be found at:\nhttps://github.com/yuqing-zhou/shortcut-learning-in-text-classification.\n","authors":["Yuqing Zhou","Ruixiang Tang","Ziyu Yao","Ziwei Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.17455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17452v1","updated":"2024-09-26T01:08:09Z","published":"2024-09-26T01:08:09Z","title":"Description-based Controllable Text-to-Speech with Cross-Lingual Voice\n Control","summary":" We propose a novel description-based controllable text-to-speech (TTS) method\nwith cross-lingual control capability. To address the lack of audio-description\npaired data in the target language, we combine a TTS model trained on the\ntarget language with a description control model trained on another language,\nwhich maps input text descriptions to the conditional features of the TTS\nmodel. These two models share disentangled timbre and style representations\nbased on self-supervised learning (SSL), allowing for disentangled voice\ncontrol, such as controlling speaking styles while retaining the original\ntimbre. Furthermore, because the SSL-based timbre and style representations are\nlanguage-agnostic, combining the TTS and description control models while\nsharing the same embedding space effectively enables cross-lingual control of\nvoice characteristics. Experiments on English and Japanese TTS demonstrate that\nour method achieves high naturalness and controllability for both languages,\neven though no Japanese audio-description pairs are used.\n","authors":["Ryuichi Yamamoto","Yuma Shirahata","Masaya Kawamura","Kentaro Tachibana"],"pdf_url":"https://arxiv.org/pdf/2409.17452v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2109.04993v3","updated":"2024-09-26T00:58:15Z","published":"2021-09-04T22:48:46Z","title":"LAViTeR: Learning Aligned Visual and Textual Representations Assisted by\n Image and Caption Generation","summary":" Pre-training visual and textual representations from large-scale image-text\npairs is becoming a standard approach for many downstream vision-language\ntasks. The transformer-based models learn inter and intra-modal attention\nthrough a list of self-supervised learning tasks. This paper proposes LAViTeR,\na novel architecture for visual and textual representation learning. The main\nmodule, Visual Textual Alignment (VTA) will be assisted by two auxiliary tasks,\nGAN-based image synthesis and Image Captioning. We also propose a new\nevaluation metric measuring the similarity between the learnt visual and\ntextual embedding. The experimental results on two public datasets, CUB and\nMS-COCO, demonstrate superior visual and textual representation alignment in\nthe joint feature embedding space\n","authors":["Mohammad Abuzar Hashemi","Zhanghexuan Li","Mihir Chauhan","Yan Shen","Abhishek Satbhai","Mir Basheer Ali","Mingchen Gao","Sargur Srihari"],"pdf_url":"https://arxiv.org/pdf/2109.04993v3.pdf","comment":"15 pages, 10 Figures, 5 Tables. Oral Presentation at Irish Machine\n Vision and Image Processing Conference Proceedings, 2024"},{"id":"http://arxiv.org/abs/2409.17448v1","updated":"2024-09-26T00:54:17Z","published":"2024-09-26T00:54:17Z","title":"Enhancing Financial Sentiment Analysis with Expert-Designed Hint","summary":" This paper investigates the role of expert-designed hint in enhancing\nsentiment analysis on financial social media posts. We explore the capability\nof large language models (LLMs) to empathize with writer perspectives and\nanalyze sentiments. Our findings reveal that expert-designed hint, i.e.,\npointing out the importance of numbers, significantly improve performances\nacross various LLMs, particularly in cases requiring perspective-taking skills.\nFurther analysis on tweets containing different types of numerical data\ndemonstrates that the inclusion of expert-designed hint leads to notable\nimprovements in sentiment analysis performance, especially for tweets with\nmonetary-related numbers. Our findings contribute to the ongoing discussion on\nthe applicability of Theory of Mind in NLP and open new avenues for improving\nsentiment analysis in financial domains through the strategic use of expert\nknowledge.\n","authors":["Chung-Chi Chen","Hiroya Takamura","Ichiro Kobayashi","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2409.17448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00948v2","updated":"2024-09-26T00:24:25Z","published":"2024-07-01T04:07:49Z","title":"View From Above: A Framework for Evaluating Distribution Shifts in Model\n Behavior","summary":" When large language models (LLMs) are asked to perform certain tasks, how can\nwe be sure that their learned representations align with reality? We propose a\ndomain-agnostic framework for systematically evaluating distribution shifts in\nLLMs decision-making processes, where they are given control of mechanisms\ngoverned by pre-defined rules. While individual LLM actions may appear\nconsistent with expected behavior, across a large number of trials,\nstatistically significant distribution shifts can emerge. To test this, we\nconstruct a well-defined environment with known outcome logic: blackjack. In\nmore than 1,000 trials, we uncover statistically significant evidence\nsuggesting behavioral misalignment in the learned representations of LLM.\n","authors":["Tanush Chopra","Michael Li","Jacob Haimes"],"pdf_url":"https://arxiv.org/pdf/2407.00948v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17073v2","updated":"2024-09-26T20:40:15Z","published":"2024-09-25T16:32:35Z","title":"Enhancing Post-Hoc Attributions in Long Document Comprehension via\n Coarse Grained Answer Decomposition","summary":" Accurately attributing answer text to its source document is crucial for\ndeveloping a reliable question-answering system. However, attribution for long\ndocuments remains largely unexplored. Post-hoc attribution systems are designed\nto map answer text back to the source document, yet the granularity of this\nmapping has not been addressed. Furthermore, a critical question arises: What\nexactly should be attributed? This involves identifying the specific\ninformation units within an answer that require grounding. In this paper, we\npropose and investigate a novel approach to the factual decomposition of\ngenerated answers for attribution, employing template-based in-context\nlearning. To accomplish this, we utilize the question and integrate negative\nsampling during few-shot in-context learning for decomposition. This approach\nenhances the semantic understanding of both abstractive and extractive answers.\nWe examine the impact of answer decomposition by providing a thorough\nexamination of various attribution approaches, ranging from retrieval-based\ntechniques to LLM-based attributors.\n","authors":["Pritika Ramu","Koustava Goswami","Apoorv Saxena","Balaji Vasan Srinivavsan"],"pdf_url":"https://arxiv.org/pdf/2409.17073v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.18128v1","updated":"2024-09-26T17:59:51Z","published":"2024-09-26T17:59:51Z","title":"FlowTurbo: Towards Real-time Flow-Based Image Generation with Velocity\n Refiner","summary":" Building on the success of diffusion models in visual generation, flow-based\nmodels reemerge as another prominent family of generative models that have\nachieved competitive or better performance in terms of both visual quality and\ninference speed. By learning the velocity field through flow-matching,\nflow-based models tend to produce a straighter sampling trajectory, which is\nadvantageous during the sampling process. However, unlike diffusion models for\nwhich fast samplers are well-developed, efficient sampling of flow-based\ngenerative models has been rarely explored. In this paper, we propose a\nframework called FlowTurbo to accelerate the sampling of flow-based models\nwhile still enhancing the sampling quality. Our primary observation is that the\nvelocity predictor's outputs in the flow-based models will become stable during\nthe sampling, enabling the estimation of velocity via a lightweight velocity\nrefiner. Additionally, we introduce several techniques including a pseudo\ncorrector and sample-aware compilation to further reduce inference time. Since\nFlowTurbo does not change the multi-step sampling paradigm, it can be\neffectively applied for various tasks such as image editing, inpainting, etc.\nBy integrating FlowTurbo into different flow-based models, we obtain an\nacceleration ratio of 53.1%$\\sim$58.3% on class-conditional generation and\n29.8%$\\sim$38.5% on text-to-image generation. Notably, FlowTurbo reaches an FID\nof 2.12 on ImageNet with 100 (ms / img) and FID of 3.93 with 38 (ms / img),\nachieving the real-time image generation and establishing the new\nstate-of-the-art. Code is available at https://github.com/shiml20/FlowTurbo.\n","authors":["Wenliang Zhao","Minglei Shi","Xumin Yu","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18128v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18127v1","updated":"2024-09-26T17:59:31Z","published":"2024-09-26T17:59:31Z","title":"EgoLM: Multi-Modal Language Model of Egocentric Motions","summary":" As the prevalence of wearable devices, learning egocentric motions becomes\nessential to develop contextual AI. In this work, we present EgoLM, a versatile\nframework that tracks and understands egocentric motions from multi-modal\ninputs, e.g., egocentric videos and motion sensors. EgoLM exploits rich\ncontexts for the disambiguation of egomotion tracking and understanding, which\nare ill-posed under single modality conditions. To facilitate the versatile and\nmulti-modal framework, our key insight is to model the joint distribution of\negocentric motions and natural languages using large language models (LLM).\nMulti-modal sensor inputs are encoded and projected to the joint latent space\nof language models, and used to prompt motion generation or text generation for\negomotion tracking or understanding, respectively. Extensive experiments on\nlarge-scale multi-modal human motion dataset validate the effectiveness of\nEgoLM as a generalist model for universal egocentric learning.\n","authors":["Fangzhou Hong","Vladimir Guzov","Hyo Jin Kim","Yuting Ye","Richard Newcombe","Ziwei Liu","Lingni Ma"],"pdf_url":"https://arxiv.org/pdf/2409.18127v1.pdf","comment":"Project Page: https://hongfz16.github.io/projects/EgoLM"},{"id":"http://arxiv.org/abs/2409.18125v1","updated":"2024-09-26T17:59:11Z","published":"2024-09-26T17:59:11Z","title":"LLaVA-3D: A Simple yet Effective Pathway to Empowering LMMs with\n 3D-awareness","summary":" Recent advancements in Large Multimodal Models (LMMs) have greatly enhanced\ntheir proficiency in 2D visual understanding tasks, enabling them to\neffectively process and understand images and videos. However, the development\nof LMMs with 3D-awareness for 3D scene understanding has been hindered by the\nlack of large-scale 3D vision-language datasets and powerful 3D encoders. In\nthis paper, we introduce a simple yet effective framework called LLaVA-3D.\nLeveraging the strong 2D understanding priors from LLaVA, our LLaVA-3D\nefficiently adapts LLaVA for 3D scene understanding without compromising 2D\nunderstanding capabilities. To achieve this, we employ a simple yet effective\nrepresentation, 3D Patch, which connects 2D CLIP patch features with their\ncorresponding positions in 3D space. By integrating the 3D Patches into 2D LMMs\nand employing joint 2D and 3D vision-language instruction tuning, we establish\na unified architecture for both 2D image understanding and 3D scene\nunderstanding. Experimental results show that LLaVA-3D converges 3.5x faster\nthan existing 3D LMMs when trained on 3D vision-language datasets. Moreover,\nLLaVA-3D not only achieves state-of-the-art performance across various 3D tasks\nbut also maintains comparable 2D image understanding and vision-language\nconversation capabilities with LLaVA.\n","authors":["Chenming Zhu","Tai Wang","Wenwei Zhang","Jiangmiao Pang","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18125v1.pdf","comment":"Project page: https://zcmax.github.io/projects/LLaVA-3D/"},{"id":"http://arxiv.org/abs/2409.18124v1","updated":"2024-09-26T17:58:55Z","published":"2024-09-26T17:58:55Z","title":"Lotus: Diffusion-based Visual Foundation Model for High-quality Dense\n Prediction","summary":" Leveraging the visual priors of pre-trained text-to-image diffusion models\noffers a promising solution to enhance zero-shot generalization in dense\nprediction tasks. However, existing methods often uncritically use the original\ndiffusion formulation, which may not be optimal due to the fundamental\ndifferences between dense prediction and image generation. In this paper, we\nprovide a systemic analysis of the diffusion formulation for the dense\nprediction, focusing on both quality and efficiency. And we find that the\noriginal parameterization type for image generation, which learns to predict\nnoise, is harmful for dense prediction; the multi-step noising/denoising\ndiffusion process is also unnecessary and challenging to optimize. Based on\nthese insights, we introduce Lotus, a diffusion-based visual foundation model\nwith a simple yet effective adaptation protocol for dense prediction.\nSpecifically, Lotus is trained to directly predict annotations instead of\nnoise, thereby avoiding harmful variance. We also reformulate the diffusion\nprocess into a single-step procedure, simplifying optimization and\nsignificantly boosting inference speed. Additionally, we introduce a novel\ntuning strategy called detail preserver, which achieves more accurate and\nfine-grained predictions. Without scaling up the training data or model\ncapacity, Lotus achieves SoTA performance in zero-shot depth and normal\nestimation across various datasets. It also significantly enhances efficiency,\nbeing hundreds of times faster than most existing diffusion-based methods.\n","authors":["Jing He","Haodong Li","Wei Yin","Yixun Liang","Leheng Li","Kaiqiang Zhou","Hongbo Liu","Bingbing Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18124v1.pdf","comment":"Project page: https://lotus3d.github.io/"},{"id":"http://arxiv.org/abs/2409.18121v1","updated":"2024-09-26T17:57:16Z","published":"2024-09-26T17:57:16Z","title":"Robot See Robot Do: Imitating Articulated Object Manipulation with\n Monocular 4D Reconstruction","summary":" Humans can learn to manipulate new objects by simply watching others;\nproviding robots with the ability to learn from such demonstrations would\nenable a natural interface specifying new behaviors. This work develops Robot\nSee Robot Do (RSRD), a method for imitating articulated object manipulation\nfrom a single monocular RGB human demonstration given a single static\nmulti-view object scan. We first propose 4D Differentiable Part Models\n(4D-DPM), a method for recovering 3D part motion from a monocular video with\ndifferentiable rendering. This analysis-by-synthesis approach uses part-centric\nfeature fields in an iterative optimization which enables the use of geometric\nregularizers to recover 3D motions from only a single video. Given this 4D\nreconstruction, the robot replicates object trajectories by planning bimanual\narm motions that induce the demonstrated object part motion. By representing\ndemonstrations as part-centric trajectories, RSRD focuses on replicating the\ndemonstration's intended behavior while considering the robot's own\nmorphological limits, rather than attempting to reproduce the hand's motion. We\nevaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part\ntrajectories and RSRD's physical execution performance on 9 objects across 10\ntrials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of\n87% success rate, for a total end-to-end success rate of 60% across 90 trials.\nNotably, this is accomplished using only feature fields distilled from large\npretrained vision models -- without any task-specific training, fine-tuning,\ndataset collection, or annotation. Project page:\nhttps://robot-see-robot-do.github.io\n","authors":["Justin Kerr","Chung Min Kim","Mingxuan Wu","Brent Yi","Qianqian Wang","Ken Goldberg","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2409.18121v1.pdf","comment":"CoRL 2024, Project page: https://robot-see-robot-do.github.io"},{"id":"http://arxiv.org/abs/2409.18120v1","updated":"2024-09-26T17:57:15Z","published":"2024-09-26T17:57:15Z","title":"EvMAPPER: High Altitude Orthomapping with Event Cameras","summary":" Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to\ncollect images about the world below. One of the most successful applications\nof UAVs is to generate orthomosaics or orthomaps, in which a series of images\nare integrated together to develop a larger map. However, the use of CMOS-based\ncameras with global or rolling shutters mean that orthomaps are vulnerable to\nchallenging light conditions, motion blur, and high-speed motion of\nindependently moving objects under the camera. Event cameras are less sensitive\nto these issues, as their pixels are able to trigger asynchronously on\nbrightness changes. This work introduces the first orthomosaic approach using\nevent cameras. In contrast to existing methods relying only on CMOS cameras,\nour approach enables map generation even in challenging light conditions,\nincluding direct sunlight and after sunset.\n","authors":["Fernando Cladera","Kenneth Chaney","M. Ani Hsieh","Camillo J. Taylor","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.18120v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2409.18114v1","updated":"2024-09-26T17:55:02Z","published":"2024-09-26T17:55:02Z","title":"EdgeRunner: Auto-regressive Auto-encoder for Artistic Mesh Generation","summary":" Current auto-regressive mesh generation methods suffer from issues such as\nincompleteness, insufficient detail, and poor generalization. In this paper, we\npropose an Auto-regressive Auto-encoder (ArAE) model capable of generating\nhigh-quality 3D meshes with up to 4,000 faces at a spatial resolution of\n$512^3$. We introduce a novel mesh tokenization algorithm that efficiently\ncompresses triangular meshes into 1D token sequences, significantly enhancing\ntraining efficiency. Furthermore, our model compresses variable-length\ntriangular meshes into a fixed-length latent space, enabling training latent\ndiffusion models for better generalization. Extensive experiments demonstrate\nthe superior quality, diversity, and generalization capabilities of our model\nin both point cloud and image-conditioned mesh generation tasks.\n","authors":["Jiaxiang Tang","Zhaoshuo Li","Zekun Hao","Xian Liu","Gang Zeng","Ming-Yu Liu","Qinsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18114v1.pdf","comment":"Project Page: https://research.nvidia.com/labs/dir/edgerunner/"},{"id":"http://arxiv.org/abs/2409.18111v1","updated":"2024-09-26T17:53:04Z","published":"2024-09-26T17:53:04Z","title":"E.T. Bench: Towards Open-Ended Event-Level Video-Language Understanding","summary":" Recent advances in Video Large Language Models (Video-LLMs) have demonstrated\ntheir great potential in general-purpose video understanding. To verify the\nsignificance of these models, a number of benchmarks have been proposed to\ndiagnose their capabilities in different scenarios. However, existing\nbenchmarks merely evaluate models through video-level question-answering,\nlacking fine-grained event-level assessment and task diversity. To fill this\ngap, we introduce E.T. Bench (Event-Level & Time-Sensitive Video Understanding\nBenchmark), a large-scale and high-quality benchmark for open-ended event-level\nvideo understanding. Categorized within a 3-level task taxonomy, E.T. Bench\nencompasses 7.3K samples under 12 tasks with 7K videos (251.4h total length)\nunder 8 domains, providing comprehensive evaluations. We extensively evaluated\n8 Image-LLMs and 12 Video-LLMs on our benchmark, and the results reveal that\nstate-of-the-art models for coarse-level (video-level) understanding struggle\nto solve our fine-grained tasks, e.g., grounding event-of-interests within\nvideos, largely due to the short video context length, improper time\nrepresentations, and lack of multi-event training data. Focusing on these\nissues, we further propose a strong baseline model, E.T. Chat, together with an\ninstruction-tuning dataset E.T. Instruct 164K tailored for fine-grained\nevent-level understanding. Our simple but effective solution demonstrates\nsuperior performance in multiple scenarios.\n","authors":["Ye Liu","Zongyang Ma","Zhongang Qi","Yang Wu","Ying Shan","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18111v1.pdf","comment":"Accepted to NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18102v1","updated":"2024-09-26T17:45:10Z","published":"2024-09-26T17:45:10Z","title":"MALPOLON: A Framework for Deep Species Distribution Modeling","summary":" This paper describes a deep-SDM framework, MALPOLON. Written in Python and\nbuilt upon the PyTorch library, this framework aims to facilitate training and\ninferences of deep species distribution models (deep-SDM) and sharing for users\nwith only general Python language skills (e.g., modeling ecologists) who are\ninterested in testing deep learning approaches to build new SDMs. More advanced\nusers can also benefit from the framework's modularity to run more specific\nexperiments by overriding existing classes while taking advantage of\npress-button examples to train neural networks on multiple classification tasks\nusing custom or provided raw and pre-processed datasets. The framework is\nopen-sourced on GitHub and PyPi along with extensive documentation and examples\nof use in various scenarios. MALPOLON offers straightforward installation,\nYAML-based configuration, parallel computing, multi-GPU utilization, baseline\nand foundational models for benchmarking, and extensive\ntutorials/documentation, aiming to enhance accessibility and performance\nscalability for ecologists and researchers.\n","authors":["Theo Larcher","Lukas Picek","Benjamin Deneu","Titouan Lorieul","Maximilien Servajean","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2409.18102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18101v1","updated":"2024-09-26T17:44:52Z","published":"2024-09-26T17:44:52Z","title":"AI-Powered Augmented Reality for Satellite Assembly, Integration and\n Test","summary":" The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is\nset to transform satellite Assembly, Integration, and Testing (AIT) processes\nby enhancing precision, minimizing human error, and improving operational\nefficiency in cleanroom environments. This paper presents a technical\ndescription of the European Space Agency's (ESA) project \"AI for AR in\nSatellite AIT,\" which combines real-time computer vision and AR systems to\nassist technicians during satellite assembly. Leveraging Microsoft HoloLens 2\nas the AR interface, the system delivers context-aware instructions and\nreal-time feedback, tackling the complexities of object recognition and 6D pose\nestimation in AIT workflows. All AI models demonstrated over 70% accuracy, with\nthe detection model exceeding 95% accuracy, indicating a high level of\nperformance and reliability. A key contribution of this work lies in the\neffective use of synthetic data for training AI models in AR applications,\naddressing the significant challenges of obtaining real-world datasets in\nhighly dynamic satellite environments, as well as the creation of the Segmented\nAnything Model for Automatic Labelling (SAMAL), which facilitates the automatic\nannotation of real data, achieving speeds up to 20 times faster than manual\nhuman annotation. The findings demonstrate the efficacy of AI-driven AR systems\nin automating critical satellite assembly tasks, setting a foundation for\nfuture innovations in the space industry.\n","authors":["Alvaro Patricio","Joao Valente","Atabak Dehban","Ines Cadilha","Daniel Reis","Rodrigo Ventura"],"pdf_url":"https://arxiv.org/pdf/2409.18101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18100v1","updated":"2024-09-26T17:44:29Z","published":"2024-09-26T17:44:29Z","title":"Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine\n Segmentation","summary":" Self-supervised pretraining (SSP) has shown promising results in learning\nfrom large unlabeled datasets and, thus, could be useful for automated\ncardiovascular magnetic resonance (CMR) short-axis cine segmentation. However,\ninconsistent reports of the benefits of SSP for segmentation have made it\ndifficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP\nmethods for CMR cine segmentation.\n To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were\nused for unlabeled pretraining with four SSP methods; SimCLR, positional\ncontrastive learning, DINO, and masked image modeling (MIM). Subsets of varying\nnumbers of subjects were used for supervised fine-tuning of 2D models for each\nSSP method, as well as to train a 2D baseline model from scratch. The\nfine-tuned models were compared to the baseline using the 3D Dice similarity\ncoefficient (DSC) in a test dataset of 140 subjects.\n The SSP methods showed no performance gains with the largest supervised\nfine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects\n(231 2D slices) are available for supervised training, SSP using MIM (DSC =\n0.86) improves over training from scratch (DSC = 0.82).\n This study found that SSP is valuable for CMR cine segmentation when labeled\ntraining data is scarce, but does not aid state-of-the-art deep learning\nmethods when ample labeled data is available. Moreover, the choice of SSP\nmethod is important. The code is publicly available at:\nhttps://github.com/q-cardIA/ssp-cmr-cine-segmentation\n","authors":["Rob A. J. de Mooij","Josien P. W. Pluim","Cian M. Scannell"],"pdf_url":"https://arxiv.org/pdf/2409.18100v1.pdf","comment":"Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at\n MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.18099v1","updated":"2024-09-26T17:44:20Z","published":"2024-09-26T17:44:20Z","title":"EfficientCrackNet: A Lightweight Model for Crack Segmentation","summary":" Crack detection, particularly from pavement images, presents a formidable\nchallenge in the domain of computer vision due to several inherent complexities\nsuch as intensity inhomogeneity, intricate topologies, low contrast, and noisy\nbackgrounds. Automated crack detection is crucial for maintaining the\nstructural integrity of essential infrastructures, including buildings,\npavements, and bridges. Existing lightweight methods often face challenges\nincluding computational inefficiency, complex crack patterns, and difficult\nbackgrounds, leading to inaccurate detection and impracticality for real-world\napplications. To address these limitations, we propose EfficientCrackNet, a\nlightweight hybrid model combining Convolutional Neural Networks (CNNs) and\ntransformers for precise crack segmentation. EfficientCrackNet integrates\ndepthwise separable convolutions (DSC) layers and MobileViT block to capture\nboth global and local features. The model employs an Edge Extraction Method\n(EEM) and for efficient crack edge detection without pretraining, and\nUltra-Lightweight Subspace Attention Module (ULSAM) to enhance feature\nextraction. Extensive experiments on three benchmark datasets Crack500,\nDeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior\nperformance compared to existing lightweight models, while requiring only 0.26M\nparameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance\nbetween accuracy and computational efficiency, outperforming state-of-the-art\nlightweight models, and providing a robust and adaptable solution for\nreal-world crack segmentation.\n","authors":["Abid Hasan Zim","Aquib Iqbal","Zaid Al-Huda","Asad Malik","Minoru Kuribayash"],"pdf_url":"https://arxiv.org/pdf/2409.18099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18092v1","updated":"2024-09-26T17:39:05Z","published":"2024-09-26T17:39:05Z","title":"DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion\n Probabilistic Models","summary":" Perception systems play a crucial role in autonomous driving, incorporating\nmultiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors\nare widely used to capture sparse point clouds of the vehicle's surroundings.\nHowever, such systems struggle to perceive occluded areas and gaps in the scene\ndue to the sparsity of these point clouds and their lack of semantics. To\naddress these challenges, Semantic Scene Completion (SSC) jointly predicts\nunobserved geometry and semantics in the scene given raw LiDAR measurements,\naiming for a more complete scene representation. Building on promising results\nof diffusion models in image generation and super-resolution tasks, we propose\ntheir extension to SSC by implementing the noising and denoising diffusion\nprocesses in the point and semantic spaces individually. To control the\ngeneration, we employ semantic LiDAR point clouds as conditional input and\ndesign local and global regularization losses to stabilize the denoising\nprocess. We evaluate our approach on autonomous driving datasets and our\napproach outperforms the state-of-the-art for SSC.\n","authors":["Helin Cao","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2409.18092v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.16147v2","updated":"2024-09-26T17:31:35Z","published":"2024-09-23T00:11:30Z","title":"Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with\n Enhanced Generalization and Personalization Abilities","summary":" Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant\npotential for modeling 3D head avatars, providing greater flexibility than\nmesh-based methods and more efficient rendering compared to NeRF-based\napproaches. Despite these advancements, the creation of controllable 3DGS-based\nhead avatars remains time-intensive, often requiring tens of minutes to hours.\nTo expedite this process, we here introduce the ``Gaussian D\\'ej\\`a-vu\"\nframework, which first obtains a generalized model of the head avatar and then\npersonalizes the result. The generalized model is trained on large 2D\n(synthetic and real) image datasets. This model provides a well-initialized 3D\nGaussian head that is further refined using a monocular video to achieve the\npersonalized head avatar. For personalizing, we propose learnable\nexpression-aware rectification blendmaps to correct the initial 3D Gaussians,\nensuring rapid convergence without the reliance on neural networks. Experiments\ndemonstrate that the proposed method meets its objectives. It outperforms\nstate-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as\nwell as reduces training time consumption to at least a quarter of the existing\nmethods, producing the avatar in minutes.\n","authors":["Peizhi Yan","Rabab Ward","Qiang Tang","Shan Du"],"pdf_url":"https://arxiv.org/pdf/2409.16147v2.pdf","comment":"11 pages, Accepted by WACV 2025 in Round 1"},{"id":"http://arxiv.org/abs/2409.18083v1","updated":"2024-09-26T17:26:18Z","published":"2024-09-26T17:26:18Z","title":"Stable Video Portraits","summary":" Rapid advances in the field of generative AI and text-to-image methods in\nparticular have transformed the way we interact with and perceive\ncomputer-generated imagery today. In parallel, much progress has been made in\n3D face reconstruction, using 3D Morphable Models (3DMM). In this paper, we\npresent SVP, a novel hybrid 2D/3D generation method that outputs photorealistic\nvideos of talking faces leveraging a large pre-trained text-to-image prior\n(2D), controlled via a 3DMM (3D). Specifically, we introduce a person-specific\nfine-tuning of a general 2D stable diffusion model which we lift to a video\nmodel by providing temporal 3DMM sequences as conditioning and by introducing a\ntemporal denoising procedure. As an output, this model generates temporally\nsmooth imagery of a person with 3DMM-based controls, i.e., a person-specific\navatar. The facial appearance of this person-specific avatar can be edited and\nmorphed to text-defined celebrities, without any fine-tuning at test time. The\nmethod is analyzed quantitatively and qualitatively, and we show that our\nmethod outperforms state-of-the-art monocular head avatar methods.\n","authors":["Mirela Ostrek","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2409.18083v1.pdf","comment":"Accepted at ECCV 2024, Project: https://svp.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2409.18082v1","updated":"2024-09-26T17:26:16Z","published":"2024-09-26T17:26:16Z","title":"SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language\n Models for Robotic Garment Manipulation","summary":" Automating garment manipulation poses a significant challenge for assistive\nrobotics due to the diverse and deformable nature of garments. Traditional\napproaches typically require separate models for each garment type, which\nlimits scalability and adaptability. In contrast, this paper presents a unified\napproach using vision-language models (VLMs) to improve keypoint prediction\nacross various garment categories. By interpreting both visual and semantic\ninformation, our model enables robots to manage different garment states with a\nsingle model. We created a large-scale synthetic dataset using advanced\nsimulation techniques, allowing scalable training without extensive real-world\ndata. Experimental results indicate that the VLM-based method significantly\nenhances keypoint detection accuracy and task success rates, providing a more\nflexible and general solution for robotic garment manipulation. In addition,\nthis research also underscores the potential of VLMs to unify various garment\nmanipulation tasks within a single framework, paving the way for broader\napplications in home automation and assistive robotics for future.\n","authors":["Xin Li","Siyuan Huang","Qiaojun Yu","Zhengkai Jiang","Ce Hao","Yimeng Zhu","Hongsheng Li","Peng Gao","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18071v1","updated":"2024-09-26T17:18:39Z","published":"2024-09-26T17:18:39Z","title":"FreeEdit: Mask-free Reference-based Image Editing with Multi-modal\n Instruction","summary":" Introducing user-specified visual concepts in image editing is highly\npractical as these concepts convey the user's intent more precisely than\ntext-based descriptions. We propose FreeEdit, a novel approach for achieving\nsuch reference-based image editing, which can accurately reproduce the visual\nconcept from the reference image based on user-friendly language instructions.\nOur approach leverages the multi-modal instruction encoder to encode language\ninstructions to guide the editing process. This implicit way of locating the\nediting area eliminates the need for manual editing masks. To enhance the\nreconstruction of reference details, we introduce the Decoupled Residual\nReferAttention (DRRA) module. This module is designed to integrate fine-grained\nreference features extracted by a detail extractor into the image editing\nprocess in a residual way without interfering with the original self-attention.\nGiven that existing datasets are unsuitable for reference-based image editing\ntasks, particularly due to the difficulty in constructing image triplets that\ninclude a reference image, we curate a high-quality dataset, FreeBench, using a\nnewly developed twice-repainting scheme. FreeBench comprises the images before\nand after editing, detailed editing instructions, as well as a reference image\nthat maintains the identity of the edited object, encompassing tasks such as\nobject addition, replacement, and deletion. By conducting phased training on\nFreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot\nediting through convenient language instructions. We conduct extensive\nexperiments to evaluate the effectiveness of FreeEdit across multiple task\ntypes, demonstrating its superiority over existing methods. The code will be\navailable at: https://freeedit.github.io/.\n","authors":["Runze He","Kai Ma","Linjiang Huang","Shaofei Huang","Jialin Gao","Xiaoming Wei","Jiao Dai","Jizhong Han","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18071v1.pdf","comment":"14 pages, 14 figures, project website: https://freeedit.github.io/"},{"id":"http://arxiv.org/abs/2409.18057v1","updated":"2024-09-26T17:00:02Z","published":"2024-09-26T17:00:02Z","title":"LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field","summary":" Recent works have shown that neural radiance fields (NeRFs) on top of\nparametric models have reached SOTA quality to build photorealistic head\navatars from a monocular video. However, one major limitation of the NeRF-based\navatars is the slow rendering speed due to the dense point sampling of NeRF,\npreventing them from broader utility on resource-constrained devices. We\nintroduce LightAvatar, the first head avatar model based on neural light fields\n(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose\nvia a single network forward pass, without using mesh or volume rendering. The\nproposed approach, while being conceptually appealing, poses a significant\nchallenge towards real-time efficiency and training stability. To resolve them,\nwe introduce dedicated network designs to obtain proper representations for the\nNeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a\ndistillation-based training strategy that uses a pretrained avatar model as\nteacher to synthesize abundant pseudo data for training. A warping field\nnetwork is introduced to correct the fitting error in the real data so that the\nmodel can learn better. Extensive experiments suggest that our method can\nachieve new SOTA image quality quantitatively or qualitatively, while being\nsignificantly faster than the counterparts, reporting 174.1 FPS (512x512\nresolution) on a consumer-grade GPU (RTX3090) with no customized optimization.\n","authors":["Huan Wang","Feitong Tan","Ziqian Bai","Yinda Zhang","Shichen Liu","Qiangeng Xu","Menglei Chai","Anish Prabhu","Rohit Pandey","Sean Fanello","Zeng Huang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2409.18057v1.pdf","comment":"Appear in ECCV'24 CADL Workshop. Code:\n https://github.com/MingSun-Tse/LightAvatar-TensorFlow"},{"id":"http://arxiv.org/abs/2409.18055v1","updated":"2024-09-26T16:59:01Z","published":"2024-09-26T16:59:01Z","title":"Visual Data Diagnosis and Debiasing with Concept Graphs","summary":" The widespread success of deep learning models today is owed to the curation\nof extensive datasets significant in size and complexity. However, such models\nfrequently pick up inherent biases in the data during the training process,\nleading to unreliable predictions. Diagnosing and debiasing datasets is thus a\nnecessity to ensure reliable model performance. In this paper, we present\nCONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence\nBiases in visual datasets. CONBIAS represents visual datasets as knowledge\ngraphs of concepts, enabling meticulous analysis of spurious concept\nco-occurrences to uncover concept imbalances across the whole dataset.\nMoreover, we show that by employing a novel clique-based concept balancing\nstrategy, we can mitigate these imbalances, leading to enhanced performance on\ndownstream tasks. Extensive experiments show that data augmentation based on a\nbalanced concept distribution augmented by CONBIAS improves generalization\nperformance across multiple datasets compared to state-of-the-art methods. We\nwill make our code and data publicly available.\n","authors":["Rwiddhi Chakraborty","Yinong Wang","Jialu Gao","Runkai Zheng","Cheng Zhang","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2409.18055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08168v3","updated":"2024-09-26T16:51:37Z","published":"2023-12-13T14:27:45Z","title":"Chat-Scene: Bridging 3D Scene and Large Language Models with Object\n Identifiers","summary":" Recent advancements in 3D Large Language Models (LLMs) have demonstrated\npromising capabilities for 3D scene understanding. However, previous methods\nexhibit deficiencies in general referencing and grounding capabilities for\nintricate scene comprehension. In this paper, we introduce the use of object\nidentifiers and object-centric representations to interact with scenes at the\nobject level. Specifically, we decompose the input 3D scene into a set of\nobject proposals, each assigned a unique identifier token, which enables\nefficient object referencing and grounding during user-assistant interactions.\nGiven the scarcity of scene-language data, we model the scene embeddings as a\nsequence of explicit object-level embeddings, derived from semantic-rich 2D or\n3D representations. By employing object identifiers, we transform diverse 3D\nscene-language tasks into a unified question-answering format, facilitating\njoint training without the need for additional task-specific heads. With\nminimal fine-tuning on all downstream tasks, our model significantly\noutperforms existing methods on benchmarks including ScanRefer, Multi3DRefer,\nScan2Cap, ScanQA, and SQA3D.\n","authors":["Haifeng Huang","Yilun Chen","Zehan Wang","Rongjie Huang","Runsen Xu","Tai Wang","Luping Liu","Xize Cheng","Yang Zhao","Jiangmiao Pang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.08168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18042v1","updated":"2024-09-26T16:44:02Z","published":"2024-09-26T16:44:02Z","title":"EMOVA: Empowering Language Models to See, Hear and Speak with Vivid\n Emotions","summary":" GPT-4o, an omni-modal model that enables vocal conversations with diverse\nemotions and tones, marks a milestone for omni-modal foundation models.\nHowever, empowering Large Language Models to perceive and generate images,\ntexts, and speeches end-to-end with publicly available data remains challenging\nin the open-source community. Existing vision-language models rely on external\ntools for the speech processing, while speech-language models still suffer from\nlimited or even without vision-understanding abilities. To address this gap, we\npropose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large\nLanguage Models with end-to-end speech capabilities while maintaining the\nleading vision-language performance. With a semantic-acoustic disentangled\nspeech tokenizer, we notice surprisingly that omni-modal alignment can further\nenhance vision-language and speech abilities compared with the corresponding\nbi-modal aligned counterparts. Moreover, a lightweight style module is proposed\nfor flexible speech style controls (e.g., emotions and pitches). For the first\ntime, EMOVA achieves state-of-the-art performance on both the vision-language\nand speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue\nwith vivid emotions.\n","authors":["Kai Chen","Yunhao Gou","Runhui Huang","Zhili Liu","Daxin Tan","Jing Xu","Chunwei Wang","Yi Zhu","Yihan Zeng","Kuo Yang","Dingdong Wang","Kun Xiang","Haoyuan Li","Haoli Bai","Jianhua Han","Xiaohui Li","Weike Jin","Nian Xie","Yu Zhang","James T. Kwok","Hengshuang Zhao","Xiaodan Liang","Dit-Yan Yeung","Xiao Chen","Zhenguo Li","Wei Zhang","Qun Liu","Lanqing Hong","Lu Hou","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18042v1.pdf","comment":"Project Page: https://emova-ollm.github.io/"},{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.18026v1","updated":"2024-09-26T16:33:16Z","published":"2024-09-26T16:33:16Z","title":"ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty\n Learning","summary":" Vision-centric semantic occupancy prediction plays a crucial role in\nautonomous driving, which requires accurate and reliable predictions from\nlow-cost sensors. Although having notably narrowed the accuracy gap with LiDAR,\nthere is still few research effort to explore the reliability in predicting\nsemantic occupancy from camera. In this paper, we conduct a comprehensive\nevaluation of existing semantic occupancy prediction models from a reliability\nperspective for the first time. Despite the gradual alignment of camera-based\nmodels with LiDAR in term of accuracy, a significant reliability gap persists.\nTo addresses this concern, we propose ReliOcc, a method designed to enhance the\nreliability of camera-based occupancy networks. ReliOcc provides a\nplug-and-play scheme for existing models, which integrates hybrid uncertainty\nfrom individual voxels with sampling-based noise and relative voxels through\nmix-up learning. Besides, an uncertainty-aware calibration strategy is devised\nto further enhance model reliability in offline mode. Extensive experiments\nunder various settings demonstrate that ReliOcc significantly enhances model\nreliability while maintaining the accuracy of both geometric and semantic\npredictions. Importantly, our proposed approach exhibits robustness to sensor\nfailures and out of domain noises during inference.\n","authors":["Song Wang","Zhongdao Wang","Jiawei Yu","Wentong Li","Bailan Feng","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.18026v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2409.18017v1","updated":"2024-09-26T16:25:48Z","published":"2024-09-26T16:25:48Z","title":"Transferring disentangled representations: bridging the gap between\n synthetic and real images","summary":" Developing meaningful and efficient representations that separate the\nfundamental structure of the data generation mechanism is crucial in\nrepresentation learning. However, Disentangled Representation Learning has not\nfully shown its potential on real images, because of correlated generative\nfactors, their resolution and limited access to ground truth labels.\nSpecifically on the latter, we investigate the possibility of leveraging\nsynthetic data to learn general-purpose disentangled representations applicable\nto real data, discussing the effect of fine-tuning and what properties of\ndisentanglement are preserved after the transfer. We provide an extensive\nempirical study to address these issues. In addition, we propose a new\ninterpretable intervention-based metric, to measure the quality of factors\nencoding in the representation. Our results indicate that some level of\ndisentanglement, transferring a representation from synthetic to real data, is\npossible and effective.\n","authors":["Jacopo Dapueto","Nicoletta Noceti","Francesca Odone"],"pdf_url":"https://arxiv.org/pdf/2409.18017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14579v2","updated":"2024-09-26T16:25:33Z","published":"2023-12-22T10:15:15Z","title":"Synthesizing Environment-Specific People in Photographs","summary":" We present ESP, a novel method for context-aware full-body generation, that\nenables photo-realistic synthesis and inpainting of people wearing clothing\nthat is semantically appropriate for the scene depicted in an input photograph.\nESP is conditioned on a 2D pose and contextual cues that are extracted from the\nphotograph of the scene and integrated into the generation process, where the\nclothing is modeled explicitly with human parsing masks (HPM). Generated HPMs\nare used as tight guiding masks for inpainting, such that no changes are made\nto the original background. Our models are trained on a dataset containing a\nset of in-the-wild photographs of people covering a wide range of different\nenvironments. The method is analyzed quantitatively and qualitatively, and we\nshow that ESP outperforms the state-of-the-art on the task of contextual\nfull-body generation.\n","authors":["Mirela Ostrek","Carol O'Sullivan","Michael J. Black","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2312.14579v2.pdf","comment":"Accepted at ECCV 2024, Project: https://esp.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2406.08113v3","updated":"2024-09-26T16:14:54Z","published":"2024-06-12T11:50:51Z","title":"Valeo4Cast: A Modular Approach to End-to-End Forecasting","summary":" Motion forecasting is crucial in autonomous driving systems to anticipate the\nfuture trajectories of surrounding agents such as pedestrians, vehicles, and\ntraffic signals. In end-to-end forecasting, the model must jointly detect and\ntrack from sensor data (cameras or LiDARs) the past trajectories of the\ndifferent elements of the scene and predict their future locations. We depart\nfrom the current trend of tackling this task via end-to-end training from\nperception to forecasting, and instead use a modular approach. We individually\nbuild and train detection, tracking and forecasting modules. We then only use\nconsecutive finetuning steps to integrate the modules better and alleviate\ncompounding errors. We conduct an in-depth study on the finetuning strategies\nand it reveals that our simple yet effective approach significantly improves\nperformance on the end-to-end forecasting benchmark. Consequently, our solution\nranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82\nmAPf. We surpass forecasting results by +17.1 points over last year's winner\nand by +13.3 points over this year's runner-up. This remarkable performance in\nforecasting can be explained by our modular paradigm, which integrates\nfinetuning strategies and significantly outperforms the end-to-end-trained\ncounterparts. The code, model weights and results are made available\nhttps://github.com/valeoai/valeo4cast.\n","authors":["Yihong Xu","Éloi Zablocki","Alexandre Boulch","Gilles Puy","Mickael Chen","Florent Bartoccioni","Nermin Samet","Oriane Siméoni","Spyros Gidaris","Tuan-Hung Vu","Andrei Bursuc","Eduardo Valle","Renaud Marlet","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2406.08113v3.pdf","comment":"Winning solution of the Argoverse 2 \"Unified Detection, Tracking, and\n Forecasting\" challenge; work accepted at Road++ ECCVW 2024"},{"id":"http://arxiv.org/abs/2312.05295v2","updated":"2024-09-26T16:11:37Z","published":"2023-12-08T18:43:12Z","title":"Disentangled Clothed Avatar Generation from Text Descriptions","summary":" In this paper, we introduce a novel text-to-avatar generation method that\nseparately generates the human body and the clothes and allows high-quality\nanimation on the generated avatar. While recent advancements in text-to-avatar\ngeneration have yielded diverse human avatars from text prompts, these methods\ntypically combine all elements-clothes, hair, and body-into a single 3D\nrepresentation. Such an entangled approach poses challenges for downstream\ntasks like editing or animation. To overcome these limitations, we propose a\nnovel disentangled 3D avatar representation named Sequentially Offset-SMPL\n(SO-SMPL), building upon the SMPL model. SO-SMPL represents the human body and\nclothes with two separate meshes but associates them with offsets to ensure the\nphysical alignment between the body and the clothes. Then, we design a Score\nDistillation Sampling (SDS)-based distillation framework to generate the\nproposed SO-SMPL representation from text prompts. Our approach not only\nachieves higher texture and geometry quality and better semantic alignment with\ntext prompts, but also significantly improves the visual quality of character\nanimation, virtual try-on, and avatar editing. Project page:\nhttps://shanemankiw.github.io/SO-SMPL/.\n","authors":["Jionghao Wang","Yuan Liu","Zhiyang Dou","Zhengming Yu","Yongqing Liang","Cheng Lin","Xin Li","Wenping Wang","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2312.05295v2.pdf","comment":"Project page: https://shanemankiw.github.io/SO-SMPL/"},{"id":"http://arxiv.org/abs/2409.17996v1","updated":"2024-09-26T16:07:24Z","published":"2024-09-26T16:07:24Z","title":"PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless\n Imaging","summary":" Lensless cameras offer significant advantages in size, weight, and cost\ncompared to traditional lens-based systems. Without a focusing lens, lensless\ncameras rely on computational algorithms to recover the scenes from multiplexed\nmeasurements. However, current algorithms struggle with inaccurate forward\nimaging models and insufficient priors to reconstruct high-quality images. To\novercome these limitations, we introduce a novel two-stage approach for\nconsistent and photorealistic lensless image reconstruction. The first stage of\nour approach ensures data consistency by focusing on accurately reconstructing\nthe low-frequency content with a spatially varying deconvolution method that\nadjusts to changes in the Point Spread Function (PSF) across the camera's field\nof view. The second stage enhances photorealism by incorporating a generative\nprior from pre-trained diffusion models. By conditioning on the low-frequency\ncontent retrieved in the first stage, the diffusion model effectively\nreconstructs the high-frequency details that are typically lost in the lensless\nimaging process, while also maintaining image fidelity. Our method achieves a\nsuperior balance between data fidelity and visual quality compared to existing\nmethods, as demonstrated with two popular lensless systems, PhlatCam and\nDiffuserCam. Project website: https://phocolens.github.io/.\n","authors":["Xin Cai","Zhiyuan You","Hailong Zhang","Wentao Liu","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.17996v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17993v1","updated":"2024-09-26T16:04:31Z","published":"2024-09-26T16:04:31Z","title":"InterNet: Unsupervised Cross-modal Homography Estimation Based on\n Interleaved Modality Transfer and Self-supervised Homography Prediction","summary":" We propose a novel unsupervised cross-modal homography estimation framework,\nbased on interleaved modality transfer and self-supervised homography\nprediction, named InterNet. InterNet integrates modality transfer and\nself-supervised homography estimation, introducing an innovative interleaved\noptimization framework to alternately promote both components. The modality\ntransfer gradually narrows the modality gaps, facilitating the self-supervised\nhomography estimation to fully leverage the synthetic intra-modal data. The\nself-supervised homography estimation progressively achieves reliable\npredictions, thereby providing robust cross-modal supervision for the modality\ntransfer. To further boost the estimation accuracy, we also formulate a\nfine-grained homography feature loss to improve the connection between two\ncomponents. Furthermore, we employ a simple yet effective distillation training\ntechnique to reduce model parameters and improve cross-domain generalization\nability while maintaining comparable performance. Experiments reveal that\nInterNet achieves the state-of-the-art (SOTA) performance among unsupervised\nmethods, and even outperforms many supervised methods such as MHN and\nLocalTrans.\n","authors":["Junchen Yu","Si-Yuan Cao","Runmin Zhang","Chenghao Zhang","Jianxin Hu","Zhu Yu","Hui-liang Shen"],"pdf_url":"https://arxiv.org/pdf/2409.17993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17988v1","updated":"2024-09-26T15:57:20Z","published":"2024-09-26T15:57:20Z","title":"Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or\n Low-light Conditions","summary":" The stark contrast in the design philosophy of an event camera makes it\nparticularly ideal for operating under high-speed, high dynamic range and\nlow-light conditions, where standard cameras underperform. Nonetheless, event\ncameras still suffer from some amount of motion blur, especially under these\nchallenging conditions, in contrary to what most think. This is attributed to\nthe limited bandwidth of the event sensor pixel, which is mostly proportional\nto the light intensity. Thus, to ensure that event cameras can truly excel in\nsuch conditions where it has an edge over standard cameras, it is crucial to\naccount for event motion blur in downstream applications, especially\nreconstruction. However, none of the recent works on reconstructing Neural\nRadiance Fields (NeRFs) from events, nor event simulators, have considered the\nfull effects of event motion blur. To this end, we propose, Deblur e-NeRF, a\nnovel method to directly and effectively reconstruct blur-minimal NeRFs from\nmotion-blurred events generated under high-speed motion or low-light\nconditions. The core component of this work is a physically-accurate pixel\nbandwidth model proposed to account for event motion blur under arbitrary speed\nand lighting conditions. We also introduce a novel threshold-normalized total\nvariation loss to improve the regularization of large textureless patches.\nExperiments on real and novel realistically simulated sequences verify our\neffectiveness. Our code, event simulator and synthetic event dataset will be\nopen-sourced.\n","authors":["Weng Fei Low","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17988v1.pdf","comment":"Accepted to ECCV 2024. Project website is accessible at\n https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with\n arXiv:2006.07722 by other authors"},{"id":"http://arxiv.org/abs/2409.17987v1","updated":"2024-09-26T15:57:08Z","published":"2024-09-26T15:57:08Z","title":"LLM4Brain: Training a Large Language Model for Brain Video Understanding","summary":" Decoding visual-semantic information from brain signals, such as functional\nMRI (fMRI), across different subjects poses significant challenges, including\nlow signal-to-noise ratio, limited data availability, and cross-subject\nvariability. Recent advancements in large language models (LLMs) show\nremarkable effectiveness in processing multimodal information. In this study,\nwe introduce an LLM-based approach for reconstructing visual-semantic\ninformation from fMRI signals elicited by video stimuli. Specifically, we\nemploy fine-tuning techniques on an fMRI encoder equipped with adaptors to\ntransform brain responses into latent representations aligned with the video\nstimuli. Subsequently, these representations are mapped to textual modality by\nLLM. In particular, we integrate self-supervised domain adaptation methods to\nenhance the alignment between visual-semantic information and brain responses.\nOur proposed method achieves good results using various quantitative semantic\nmetrics, while yielding similarity with ground-truth information.\n","authors":["Ruizhe Zheng","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2409.17987v1.pdf","comment":"ECCV2024 Workshop"},{"id":"http://arxiv.org/abs/2409.17981v1","updated":"2024-09-26T15:54:18Z","published":"2024-09-26T15:54:18Z","title":"BlinkTrack: Feature Tracking over 100 FPS via Events and Images","summary":" Feature tracking is crucial for, structure from motion (SFM), simultaneous\nlocalization and mapping (SLAM), object tracking and various computer vision\ntasks. Event cameras, known for their high temporal resolution and ability to\ncapture asynchronous changes, have gained significant attention for their\npotential in feature tracking, especially in challenging conditions. However,\nevent cameras lack the fine-grained texture information that conventional\ncameras provide, leading to error accumulation in tracking. To address this, we\npropose a novel framework, BlinkTrack, which integrates event data with RGB\nimages for high-frequency feature tracking. Our method extends the traditional\nKalman filter into a learning-based framework, utilizing differentiable Kalman\nfilters in both event and image branches. This approach improves\nsingle-modality tracking, resolves ambiguities, and supports asynchronous data\nfusion. We also introduce new synthetic and augmented datasets to better\nevaluate our model. Experimental results indicate that BlinkTrack significantly\noutperforms existing event-based methods, exceeding 100 FPS with preprocessed\nevent data and 80 FPS with multi-modality data.\n","authors":["Yichen Shen","Yijin Li","Shuo Chen","Guanglin Li","Zhaoyang Huang","Hujun Bao","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17977v1","updated":"2024-09-26T15:52:34Z","published":"2024-09-26T15:52:34Z","title":"Cross-Modality Attack Boosted by Gradient-Evolutionary Multiform\n Optimization","summary":" In recent years, despite significant advancements in adversarial attack\nresearch, the security challenges in cross-modal scenarios, such as the\ntransferability of adversarial attacks between infrared, thermal, and RGB\nimages, have been overlooked. These heterogeneous image modalities collected by\ndifferent hardware devices are widely prevalent in practical applications, and\nthe substantial differences between modalities pose significant challenges to\nattack transferability. In this work, we explore a novel cross-modal\nadversarial attack strategy, termed multiform attack. We propose a dual-layer\noptimization framework based on gradient-evolution, facilitating efficient\nperturbation transfer between modalities. In the first layer of optimization,\nthe framework utilizes image gradients to learn universal perturbations within\neach modality and employs evolutionary algorithms to search for shared\nperturbations with transferability across different modalities through\nsecondary optimization. Through extensive testing on multiple heterogeneous\ndatasets, we demonstrate the superiority and robustness of Multiform Attack\ncompared to existing techniques. This work not only enhances the\ntransferability of cross-modal adversarial attacks but also provides a new\nperspective for understanding security vulnerabilities in cross-modal systems.\n","authors":["Yunpeng Gong","Qingyuan Zeng","Dejun Xu","Zhenzhong Wang","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.17977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17963v1","updated":"2024-09-26T15:41:18Z","published":"2024-09-26T15:41:18Z","title":"CNCA: Toward Customizable and Natural Generation of Adversarial\n Camouflage for Vehicle Detectors","summary":" Prior works on physical adversarial camouflage against vehicle detectors\nmainly focus on the effectiveness and robustness of the attack. The current\nmost successful methods optimize 3D vehicle texture at a pixel level. However,\nthis results in conspicuous and attention-grabbing patterns in the generated\ncamouflage, which humans can easily identify. To address this issue, we propose\na Customizable and Natural Camouflage Attack (CNCA) method by leveraging an\noff-the-shelf pre-trained diffusion model. By sampling the optimal texture\nimage from the diffusion model with a user-specific text prompt, our method can\ngenerate natural and customizable adversarial camouflage while maintaining high\nattack performance. With extensive experiments on the digital and physical\nworlds and user studies, the results demonstrate that our proposed method can\ngenerate significantly more natural-looking camouflage than the\nstate-of-the-art baselines while achieving competitive attack performance. Our\ncode is available at\n\\href{https://anonymous.4open.science/r/CNCA-1D54}{https://anonymous.4open.science/r/CNCA-1D54}\n","authors":["Linye Lyu","Jiawei Zhou","Daojing He","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2409.17963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10814v3","updated":"2024-09-26T15:37:58Z","published":"2023-08-21T16:03:35Z","title":"Jumping through Local Minima: Quantization in the Loss Landscape of\n Vision Transformers","summary":" Quantization scale and bit-width are the most important parameters when\nconsidering how to quantize a neural network. Prior work focuses on optimizing\nquantization scales in a global manner through gradient methods (gradient\ndescent \\& Hessian analysis). Yet, when applying perturbations to quantization\nscales, we observe a very jagged, highly non-smooth test loss landscape. In\nfact, small perturbations in quantization scale can greatly affect accuracy,\nyielding a $0.5-0.8\\%$ accuracy boost in 4-bit quantized vision transformers\n(ViTs). In this regime, gradient methods break down, since they cannot reliably\nreach local minima. In our work, dubbed Evol-Q, we use evolutionary search to\neffectively traverse the non-smooth landscape. Additionally, we propose using\nan infoNCE loss, which not only helps combat overfitting on the small\ncalibration dataset ($1,000$ images) but also makes traversing such a highly\nnon-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully\nquantized ViT-Base by $10.30\\%$, $0.78\\%$, and $0.15\\%$ for $3$-bit, $4$-bit,\nand $8$-bit weight quantization levels. Extensive experiments on a variety of\nCNN and ViT architectures further demonstrate its robustness in extreme\nquantization scenarios. Our code is available at\nhttps://github.com/enyac-group/evol-q\n","authors":["Natalia Frumkin","Dibakar Gope","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2308.10814v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2211.09643"},{"id":"http://arxiv.org/abs/2409.17958v1","updated":"2024-09-26T15:36:10Z","published":"2024-09-26T15:36:10Z","title":"The Hard Positive Truth about Vision-Language Compositionality","summary":" Several benchmarks have concluded that our best vision-language models (e.g.,\nCLIP) are lacking in compositionality. Given an image, these benchmarks probe a\nmodel's ability to identify its associated caption amongst a set of\ncompositional distractors. In response, a surge of recent proposals show\nimprovements by finetuning CLIP with distractors as hard negatives. Our\ninvestigations reveal that these improvements have, in fact, been significantly\noverstated -- because existing benchmarks do not probe whether finetuned\nvision-language models remain invariant to hard positives. By curating an\nevaluation dataset with 112,382 hard negatives and hard positives, we uncover\nthat including hard positives decreases CLIP's performance by 12.9%, while\nhumans perform effortlessly at 99%. CLIP finetuned with hard negatives results\nin an even larger decrease, up to 38.7%. With this finding, we then produce a\n1,775,259 image-text training set with both hard negative and hard positive\ncaptions. By training with both, we see improvements on existing benchmarks\nwhile simultaneously improving performance on hard positives, indicating a more\nrobust improvement in compositionality. Our work suggests the need for future\nresearch to rigorously test and improve CLIP's understanding of semantic\nrelationships between related \"positive\" concepts.\n","authors":["Amita Kamath","Cheng-Yu Hsieh","Kai-Wei Chang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2409.17958v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2312.14115v4","updated":"2024-09-26T15:30:00Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Visual Question Answering for Autonomous Driving","summary":" We introduce LingoQA, a novel dataset and benchmark for visual question\nanswering in autonomous driving. The dataset contains 28K unique short video\nscenarios, and 419K annotations. Evaluating state-of-the-art vision-language\nmodels on our benchmark shows that their performance is below human\ncapabilities, with GPT-4V responding truthfully to 59.6% of the questions\ncompared to 96.6% for humans. For evaluation, we propose a truthfulness\nclassifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation\ncoefficient to human evaluations, surpassing existing techniques like METEOR,\nBLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run\nextensive ablation studies to understand its performance. We release our\ndataset and benchmark as an evaluation platform for vision-language models in\nautonomous driving.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v4.pdf","comment":"Accepted to ECCV 2024. Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2409.17951v1","updated":"2024-09-26T15:28:25Z","published":"2024-09-26T15:28:25Z","title":"Spatial Hierarchy and Temporal Attention Guided Cross Masking for\n Self-supervised Skeleton-based Action Recognition","summary":" In self-supervised skeleton-based action recognition, the mask reconstruction\nparadigm is gaining interest in enhancing model refinement and robustness\nthrough effective masking. However, previous works primarily relied on a single\nmasking criterion, resulting in the model overfitting specific features and\noverlooking other effective information. In this paper, we introduce a\nhierarchy and attention guided cross-masking framework (HA-CM) that applies\nmasking to skeleton sequences from both spatial and temporal perspectives.\nSpecifically, in spatial graphs, we utilize hyperbolic space to maintain joint\ndistinctions and effectively preserve the hierarchical structure of\nhigh-dimensional skeletons, employing joint hierarchy as the masking criterion.\nIn temporal flows, we substitute traditional distance metrics with the global\nattention of joints for masking, addressing the convergence of distances in\nhigh-dimensional space and the lack of a global perspective. Additionally, we\nincorporate cross-contrast loss based on the cross-masking framework into the\nloss function to enhance the model's learning of instance-level features. HA-CM\nshows efficiency and universality on three public large-scale datasets, NTU-60,\nNTU-120, and PKU-MMD. The source code of our HA-CM is available at\nhttps://github.com/YinxPeng/HA-CM-main.\n","authors":["Xinpeng Yin","Wenming Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17951v1.pdf","comment":"12 pages,6 figures,IEEE Trans"},{"id":"http://arxiv.org/abs/2409.17941v1","updated":"2024-09-26T15:16:32Z","published":"2024-09-26T15:16:32Z","title":"Perturb, Attend, Detect and Localize (PADL): Robust Proactive Image\n Defense","summary":" Image manipulation detection and localization have received considerable\nattention from the research community given the blooming of Generative Models\n(GMs). Detection methods that follow a passive approach may overfit to specific\nGMs, limiting their application in real-world scenarios, due to the growing\ndiversity of generative models. Recently, approaches based on a proactive\nframework have shown the possibility of dealing with this limitation. However,\nthese methods suffer from two main limitations, which raises concerns about\npotential vulnerabilities: i) the manipulation detector is not robust to noise\nand hence can be easily fooled; ii) the fact that they rely on fixed\nperturbations for image protection offers a predictable exploit for malicious\nattackers, enabling them to reverse-engineer and evade detection. To overcome\nthis issue we propose PADL, a new solution able to generate image-specific\nperturbations using a symmetric scheme of encoding and decoding based on\ncross-attention, which drastically reduces the possibility of reverse\nengineering, even when evaluated with adaptive attack [31]. Additionally, PADL\nis able to pinpoint manipulated areas, facilitating the identification of\nspecific regions that have undergone alterations, and has more generalization\npower than prior art on held-out generative models. Indeed, although being\ntrained only on an attribute manipulation GAN model [15], our method\ngeneralizes to a range of unseen models with diverse architectural designs,\nsuch as StarGANv2, BlendGAN, DiffAE, StableDiffusion and StableDiffusionXL.\nAdditionally, we introduce a novel evaluation protocol, which offers a fair\nevaluation of localisation performance in function of detection accuracy and\nbetter captures real-world scenarios.\n","authors":["Filippo Bartolucci","Iacopo Masi","Giuseppe Lisanti"],"pdf_url":"https://arxiv.org/pdf/2409.17941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15268v4","updated":"2024-09-26T15:10:58Z","published":"2023-12-23T14:36:27Z","title":"Manydepth2: Motion-Aware Self-Supervised Monocular Depth Estimation in\n Dynamic Scenes","summary":" Despite advancements in self-supervised monocular depth estimation,\nchallenges persist in dynamic scenarios due to the dependence on assumptions\nabout a static world. In this paper, we present Manydepth2, a Motion-Guided\nCost Volume Depth Net, to achieve precise depth estimation for both dynamic\nobjects and static backgrounds, all while maintaining computational efficiency.\nTo tackle the challenges posed by dynamic content, we incorporate optical flow\nand coarse monocular depth to create a novel static reference frame. This frame\nis then utilized to build a motion-guided cost volume in collaboration with the\ntarget frame. Additionally, to enhance the accuracy and resilience of the\nnetwork structure, we introduce an attention-based depth net architecture to\neffectively integrate information from feature maps with varying resolutions.\nCompared to methods with similar computational costs, Manydepth2 achieves a\nsignificant reduction of approximately five percent in root-mean-square error\nfor self-supervised monocular depth estimation on the KITTI-2015 dataset. The\ncode could be found: https://github.com/kaichen-z/Manydepth2\n","authors":["Kaichen Zhou","Jia-Wang Bian","Qian Xie","Jian-Qing Zheng","Niki Trigoni","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2312.15268v4.pdf","comment":"Monocular Depth Estimation, Self-Supervised, Optical Flow"},{"id":"http://arxiv.org/abs/2409.07714v2","updated":"2024-09-26T15:05:43Z","published":"2024-09-12T02:50:04Z","title":"CollaMamba: Efficient Collaborative Perception with Cross-Agent\n Spatial-Temporal State Space Model","summary":" By sharing complementary perceptual information, multi-agent collaborative\nperception fosters a deeper understanding of the environment. Recent studies on\ncollaborative perception mostly utilize CNNs or Transformers to learn feature\nrepresentation and fusion in the spatial dimension, which struggle to handle\nlong-range spatial-temporal features under limited computing and communication\nresources. Holistically modeling the dependencies over extensive spatial areas\nand extended temporal frames is crucial to enhancing feature quality. To this\nend, we propose a resource efficient cross-agent spatial-temporal collaborative\nstate space model (SSM), named CollaMamba. Initially, we construct a\nfoundational backbone network based on spatial SSM. This backbone adeptly\ncaptures positional causal dependencies from both single-agent and cross-agent\nviews, yielding compact and comprehensive intermediate features while\nmaintaining linear complexity. Furthermore, we devise a history-aware feature\nboosting module based on temporal SSM, extracting contextual cues from extended\nhistorical frames to refine vague features while preserving low overhead.\nExtensive experiments across several datasets demonstrate that CollaMamba\noutperforms state-of-the-art methods, achieving higher model accuracy while\nreducing computational and communication overhead by up to 71.9% and 1/64,\nrespectively. This work pioneers the exploration of the Mamba's potential in\ncollaborative perception. The source code will be made available.\n","authors":["Yang Li","Quan Yuan","Guiyang Luo","Xiaoyuan Fu","Xuanhan Zhu","Yujia Yang","Rui Pan","Jinglin Li"],"pdf_url":"https://arxiv.org/pdf/2409.07714v2.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17924v1","updated":"2024-09-26T15:05:29Z","published":"2024-09-26T15:05:29Z","title":"Neural Light Spheres for Implicit Image Stitching and View Synthesis","summary":" Challenging to capture, and challenging to display on a cellphone screen, the\npanorama paradoxically remains both a staple and underused feature of modern\nmobile camera applications. In this work we address both of these challenges\nwith a spherical neural light field model for implicit panoramic image\nstitching and re-rendering; able to accommodate for depth parallax,\nview-dependent lighting, and local scene motion and color changes during\ncapture. Fit during test-time to an arbitrary path panoramic video capture --\nvertical, horizontal, random-walk -- these neural light spheres jointly\nestimate the camera path and a high-resolution scene reconstruction to produce\nnovel wide field-of-view projections of the environment. Our single-layer model\navoids expensive volumetric sampling, and decomposes the scene into compact\nview-dependent ray offset and color components, with a total model size of 80\nMB per scene, and real-time (50 FPS) rendering at 1080p resolution. We\ndemonstrate improved reconstruction quality over traditional image stitching\nand radiance field methods, with significantly higher tolerance to scene motion\nand non-ideal capture settings.\n","authors":["Ilya Chugunov","Amogh Joshi","Kiran Murthy","Francois Bleibel","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2409.17924v1.pdf","comment":"Project site: https://light.princeton.edu/publication/neuls/"},{"id":"http://arxiv.org/abs/2409.17920v1","updated":"2024-09-26T15:04:13Z","published":"2024-09-26T15:04:13Z","title":"Resolving Multi-Condition Confusion for Finetuning-Free Personalized\n Image Generation","summary":" Personalized text-to-image generation methods can generate customized images\nbased on the reference images, which have garnered wide research interest.\nRecent methods propose a finetuning-free approach with a decoupled\ncross-attention mechanism to generate personalized images requiring no\ntest-time finetuning. However, when multiple reference images are provided, the\ncurrent decoupled cross-attention mechanism encounters the object confusion\nproblem and fails to map each reference image to its corresponding object,\nthereby seriously limiting its scope of application. To address the object\nconfusion problem, in this work we investigate the relevance of different\npositions of the latent image features to the target object in diffusion model,\nand accordingly propose a weighted-merge method to merge multiple reference\nimage features into the corresponding objects. Next, we integrate this\nweighted-merge method into existing pre-trained models and continue to train\nthe model on a multi-object dataset constructed from the open-sourced SA-1B\ndataset. To mitigate object confusion and reduce training costs, we propose an\nobject quality score to estimate the image quality for the selection of\nhigh-quality training samples. Furthermore, our weighted-merge training\nframework can be employed on single-object generation when a single object has\nmultiple reference images. The experiments verify that our method achieves\nsuperior performance to the state-of-the-arts on the Concept101 dataset and\nDreamBooth dataset of multi-object personalized image generation, and\nremarkably improves the performance on single-object personalized image\ngeneration. Our code is available at https://github.com/hqhQAQ/MIP-Adapter.\n","authors":["Qihan Huang","Siming Fu","Jinlong Liu","Hao Jiang","Yipeng Yu","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2409.17920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17917v1","updated":"2024-09-26T15:02:50Z","published":"2024-09-26T15:02:50Z","title":"WaSt-3D: Wasserstein-2 Distance for Scene-to-Scene Stylization on 3D\n Gaussians","summary":" While style transfer techniques have been well-developed for 2D image\nstylization, the extension of these methods to 3D scenes remains relatively\nunexplored. Existing approaches demonstrate proficiency in transferring colors\nand textures but often struggle with replicating the geometry of the scenes. In\nour work, we leverage an explicit Gaussian Splatting (GS) representation and\ndirectly match the distributions of Gaussians between style and content scenes\nusing the Earth Mover's Distance (EMD). By employing the entropy-regularized\nWasserstein-2 distance, we ensure that the transformation maintains spatial\nsmoothness. Additionally, we decompose the scene stylization problem into\nsmaller chunks to enhance efficiency. This paradigm shift reframes stylization\nfrom a pure generative process driven by latent space losses to an explicit\nmatching of distributions between two Gaussian representations. Our method\nachieves high-resolution 3D stylization by faithfully transferring details from\n3D style scenes onto the content scene. Furthermore, WaSt-3D consistently\ndelivers results across diverse content and style scenes without necessitating\nany training, as it relies solely on optimization-based techniques. See our\nproject page for additional results and source code:\n$\\href{https://compvis.github.io/wast3d/}{https://compvis.github.io/wast3d/}$.\n","authors":["Dmytro Kotovenko","Olga Grebenkova","Nikolaos Sarafianos","Avinash Paliwal","Pingchuan Ma","Omid Poursaeed","Sreyas Mohan","Yuchen Fan","Yilei Li","Rakesh Ranjan","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2409.17917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01895v2","updated":"2024-09-26T14:57:13Z","published":"2024-08-04T01:34:22Z","title":"Computational Trichromacy Reconstruction: Empowering the Color-Vision\n Deficient to Recognize Colors Using Augmented Reality","summary":" We propose an assistive technology that helps individuals with Color Vision\nDeficiencies (CVD) to recognize/name colors. A dichromat's color perception is\na reduced two-dimensional (2D) subset of a normal trichromat's three\ndimensional color (3D) perception, leading to confusion when visual stimuli\nthat appear identical to the dichromat are referred to by different color\nnames. Using our proposed system, CVD individuals can interactively induce\ndistinct perceptual changes to originally confusing colors via a computational\ncolor space transformation. By combining their original 2D precepts for colors\nwith the discriminative changes, a three dimensional color space is\nreconstructed, where the dichromat can learn to resolve color name confusions\nand accurately recognize colors. Our system is implemented as an Augmented\nReality (AR) interface on smartphones, where users interactively control the\nrotation through swipe gestures and observe the induced color shifts in the\ncamera view or in a displayed image. Through psychophysical experiments and a\nlongitudinal user study, we demonstrate that such rotational color shifts have\ndiscriminative power (initially confusing colors become distinct under\nrotation) and exhibit structured perceptual shifts dichromats can learn with\nmodest training. The AR App is also evaluated in two real-world scenarios\n(building with lego blocks and interpreting artistic works); users all report\npositive experience in using the App to recognize object colors that they\notherwise could not.\n","authors":["Yuhao Zhu","Ethan Chen","Colin Hascup","Yukang Yan","Gaurav Sharma"],"pdf_url":"https://arxiv.org/pdf/2408.01895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17908v1","updated":"2024-09-26T14:52:55Z","published":"2024-09-26T14:52:55Z","title":"LKA-ReID:Vehicle Re-Identification with Large Kernel Attention","summary":" With the rapid development of intelligent transportation systems and the\npopularity of smart city infrastructure, Vehicle Re-ID technology has become an\nimportant research field. The vehicle Re-ID task faces an important challenge,\nwhich is the high similarity between different vehicles. Existing methods use\nadditional detection or segmentation models to extract differentiated local\nfeatures. However, these methods either rely on additional annotations or\ngreatly increase the computational cost. Using attention mechanism to capture\nglobal and local features is crucial to solve the challenge of high similarity\nbetween classes in vehicle Re-ID tasks. In this paper, we propose LKA-ReID with\nlarge kernel attention. Specifically, the large kernel attention (LKA) utilizes\nthe advantages of self-attention and also benefits from the advantages of\nconvolution, which can extract the global and local features of the vehicle\nmore comprehensively. We also introduce hybrid channel attention (HCA) combines\nchannel attention with spatial information, so that the model can better focus\non channels and feature regions, and ignore background and other disturbing\ninformation. Experiments on VeRi-776 dataset demonstrated the effectiveness of\nLKA-ReID, with mAP reaches 86.65% and Rank-1 reaches 98.03%.\n","authors":["Xuezhi Xiang","Zhushan Ma","Lei Zhang","Denis Ombati","Himaloy Himu","Xiantong Zhen"],"pdf_url":"https://arxiv.org/pdf/2409.17908v1.pdf","comment":"The paper is under consideration at 2025 IEEE International\n Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2409.17895v1","updated":"2024-09-26T14:44:41Z","published":"2024-09-26T14:44:41Z","title":"Self-supervised Monocular Depth Estimation with Large Kernel Attention","summary":" Self-supervised monocular depth estimation has emerged as a promising\napproach since it does not rely on labeled training data. Most methods combine\nconvolution and Transformer to model long-distance dependencies to estimate\ndepth accurately. However, Transformer treats 2D image features as 1D\nsequences, and positional encoding somewhat mitigates the loss of spatial\ninformation between different feature blocks, tending to overlook channel\nfeatures, which limit the performance of depth estimation. In this paper, we\npropose a self-supervised monocular depth estimation network to get finer\ndetails. Specifically, we propose a decoder based on large kernel attention,\nwhich can model long-distance dependencies without compromising the\ntwo-dimension structure of features while maintaining feature channel\nadaptivity. In addition, we introduce a up-sampling module to accurately\nrecover the fine details in the depth map. Our method achieves competitive\nresults on the KITTI dataset.\n","authors":["Xuezhi Xiang","Yao Wang","Lei Zhang","Denis Ombati","Himaloy Himu","Xiantong Zhen"],"pdf_url":"https://arxiv.org/pdf/2409.17895v1.pdf","comment":"The paper is under consideration at 2025 IEEE International\n Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2409.17886v1","updated":"2024-09-26T14:35:06Z","published":"2024-09-26T14:35:06Z","title":"Upper-Body Pose-based Gaze Estimation for Privacy-Preserving 3D Gaze\n Target Detection","summary":" Gaze Target Detection (GTD), i.e., determining where a person is looking\nwithin a scene from an external viewpoint, is a challenging task, particularly\nin 3D space. Existing approaches heavily rely on analyzing the person's\nappearance, primarily focusing on their face to predict the gaze target. This\npaper presents a novel approach to tackle this problem by utilizing the\nperson's upper-body pose and available depth maps to extract a 3D gaze\ndirection and employing a multi-stage or an end-to-end pipeline to predict the\ngazed target. When predicted accurately, the human body pose can provide\nvaluable information about the head pose, which is a good approximation of the\ngaze direction, as well as the position of the arms and hands, which are linked\nto the activity the person is performing and the objects they are likely\nfocusing on. Consequently, in addition to performing gaze estimation in 3D, we\nare also able to perform GTD simultaneously. We demonstrate state-of-the-art\nresults on the most comprehensive publicly accessible 3D gaze target detection\ndataset without requiring images of the person's face, thus promoting privacy\npreservation in various application contexts. The code is available at\nhttps://github.com/intelligolabs/privacy-gtd-3D.\n","authors":["Andrea Toaiari","Vittorio Murino","Marco Cristani","Cigdem Beyan"],"pdf_url":"https://arxiv.org/pdf/2409.17886v1.pdf","comment":"Accepted in the T-CAP workshop at ECCV 2024"},{"id":"http://arxiv.org/abs/2312.04564v3","updated":"2024-09-26T14:33:24Z","published":"2023-12-07T18:59:55Z","title":"EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS","summary":" Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view\nscene synthesis. It addresses the challenges of lengthy training times and slow\nrendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid,\ndifferentiable rasterization of 3D Gaussians, 3D-GS achieves real-time\nrendering and accelerated training. They, however, demand substantial memory\nresources for both training and storage, as they require millions of Gaussians\nin their point cloud representation for each scene. We present a technique\nutilizing quantized embeddings to significantly reduce per-point memory storage\nrequirements and a coarse-to-fine training strategy for a faster and more\nstable optimization of the Gaussian point clouds. Our approach develops a\npruning stage which results in scene representations with fewer Gaussians,\nleading to faster training times and rendering speeds for real-time rendering\nof high resolution scenes. We reduce storage memory by more than an order of\nmagnitude all while preserving the reconstruction quality. We validate the\neffectiveness of our approach on a variety of datasets and scenes preserving\nthe visual quality while consuming 10-20x lesser memory and faster\ntraining/inference speed. Project page and code is available\nhttps://efficientgaussian.github.io\n","authors":["Sharath Girish","Kamal Gupta","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.04564v3.pdf","comment":"Website: https://efficientgaussian.github.io Code:\n https://github.com/Sharath-girish/efficientgaussian"},{"id":"http://arxiv.org/abs/2409.17880v1","updated":"2024-09-26T14:27:55Z","published":"2024-09-26T14:27:55Z","title":"Self-Distilled Depth Refinement with Noisy Poisson Fusion","summary":" Depth refinement aims to infer high-resolution depth with fine-grained edges\nand details, refining low-resolution results of depth estimation models. The\nprevailing methods adopt tile-based manners by merging numerous patches, which\nlacks efficiency and produces inconsistency. Besides, prior arts suffer from\nfuzzy depth boundaries and limited generalizability. Analyzing the fundamental\nreasons for these limitations, we model depth refinement as a noisy Poisson\nfusion problem with local inconsistency and edge deformation noises. We propose\nthe Self-distilled Depth Refinement (SDDR) framework to enforce robustness\nagainst the noises, which mainly consists of depth edge representation and\nedge-based guidance. With noisy depth predictions as input, SDDR generates\nlow-noise depth edge representations as pseudo-labels by coarse-to-fine\nself-distillation. Edge-based guidance with edge-guided gradient loss and\nedge-based fusion loss serves as the optimization objective equivalent to\nPoisson fusion. When depth maps are better refined, the labels also become more\nnoise-free. Our model can acquire strong robustness to the noises, achieving\nsignificant improvements in accuracy, edge quality, efficiency, and\ngeneralizability on five different benchmarks. Moreover, directly training\nanother model with edge labels produced by SDDR brings improvements, suggesting\nthat our method could help with training robust refinement models in future\nworks.\n","authors":["Jiaqi Li","Yiran Wang","Jinghong Zheng","Zihao Huang","Ke Xian","Zhiguo Cao","Jianming Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17880v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.09946v2","updated":"2024-09-26T14:27:23Z","published":"2024-07-13T17:03:16Z","title":"Low-Rank Interconnected Adaptation across Layers","summary":" Low-rank adaptation (LoRA) is a powerful parameter-efficient fine-tuning\nmethod that utilizes low-rank projectors $A$ and $B$ to learn weight updates\n$\\Delta W$ for adaptation targets $W$. Previous research has shown that LoRA is\nessentially a gradient compressor, performing random projections on the\ngradient using a fixed projection matrix $A_0$. However, this setup restricts\nthe overall weight update to be low-rank, which limits the adaptation\nperformance. In this paper, we propose low-rank interconnected adaptation\nacross layers (Lily). Specifically, we employ a hierarchical framework where\nlow-dimensional projectors (LPs) retained for downward projection at a\nparticular level, while globally-shared high-dimensional projector (HP) experts\nperform upward projection across all levels of layers. Lily uniquely connects\neach LP to all HP experts, therefore the gradient projections are no longer\ndominated by fixed projection matrices, but rather by selective combinations of\nall the projectors, thereby breaking the low-rank constraint of LoRA.\nFurthermore, Lily's cross-layer connections facilitate the capture of intricate\ninformation and dependencies across different layers, thereby enhancing the\nmodel's representational capabilities. Experiments across various modalities,\narchitectures, and model sizes underscore Lily's great performance and\nefficiency. Code is available on github https://github.com/yibozhong/lily.\n","authors":["Yibo Zhong","Yao Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.09946v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2409.17854v1","updated":"2024-09-26T14:00:00Z","published":"2024-09-26T14:00:00Z","title":"Visualization of Age Distributions as Elements of Medical Data-Stories","summary":" In various fields, including medicine, age distributions are crucial. Despite\nwidespread media coverage of health topics, there remains a need to enhance\nhealth communication. Narrative medical visualization is promising for\nimproving information comprehension and retention. This study explores the most\neffective ways to present age distributions of diseases through narrative\nvisualizations. We conducted a thorough analysis of existing visualizations,\nheld workshops with a broad audience, and reviewed relevant literature. From\nthis, we identified design choices focusing on comprehension, aesthetics,\nengagement, and memorability. We specifically tested three pictogram variants:\npictograms as bars, stacked pictograms, and annotations. After evaluating 18\nvisualizations with 72 participants and three expert reviews, we determined\nthat annotations were most effective for comprehension and aesthetics. However,\ntraditional bar charts were preferred for engagement, and other variants were\nmore memorable. The study provides a set of design recommendations based on\nthese insights.\n","authors":["Sophia Dowlatabadi","Bernhard Preim","Monique Meuschke"],"pdf_url":"https://arxiv.org/pdf/2409.17854v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17851v1","updated":"2024-09-26T13:57:05Z","published":"2024-09-26T13:57:05Z","title":"A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts","summary":" Monocular depth estimation is a critical task for autonomous driving and many\nother computer vision applications. While significant progress has been made in\nthis field, the effects of viewpoint shifts on depth estimation models remain\nlargely underexplored. This paper introduces a novel dataset and evaluation\nmethodology to quantify the impact of different camera positions and\norientations on monocular depth estimation performance. We propose a ground\ntruth strategy based on homography estimation and object detection, eliminating\nthe need for expensive lidar sensors. We collect a diverse dataset of road\nscenes from multiple viewpoints and use it to assess the robustness of a modern\ndepth estimation model to geometric shifts. After assessing the validity of our\nstrategy on a public dataset, we provide valuable insights into the limitations\nof current models and highlight the importance of considering viewpoint\nvariations in real-world applications.\n","authors":["Aurel Pjetri","Stefano Caprasecca","Leonardo Taccari","Matteo Simoncini","Henrique Piñeiro Monteagudo","Walter Wallace","Douglas Coimbra de Andrade","Francesco Sambo","Andrew David Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2409.17851v1.pdf","comment":"17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on\n Vision-Centric Autonomous Driving (VCAD)"},{"id":"http://arxiv.org/abs/2404.04693v2","updated":"2024-09-26T13:53:33Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A Colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, fusing data from these two types\nof sensors is poorly performed in many existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v2.pdf","comment":"2024 IEEE International Conference on Robotics and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2403.10542v2","updated":"2024-09-26T13:38:48Z","published":"2024-03-08T23:04:14Z","title":"SF-MMCN: Low-Power Sever Flow Multi-Mode Diffusion Model Accelerator","summary":" Generative Artificial Intelligence (AI) has become incredibly popular in\nrecent years, and the significance of traditional accelerators in dealing with\nlarge-scale parameters is urgent. With the diffusion model's parallel\nstructure, the hardware design challenge has skyrocketed because of the\nmultiple layers operating simultaneously. Convolution Neural Network (CNN)\naccelerators have been designed and developed rapidly, especially for\nhigh-speed inference. Often, CNN models with parallel structures are deployed.\nIn these CNN accelerators, many Processing Elements (PE) are required to\nperform parallel computations, mainly the multiply and accumulation (MAC)\noperation, resulting in high power consumption and a large silicon area. In\nthis work, a Server Flow Multi-Mode CNN Unit (SF-MMCN) is proposed to reduce\nthe number of PE while improving the operation efficiency of the CNN\naccelerator. The pipelining technique is introduced into Server Flow to process\nparallel computations. The proposed SF-MMCN is implemented with TSMC 90-nm CMOS\ntechnology. It is evaluated with VGG-16, ResNet-18, and U-net. The evaluation\nresults show that the proposed SF-MMCN can reduce the power consumption by 92%,\nand the silicon area by 70%, while improving the efficiency of operation by\nnearly 81 times. A new FoM, area efficiency (GOPs/mm^2) is also introduced to\nevaluate the performance of the accelerator in terms of the ratio throughput\n(GOPs) and silicon area (mm^2). In this FoM, SF-MMCN improves area efficiency\nby 18 times (18.42).\n","authors":["Huan-Ke Hsu","I-Chyn Wey","T. Hui Teo"],"pdf_url":"https://arxiv.org/pdf/2403.10542v2.pdf","comment":"16 pages, 16 figures; extend the CNN to process Diffusion Model\n (possible this is the first reported hardware Diffusion Model implementation)"},{"id":"http://arxiv.org/abs/2407.17380v2","updated":"2024-09-26T13:37:04Z","published":"2024-07-24T16:04:18Z","title":"2D and 3D Deep Learning Models for MRI-based Parkinson's Disease\n Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold\n Networks, Convolutional Neural Networks, and Graph Convolutional Networks","summary":" Parkinson's Disease (PD) diagnosis remains challenging. This study applies\nConvolutional Kolmogorov-Arnold Networks (ConvKANs), integrating learnable\nspline-based activation functions into convolutional layers, for PD\nclassification using structural MRI. The first 3D implementation of ConvKANs\nfor medical imaging is presented, comparing their performance to Convolutional\nNeural Networks (CNNs) and Graph Convolutional Networks (GCNs) across three\nopen-source datasets. Isolated analyses assessed performance within individual\ndatasets, using cross-validation techniques. Holdout analyses evaluated\ncross-dataset generalizability by training models on two datasets and testing\non the third, mirroring real-world clinical scenarios. In isolated analyses, 2D\nConvKANs achieved the highest AUC of 0.99 (95% CI: 0.98-0.99) on the PPMI\ndataset, outperforming 2D CNNs (AUC: 0.97, p = 0.0092). 3D models showed\npromise, with 3D CNN and 3D ConvKAN reaching an AUC of 0.85 on PPMI. In holdout\nanalyses, 3D ConvKAN demonstrated superior generalization, achieving an AUC of\n0.85 on early-stage PD data. GCNs underperformed in 2D but improved in 3D\nimplementations. These findings highlight ConvKANs' potential for PD detection,\nemphasize the importance of 3D analysis in capturing subtle brain changes, and\nunderscore cross-dataset generalization challenges. This study advances\nAI-assisted PD diagnosis using structural MRI and emphasizes the need for\nlarger-scale validation.\n","authors":["Salil B Patel","Vicky Goh","James F FitzGerald","Chrystalina A Antoniades"],"pdf_url":"https://arxiv.org/pdf/2407.17380v2.pdf","comment":"7 figures"},{"id":"http://arxiv.org/abs/2406.04769v2","updated":"2024-09-26T13:31:40Z","published":"2024-06-07T09:15:29Z","title":"Diffusion-based Generative Image Outpainting for Recovery of\n FOV-Truncated CT Images","summary":" Field-of-view (FOV) recovery of truncated chest CT scans is crucial for\naccurate body composition analysis, which involves quantifying skeletal muscle\nand subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables\ndisease prognostication. Here, we present a method for recovering truncated CT\nslices using generative image outpainting. We train a diffusion model and apply\nit to truncated CT slices generated by simulating a small FOV. Our model\nreliably recovers the truncated anatomy and outperforms the previous\nstate-of-the-art despite being trained on 87% less data.\n","authors":["Michelle Espranita Liman","Daniel Rueckert","Florian J. Fintelmann","Philip Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04769v2.pdf","comment":"Shared last authorship: Florian J. Fintelmann and Philip M\\\"uller"},{"id":"http://arxiv.org/abs/2409.17830v1","updated":"2024-09-26T13:29:40Z","published":"2024-09-26T13:29:40Z","title":"Unsupervised Learning Based Multi-Scale Exposure Fusion","summary":" Unsupervised learning based multi-scale exposure fusion (ULMEF) is efficient\nfor fusing differently exposed low dynamic range (LDR) images into a higher\nquality LDR image for a high dynamic range (HDR) scene. Unlike supervised\nlearning, loss functions play a crucial role in the ULMEF. In this paper, novel\nloss functions are proposed for the ULMEF and they are defined by using all the\nimages to be fused and other differently exposed images from the same HDR\nscene. The proposed loss functions can guide the proposed ULMEF to learn more\nreliable information from the HDR scene than existing loss functions which are\ndefined by only using the set of images to be fused. As such, the quality of\nthe fused image is significantly improved. The proposed ULMEF also adopts a\nmulti-scale strategy that includes a multi-scale attention module to\neffectively preserve the scene depth and local contrast in the fused image.\nMeanwhile, the proposed ULMEF can be adopted to achieve exposure interpolation\nand exposure extrapolation. Extensive experiments show that the proposed ULMEF\nalgorithm outperforms state-of-the-art exposure fusion algorithms.\n","authors":["Chaobing Zheng","Shiqian Wu","Zhenggguo Li"],"pdf_url":"https://arxiv.org/pdf/2409.17830v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.17823v1","updated":"2024-09-26T13:21:02Z","published":"2024-09-26T13:21:02Z","title":"Kendall's $τ$ Coefficient for Logits Distillation","summary":" Knowledge distillation typically employs the Kullback-Leibler (KL) divergence\nto constrain the student model's output to match the soft labels provided by\nthe teacher model exactly. However, sometimes the optimization direction of the\nKL divergence loss is not always aligned with the task loss, where a smaller KL\ndivergence could lead to erroneous predictions that diverge from the soft\nlabels. This limitation often results in suboptimal optimization for the\nstudent. Moreover, even under temperature scaling, the KL divergence loss\nfunction tends to overly focus on the larger-valued channels in the logits,\ndisregarding the rich inter-class information provided by the multitude of\nsmaller-valued channels. This hard constraint proves too challenging for\nlightweight students, hindering further knowledge distillation. To address this\nissue, we propose a plug-and-play ranking loss based on Kendall's $\\tau$\ncoefficient, called Rank-Kendall Knowledge Distillation (RKKD). RKKD balances\nthe attention to smaller-valued channels by constraining the order of channel\nvalues in student logits, providing more inter-class relational information.\nThe rank constraint on the top-valued channels helps avoid suboptimal traps\nduring optimization. We also discuss different differentiable forms of\nKendall's $\\tau$ coefficient and demonstrate that the proposed ranking loss\nfunction shares a consistent optimization objective with the KL divergence.\nExtensive experiments on the CIFAR-100 and ImageNet datasets show that our RKKD\ncan enhance the performance of various knowledge distillation baselines and\noffer broad improvements across multiple teacher-student architecture\ncombinations.\n","authors":["Yuchen Guan","Runxi Cheng","Kang Liu","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2409.17823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16042v2","updated":"2024-09-26T13:18:24Z","published":"2024-09-24T12:44:27Z","title":"Enhanced Unsupervised Image-to-Image Translation Using Contrastive\n Learning and Histogram of Oriented Gradients","summary":" Image-to-Image Translation is a vital area of computer vision that focuses on\ntransforming images from one visual domain to another while preserving their\ncore content and structure. However, this field faces two major challenges:\nfirst, the data from the two domains are often unpaired, making it difficult to\ntrain generative adversarial networks effectively; second, existing methods\ntend to produce artifacts or hallucinations during image generation, leading to\na decline in image quality. To address these issues, this paper proposes an\nenhanced unsupervised image-to-image translation method based on the\nContrastive Unpaired Translation (CUT) model, incorporating Histogram of\nOriented Gradients (HOG) features. This novel approach ensures the preservation\nof the semantic structure of images, even without semantic labels, by\nminimizing the loss between the HOG features of input and generated images. The\nmethod was tested on translating synthetic game environments from GTA5 dataset\nto realistic urban scenes in cityscapes dataset, demonstrating significant\nimprovements in reducing hallucinations and enhancing image quality.\n","authors":["Wanchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.16042v2.pdf","comment":"Critical Errors in Data or Analysis"},{"id":"http://arxiv.org/abs/2409.17805v1","updated":"2024-09-26T12:58:01Z","published":"2024-09-26T12:58:01Z","title":"Cascade Prompt Learning for Vision-Language Model Adaptation","summary":" Prompt learning has surfaced as an effective approach to enhance the\nperformance of Vision-Language Models (VLMs) like CLIP when applied to\ndownstream tasks. However, current learnable prompt tokens are primarily used\nfor the single phase of adapting to tasks (i.e., adapting prompt), easily\nleading to overfitting risks. In this work, we propose a novel Cascade Prompt\nLearning CasPL framework to enable prompt learning to serve both generic and\nspecific expertise (i.e., boosting and adapting prompt) simultaneously.\nSpecifically, CasPL is a new learning paradigm comprising two distinct phases\nof learnable prompts: the first boosting prompt is crafted to extract\ndomain-general knowledge from a senior larger CLIP teacher model by aligning\ntheir predicted logits using extensive unlabeled domain images. The second\nadapting prompt is then cascaded with the frozen first set to fine-tune the\ndownstream tasks, following the approaches employed in prior research. In this\nmanner, CasPL can effectively capture both domain-general and task-specific\nrepresentations into explicitly different gradual groups of prompts, thus\npotentially alleviating overfitting issues in the target domain. It's worth\nnoting that CasPL serves as a plug-and-play module that can seamlessly\nintegrate into any existing prompt learning approach. CasPL achieves a\nsignificantly better balance between performance and inference speed, which is\nespecially beneficial for deploying smaller VLM models in resource-constrained\nenvironments. Compared to the previous state-of-the-art method PromptSRC, CasPL\nshows an average improvement of 1.85% for base classes, 3.44% for novel\nclasses, and 2.72% for the harmonic mean over 11 image classification datasets.\nCode is publicly available at: https://github.com/megvii-research/CasPL.\n","authors":["Ge Wu","Xin Zhang","Zheng Li","Zhaowei Chen","Jiajun Liang","Jian Yang","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17805v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17792v1","updated":"2024-09-26T12:37:50Z","published":"2024-09-26T12:37:50Z","title":"Reblurring-Guided Single Image Defocus Deblurring: A Learning Framework\n with Misaligned Training Pairs","summary":" For single image defocus deblurring, acquiring well-aligned training pairs\n(or training triplets), i.e., a defocus blurry image, an all-in-focus sharp\nimage (and a defocus blur map), is an intricate task for the development of\ndeblurring models. Existing image defocus deblurring methods typically rely on\ntraining data collected by specialized imaging equipment, presupposing that\nthese pairs or triplets are perfectly aligned. However, in practical scenarios\ninvolving the collection of real-world data, direct acquisition of training\ntriplets is infeasible, and training pairs inevitably encounter spatial\nmisalignment issues. In this work, we introduce a reblurring-guided learning\nframework for single image defocus deblurring, enabling the learning of a\ndeblurring network even with misaligned training pairs. Specifically, we first\npropose a baseline defocus deblurring network that utilizes spatially varying\ndefocus blur map as degradation prior to enhance the deblurring performance.\nThen, to effectively learn the baseline defocus deblurring network with\nmisaligned training pairs, our reblurring module ensures spatial consistency\nbetween the deblurred image, the reblurred image and the input blurry image by\nreconstructing spatially variant isotropic blur kernels. Moreover, the\nspatially variant blur derived from the reblurring module can serve as pseudo\nsupervision for defocus blur map during training, interestingly transforming\ntraining pairs into training triplets. Additionally, we have collected a new\ndataset specifically for single image defocus deblurring (SDD) with typical\nmisalignments, which not only substantiates our proposed method but also serves\nas a benchmark for future research.\n","authors":["Xinya Shu","Yu Li","Dongwei Ren","Xiaohe Wu","Jin Li","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2409.17792v1.pdf","comment":"The source code and dataset are available at\n https://github.com/ssscrystal/Reblurring-guided-JDRL"},{"id":"http://arxiv.org/abs/2409.17790v1","updated":"2024-09-26T12:37:22Z","published":"2024-09-26T12:37:22Z","title":"CASPFormer: Trajectory Prediction from BEV Images with Deformable\n Attention","summary":" Motion prediction is an important aspect for Autonomous Driving (AD) and\nAdvance Driver Assistance Systems (ADAS). Current state-of-the-art motion\nprediction methods rely on High Definition (HD) maps for capturing the\nsurrounding context of the ego vehicle. Such systems lack scalability in\nreal-world deployment as HD maps are expensive to produce and update in\nreal-time. To overcome this issue, we propose Context Aware Scene Prediction\nTransformer (CASPFormer), which can perform multi-modal motion prediction from\nrasterized Bird-Eye-View (BEV) images. Our system can be integrated with any\nupstream perception module that is capable of generating BEV images. Moreover,\nCASPFormer directly decodes vectorized trajectories without any postprocessing.\nTrajectories are decoded recurrently using deformable attention, as it is\ncomputationally efficient and provides the network with the ability to focus\nits attention on the important spatial locations of the BEV images. In\naddition, we also address the issue of mode collapse for generating multiple\nscene-consistent trajectories by incorporating learnable mode queries. We\nevaluate our model on the nuScenes dataset and show that it reaches\nstate-of-the-art across multiple metrics\n","authors":["Harsh Yadav","Maximilian Schaefer","Kun Zhao","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.17790v1.pdf","comment":"Under Review at ICPR 2024, Kolkata"},{"id":"http://arxiv.org/abs/2405.07865v4","updated":"2024-09-26T12:18:49Z","published":"2024-05-13T15:53:18Z","title":"AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous\n Driving","summary":" The scale-up of autonomous vehicles depends heavily on their ability to deal\nwith anomalies, such as rare objects on the road. In order to handle such\nsituations, it is necessary to detect anomalies in the first place. Anomaly\ndetection for autonomous driving has made great progress in the past years but\nsuffers from poorly designed benchmarks with a strong focus on camera data. In\nthis work, we propose AnoVox, the largest benchmark for ANOmaly detection in\nautonomous driving to date. AnoVox incorporates large-scale multimodal sensor\ndata and spatial VOXel ground truth, allowing for the comparison of methods\nindependent of their used sensor. We propose a formal definition of normality\nand provide a compliant training dataset. AnoVox is the first benchmark to\ncontain both content and temporal anomalies.\n","authors":["Daniel Bogdoll","Iramm Hamdard","Lukas Namgyu Rößler","Felix Geisler","Muhammed Bayram","Felix Wang","Jan Imhof","Miguel de Campos","Anushervon Tabarov","Yitian Yang","Hanno Gottschalk","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2405.07865v4.pdf","comment":"Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\\\"o{\\ss}ler\n contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop"},{"id":"http://arxiv.org/abs/2409.17778v1","updated":"2024-09-26T12:16:11Z","published":"2024-09-26T12:16:11Z","title":"Taming Diffusion Prior for Image Super-Resolution with Domain Shift SDEs","summary":" Diffusion-based image super-resolution (SR) models have attracted substantial\ninterest due to their powerful image restoration capabilities. However,\nprevailing diffusion models often struggle to strike an optimal balance between\nefficiency and performance. Typically, they either neglect to exploit the\npotential of existing extensive pretrained models, limiting their generative\ncapacity, or they necessitate a dozens of forward passes starting from random\nnoises, compromising inference efficiency. In this paper, we present DoSSR, a\nDomain Shift diffusion-based SR model that capitalizes on the generative powers\nof pretrained diffusion models while significantly enhancing efficiency by\ninitiating the diffusion process with low-resolution (LR) images. At the core\nof our approach is a domain shift equation that integrates seamlessly with\nexisting diffusion models. This integration not only improves the use of\ndiffusion prior but also boosts inference efficiency. Moreover, we advance our\nmethod by transitioning the discrete shift process to a continuous formulation,\ntermed as DoS-SDEs. This advancement leads to the fast and customized solvers\nthat further enhance sampling efficiency. Empirical results demonstrate that\nour proposed method achieves state-of-the-art performance on synthetic and\nreal-world datasets, while notably requiring only 5 sampling steps. Compared to\nprevious diffusion prior based methods, our approach achieves a remarkable\nspeedup of 5-7 times, demonstrating its superior efficiency. Code:\nhttps://github.com/QinpengCui/DoSSR.\n","authors":["Qinpeng Cui","Yixuan Liu","Xinyi Zhang","Qiqi Bao","Zhongdao Wang","Qingmin Liao","Li Wang","Tian Lu","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2409.17778v1.pdf","comment":"This paper is accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17777v1","updated":"2024-09-26T12:15:13Z","published":"2024-09-26T12:15:13Z","title":"Harnessing Shared Relations via Multimodal Mixup Contrastive Learning\n for Multimodal Classification","summary":" Deep multimodal learning has shown remarkable success by leveraging\ncontrastive learning to capture explicit one-to-one relations across\nmodalities. However, real-world data often exhibits shared relations beyond\nsimple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive\nLearning approach to capture nuanced shared relations inherent in multimodal\ndata. Our key contribution is a Mixup-based contrastive loss that learns robust\nrepresentations by aligning mixed samples from one modality with their\ncorresponding samples from other modalities thereby capturing shared relations\nbetween them. For multimodal classification tasks, we introduce a framework\nthat integrates a fusion module with unimodal prediction modules for auxiliary\nsupervision during training, complemented by our proposed Mixup-based\ncontrastive loss. Through extensive experiments on diverse datasets (N24News,\nROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures\nshared multimodal relations and generalizes across domains. It outperforms\nstate-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving\ncomparable performance on Food-101. Our work highlights the significance of\nlearning shared relations for robust multimodal learning, opening up promising\navenues for future research.\n","authors":["Raja Kumar","Raghav Singhal","Pranamya Kulkarni","Deval Mehta","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2409.17777v1.pdf","comment":"RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9\n Tables"},{"id":"http://arxiv.org/abs/2409.17775v1","updated":"2024-09-26T12:13:52Z","published":"2024-09-26T12:13:52Z","title":"UNICORN: A Deep Learning Model for Integrating Multi-Stain Data in\n Histopathology","summary":" Background: The integration of multi-stain histopathology images through deep\nlearning poses a significant challenge in digital histopathology. Current\nmulti-modal approaches struggle with data heterogeneity and missing data. This\nstudy aims to overcome these limitations by developing a novel transformer\nmodel for multi-stain integration that can handle missing data during training\nas well as inference. Methods: We propose UNICORN (UNiversal modality\nIntegration Network for CORonary classificatioN) a multi-modal transformer\ncapable of processing multi-stain histopathology for atherosclerosis severity\nclass prediction. The architecture comprises a two-stage, end-to-end trainable\nmodel with specialized modules utilizing transformer self-attention blocks. The\ninitial stage employs domain-specific expert modules to extract features from\neach modality. In the subsequent stage, an aggregation expert module integrates\nthese features by learning the interactions between the different data\nmodalities. Results: Evaluation was performed using a multi-class dataset of\natherosclerotic lesions from the Munich Cardiovascular Studies Biobank\n(MISSION), using over 4,000 paired multi-stain whole slide images (WSIs) from\n170 deceased individuals on 7 prespecified segments of the coronary tree, each\nstained according to four histopathological protocols. UNICORN achieved a\nclassification accuracy of 0.67, outperforming other state-of-the-art models.\nThe model effectively identifies relevant tissue phenotypes across stainings\nand implicitly models disease progression. Conclusion: Our proposed multi-modal\ntransformer model addresses key challenges in medical data analysis, including\ndata heterogeneity and missing modalities. Explainability and the model's\neffectiveness in predicting atherosclerosis progression underscores its\npotential for broader applications in medical research.\n","authors":["Valentin Koch","Sabine Bauer","Valerio Luppberger","Michael Joner","Heribert Schunkert","Julia A. Schnabel","Moritz von Scheidt","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2409.17775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17759v1","updated":"2024-09-26T11:53:25Z","published":"2024-09-26T11:53:25Z","title":"LGFN: Lightweight Light Field Image Super-Resolution using Local\n Convolution Modulation and Global Attention Feature Extraction","summary":" Capturing different intensity and directions of light rays at the same scene\nLight field (LF) can encode the 3D scene cues into a 4D LF image which has a\nwide range of applications (i.e. post-capture refocusing and depth sensing). LF\nimage super-resolution (SR) aims to improve the image resolution limited by the\nperformance of LF camera sensor. Although existing methods have achieved\npromising results the practical application of these models is limited because\nthey are not lightweight enough. In this paper we propose a lightweight model\nnamed LGFN which integrates the local and global features of different views\nand the features of different channels for LF image SR. Specifically owing to\nneighboring regions of the same pixel position in different sub-aperture images\nexhibit similar structural relationships we design a lightweight CNN-based\nfeature extraction module (namely DGCE) to extract local features better\nthrough feature modulation. Meanwhile as the position beyond the boundaries in\nthe LF image presents a large disparity we propose an efficient spatial\nattention module (namely ESAM) which uses decomposable large-kernel convolution\nto obtain an enlarged receptive field and an efficient channel attention module\n(namely ECAM). Compared with the existing LF image SR models with large\nparameter our model has a parameter of 0.45M and a FLOPs of 19.33G which has\nachieved a competitive effect. Extensive experiments with ablation studies\ndemonstrate the effectiveness of our proposed method which ranked the second\nplace in the Track 2 Fidelity & Efficiency of NTIRE2024 Light Field Super\nResolution Challenge and the seventh place in the Track 1 Fidelity.\n","authors":["Zhongxin Yu","Liang Chen","Zhiyun Zeng","Kunping Yang","Shaofei Luo","Shaorui Chen","Cheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.17759v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.09369v3","updated":"2024-09-26T11:38:12Z","published":"2024-09-14T08:47:45Z","title":"Interpretable Vision-Language Survival Analysis with Ordinal Inductive\n Bias for Computational Pathology","summary":" Histopathology Whole-Slide Images (WSIs) provide an important tool to assess\ncancer prognosis in computational pathology (CPATH). While existing survival\nanalysis (SA) approaches have made exciting progress, they are generally\nlimited to adopting highly-expressive architectures and only coarse-grained\npatient-level labels to learn prognostic visual representations from gigapixel\nWSIs. Such learning paradigm suffers from important performance bottlenecks,\nwhen facing present scarce training data and standard multi-instance learning\n(MIL) framework in CPATH. To overcome it, this paper, for the first time,\nproposes a new Vision-Language-based SA (VLSA) paradigm. Concretely, (1) VLSA\nis driven by pathology VL foundation models. It no longer relies on\nhigh-capability networks and shows the advantage of data efficiency. (2) In\nvision-end, VLSA encodes prognostic language prior and then employs it as\nauxiliary signals to guide the aggregating of prognostic visual features at\ninstance level, thereby compensating for the weak supervision in MIL. Moreover,\ngiven the characteristics of SA, we propose i) ordinal survival prompt learning\nto transform continuous survival labels into textual prompts; and ii) ordinal\nincidence function as prediction target to make SA compatible with VL-based\nprediction. Notably, VLSA's predictions can be interpreted intuitively by our\nShapley values-based method. The extensive experiments on five datasets confirm\nthe effectiveness of our scheme. Our VLSA could pave a new way for SA in CPATH\nby offering weakly-supervised MIL an effective means to learn valuable\nprognostic clues from gigapixel WSIs. Our source code is available at\nhttps://github.com/liupei101/VLSA.\n","authors":["Pei Liu","Luping Ji","Jiaxiang Gou","Bo Fu","Mao Ye"],"pdf_url":"https://arxiv.org/pdf/2409.09369v3.pdf","comment":"24 pages, 11 tables, 6 figures"},{"id":"http://arxiv.org/abs/2401.01008v3","updated":"2024-09-26T11:35:22Z","published":"2023-12-13T17:05:37Z","title":"Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models","summary":" Text-to-image diffusion models have demonstrated unprecedented capabilities\nfor flexible and realistic image synthesis. Nevertheless, these models rely on\na time-consuming sampling procedure, which has motivated attempts to reduce\ntheir latency. When improving efficiency, researchers often use the original\ndiffusion model to train an additional network designed specifically for fast\nimage generation. In contrast, our approach seeks to reduce latency directly,\nwithout any retraining, fine-tuning, or knowledge distillation. In particular,\nwe find the repeated calculation of attention maps to be costly yet redundant,\nand instead suggest reusing them during sampling. Our specific reuse strategies\nare based on ODE theory, which implies that the later a map is reused, the\nsmaller the distortion in the final image. We empirically compare these reuse\nstrategies with few-step sampling procedures of comparable latency, finding\nthat reuse generates images that are closer to those produced by the original\nhigh-latency diffusion model.\n","authors":["Rosco Hunter","Łukasz Dudziak","Mohamed S. Abdelfattah","Abhinav Mehrotra","Sourav Bhattacharya","Hongkai Wen"],"pdf_url":"https://arxiv.org/pdf/2401.01008v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12844v2","updated":"2024-09-26T11:29:04Z","published":"2024-02-20T09:13:15Z","title":"ICON: Improving Inter-Report Consistency in Radiology Report Generation\n via Lesion-aware Mixup Augmentation","summary":" Previous research on radiology report generation has made significant\nprogress in terms of increasing the clinical accuracy of generated reports. In\nthis paper, we emphasize another crucial quality that it should possess, i.e.,\ninter-report consistency, which refers to the capability of generating\nconsistent reports for semantically equivalent radiographs. This quality is\neven of greater significance than the overall report accuracy in terms of\nensuring the system's credibility, as a system prone to providing conflicting\nresults would severely erode users' trust. Regrettably, existing approaches\nstruggle to maintain inter-report consistency, exhibiting biases towards common\npatterns and susceptibility to lesion variants. To address this issue, we\npropose ICON, which improves the inter-report consistency of radiology report\ngeneration. Aiming to enhance the system's ability to capture similarities in\nsemantically equivalent lesions, our approach first involves extracting lesions\nfrom input images and examining their characteristics. Then, we introduce a\nlesion-aware mixup technique to ensure that the representations of the\nsemantically equivalent lesions align with the same attributes, achieved\nthrough a linear combination during the training phase. Extensive experiments\non three publicly available chest X-ray datasets verify the effectiveness of\nour approach, both in terms of improving the consistency and accuracy of the\ngenerated reports.\n","authors":["Wenjun Hou","Yi Cheng","Kaishuai Xu","Yan Hu","Wenjie Li","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17747v1","updated":"2024-09-26T11:23:59Z","published":"2024-09-26T11:23:59Z","title":"Text Image Generation for Low-Resource Languages with Dual Translation\n Learning","summary":" Scene text recognition in low-resource languages frequently faces challenges\ndue to the limited availability of training datasets derived from real-world\nscenes. This study proposes a novel approach that generates text images in\nlow-resource languages by emulating the style of real text images from\nhigh-resource languages. Our approach utilizes a diffusion model that is\nconditioned on binary states: ``synthetic'' and ``real.'' The training of this\nmodel involves dual translation tasks, where it transforms plain text images\ninto either synthetic or real text images, based on the binary states. This\napproach not only effectively differentiates between the two domains but also\nfacilitates the model's explicit recognition of characters in the target\nlanguage. Furthermore, to enhance the accuracy and variety of generated text\nimages, we introduce two guidance techniques: Fidelity-Diversity Balancing\nGuidance and Fidelity Enhancement Guidance. Our experimental results\ndemonstrate that the text images generated by our proposed framework can\nsignificantly improve the performance of scene text recognition models for\nlow-resource languages.\n","authors":["Chihiro Noguchi","Shun Fukuda","Shoichiro Mihara","Masao Yamanaka"],"pdf_url":"https://arxiv.org/pdf/2409.17747v1.pdf","comment":"23 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.06945v2","updated":"2024-09-26T11:21:27Z","published":"2024-05-11T07:56:19Z","title":"Direct Learning of Mesh and Appearance via 3D Gaussian Splatting","summary":" Accurately reconstructing a 3D scene including explicit geometry information\nis both attractive and challenging. Geometry reconstruction can benefit from\nincorporating differentiable appearance models, such as Neural Radiance Fields\nand 3D Gaussian Splatting (3DGS). However, existing methods encounter\nefficiency issues due to indirect geometry learning and the paradigm of\nseparately modeling geometry and surface appearance. In this work, we propose a\nlearnable scene model that incorporates 3DGS with an explicit geometry\nrepresentation, namely a mesh. Our model learns the mesh and appearance in an\nend-to-end manner, where we bind 3D Gaussians to the mesh faces and perform\ndifferentiable rendering of 3DGS to obtain photometric supervision. The model\ncreates an effective information pathway to supervise the learning of both 3DGS\nand mesh. Experimental results demonstrate that the learned scene model not\nonly achieves state-of-the-art efficiency and rendering quality but also\nsupports manipulation using the explicit mesh. In addition, our model has a\nunique advantage in adapting to scene updates, thanks to the end-to-end\nlearning of both mesh and appearance.\n","authors":["Ancheng Lin","Jun Li"],"pdf_url":"https://arxiv.org/pdf/2405.06945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17740v1","updated":"2024-09-26T11:15:15Z","published":"2024-09-26T11:15:15Z","title":"AnyLogo: Symbiotic Subject-Driven Diffusion System with Gemini Status","summary":" Diffusion models have made compelling progress on facilitating\nhigh-throughput daily production. Nevertheless, the appealing customized\nrequirements are remain suffered from instance-level finetuning for authentic\nfidelity. Prior zero-shot customization works achieve the semantic consistence\nthrough the condensed injection of identity features, while addressing detailed\nlow-level signatures through complex model configurations and subject-specific\nfabrications, which significantly break the statistical coherence within the\noverall system and limit the applicability across various scenarios. To\nfacilitate the generic signature concentration with rectified efficiency, we\npresent \\textbf{AnyLogo}, a zero-shot region customizer with remarkable detail\nconsistency, building upon the symbiotic diffusion system with eliminated\ncumbersome designs. Streamlined as vanilla image generation, we discern that\nthe rigorous signature extraction and creative content generation are\npromisingly compatible and can be systematically recycled within a single\ndenoising model. In place of the external configurations, the gemini status of\nthe denoising model promote the reinforced subject transmission efficiency and\ndisentangled semantic-signature space with continuous signature decoration.\nMoreover, the sparse recycling paradigm is adopted to prevent the duplicated\nrisk with compressed transmission quota for diversified signature stimulation.\nExtensive experiments on constructed logo-level benchmarks demonstrate the\neffectiveness and practicability of our methods.\n","authors":["Jinghao Zhang","Wen Qian","Hao Luo","Fan Wang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.17740v1.pdf","comment":"13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17729v1","updated":"2024-09-26T10:58:31Z","published":"2024-09-26T10:58:31Z","title":"Neural Implicit Representation for Highly Dynamic LiDAR Mapping and\n Odometry","summary":" Recent advancements in Simultaneous Localization and Mapping (SLAM) have\nincreasingly highlighted the robustness of LiDAR-based techniques. At the same\ntime, Neural Radiance Fields (NeRF) have introduced new possibilities for 3D\nscene reconstruction, exemplified by SLAM systems. Among these, NeRF-LOAM has\nshown notable performance in NeRF-based SLAM applications. However, despite its\nstrengths, these systems often encounter difficulties in dynamic outdoor\nenvironments due to their inherent static assumptions. To address these\nlimitations, this paper proposes a novel method designed to improve\nreconstruction in highly dynamic outdoor scenes. Based on NeRF-LOAM, the\nproposed approach consists of two primary components. First, we separate the\nscene into static background and dynamic foreground. By identifying and\nexcluding dynamic elements from the mapping process, this segmentation enables\nthe creation of a dense 3D map that accurately represents the static background\nonly. The second component extends the octree structure to support\nmulti-resolution representation. This extension not only enhances\nreconstruction quality but also aids in the removal of dynamic objects\nidentified by the first module. Additionally, Fourier feature encoding is\napplied to the sampled points, capturing high-frequency information and leading\nto more complete reconstruction results. Evaluations on various datasets\ndemonstrate that our method achieves more competitive results compared to\ncurrent state-of-the-art approaches.\n","authors":["Qi Zhang","He Wang","Ru Li","Wenbin Li"],"pdf_url":"https://arxiv.org/pdf/2409.17729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17728v1","updated":"2024-09-26T10:57:02Z","published":"2024-09-26T10:57:02Z","title":"AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with\n Alternative Modality Masking","summary":" Camera-LiDAR fusion models significantly enhance perception performance in\nautonomous driving. The fusion mechanism leverages the strengths of each\nmodality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR\nfusion models utilize pre-trained backbones for efficient training. However, we\nargue that directly loading single-modal pre-trained camera and LiDAR backbones\ninto camera-LiDAR fusion models introduces similar feature redundancy across\nmodalities due to the nature of the fusion mechanism. Unfortunately, existing\npruning methods are developed explicitly for single-modal models, and thus,\nthey struggle to effectively identify these specific redundant parameters in\ncamera-LiDAR fusion models. In this paper, to address the issue above on\ncamera-LiDAR fusion models, we propose a novelty pruning framework Alternative\nModality Masking Pruning (AlterMOMA), which employs alternative masking on each\nmodality and identifies the redundant parameters. Specifically, when one\nmodality parameters are masked (deactivated), the absence of features from the\nmasked backbone compels the model to reactivate previous redundant features of\nthe other modality backbone. Therefore, these redundant features and relevant\nredundant parameters can be identified via the reactivation process. The\nredundant parameters can be pruned by our proposed importance score evaluation\nfunction, Alternative Evaluation (AlterEva), which is based on the observation\nof the loss changes when certain modality parameters are activated and\ndeactivated. Extensive experiments on the nuScene and KITTI datasets\nencompassing diverse tasks, baseline models, and pruning algorithms showcase\nthat AlterMOMA outperforms existing pruning methods, attaining state-of-the-art\nperformance.\n","authors":["Shiqi Sun","Yantao Lu","Ning Liu","Bo Jiang","JinChao Chen","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17728v1.pdf","comment":"17 pages, 3 figures, Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17727v1","updated":"2024-09-26T10:56:35Z","published":"2024-09-26T10:56:35Z","title":"Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications","summary":" Vision language models have played a key role in extracting meaningful\nfeatures for various robotic applications. Among these, Contrastive\nLanguage-Image Pretraining (CLIP) is widely used in robotic tasks that require\nboth vision and natural language understanding. However, CLIP was trained\nsolely on static images paired with text prompts and has not yet been fully\nadapted for robotic tasks involving dynamic actions. In this paper, we\nintroduce Robotic-CLIP to enhance robotic perception capabilities. We first\ngather and label large-scale action data, and then build our Robotic-CLIP by\nfine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using\ncontrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's\nstrong image performance while gaining the ability to understand actions in\nrobotic contexts. Intensive experiments show that our Robotic-CLIP outperforms\nother CLIP-based models across various language-driven robotic tasks.\nAdditionally, we demonstrate the practical effectiveness of Robotic-CLIP in\nreal-world grasping applications.\n","authors":["Nghia Nguyen","Minh Nhat Vu","Tung D. Ta","Baoru Huang","Thieu Vo","Ngan Le","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.17727v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.17720v1","updated":"2024-09-26T10:43:09Z","published":"2024-09-26T10:43:09Z","title":"Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations\n Between Initial and Final Scenes","summary":" With robots increasingly collaborating with humans in everyday tasks, it is\nimportant to take steps toward robotic systems capable of understanding the\nenvironment. This work focuses on scene understanding to detect pick and place\ntasks given initial and final images from the scene. To this end, a dataset is\ncollected for object detection and pick and place task detection. A YOLOv5\nnetwork is subsequently trained to detect the objects in the initial and final\nscenes. Given the detected objects and their bounding boxes, two methods are\nproposed to detect the pick and place tasks which transform the initial scene\ninto the final scene. A geometric method is proposed which tracks objects'\nmovements in the two scenes and works based on the intersection of the bounding\nboxes which moved within scenes. Contrarily, the CNN-based method utilizes a\nConvolutional Neural Network to classify objects with intersected bounding\nboxes into 5 classes, showing the spatial relationship between the involved\nobjects. The performed pick and place tasks are then derived from analyzing the\nexperiments with both scenes. Results show that the CNN-based method, using a\nVGG16 backbone, outscores the geometric method by roughly 12 percentage points\nin certain scenarios, with an overall success rate of 84.3%.\n","authors":["Seraj Ghasemi","Hamed Hosseini","MohammadHossein Koosheshi","Mehdi Tale Masouleh","Ahmad Kalhor"],"pdf_url":"https://arxiv.org/pdf/2409.17720v1.pdf","comment":"Conference Paper, ICEE 2024, 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17717v1","updated":"2024-09-26T10:40:23Z","published":"2024-09-26T10:40:23Z","title":"Behaviour4All: in-the-wild Facial Behaviour Analysis Toolkit","summary":" In this paper, we introduce Behavior4All, a comprehensive, open-source\ntoolkit for in-the-wild facial behavior analysis, integrating Face\nLocalization, Valence-Arousal Estimation, Basic Expression Recognition and\nAction Unit Detection, all within a single framework. Available in both\nCPU-only and GPU-accelerated versions, Behavior4All leverages 12 large-scale,\nin-the-wild datasets consisting of over 5 million images from diverse\ndemographic groups. It introduces a novel framework that leverages distribution\nmatching and label co-annotation to address tasks with non-overlapping\nannotations, encoding prior knowledge of their relatedness. In the largest\nstudy of its kind, Behavior4All outperforms both state-of-the-art and toolkits\nin overall performance as well as fairness across all databases and tasks. It\nalso demonstrates superior generalizability on unseen databases and on compound\nexpression recognition. Finally, Behavior4All is way times faster than other\ntoolkits.\n","authors":["Dimitrios Kollias","Chunchang Shao","Odysseus Kaloidas","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2409.17717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00230v3","updated":"2024-09-26T10:27:58Z","published":"2024-03-30T03:19:50Z","title":"Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space","summary":" Watermarking is a tool for actively identifying and attributing the images\ngenerated by latent diffusion models. Existing methods face the dilemma of\nimage quality and watermark robustness. Watermarks with superior image quality\nusually have inferior robustness against attacks such as blurring and JPEG\ncompression, while watermarks with superior robustness usually significantly\ndamage image quality. This dilemma stems from the traditional paradigm where\nwatermarks are injected and detected in pixel space, relying on pixel\nperturbation for watermark detection and resilience against attacks. In this\npaper, we highlight that an effective solution to the problem is to both inject\nand detect watermarks in the latent diffusion space, and propose Latent\nWatermark with a progressive training strategy. It weakens the direct\nconnection between quality and robustness and thus alleviates their\ncontradiction. We conduct evaluations on two datasets and against 10 watermark\nattacks. Six metrics measure the image quality and watermark robustness.\nResults show that compared to the recently proposed methods such as\nStableSignature, StegaStamp, RoSteALS, LaWa, TreeRing, and DiffuseTrace, LW not\nonly surpasses them in terms of robustness but also offers superior image\nquality. Our code will be available at\nhttps://github.com/RichardSunnyMeng/LatentWatermark.\n","authors":["Zheling Meng","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.00230v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05024v2","updated":"2024-09-26T09:55:49Z","published":"2024-09-08T08:33:32Z","title":"Deep Self-Cleansing for Medical Image Segmentation with Noisy Labels","summary":" Medical image segmentation is crucial in the field of medical imaging, aiding\nin disease diagnosis and surgical planning. Most established segmentation\nmethods rely on supervised deep learning, in which clean and precise labels are\nessential for supervision and significantly impact the performance of models.\nHowever, manually delineated labels often contain noise, such as missing labels\nand inaccurate boundary delineation, which can hinder networks from correctly\nmodeling target characteristics. In this paper, we propose a deep\nself-cleansing segmentation framework that can preserve clean labels while\ncleansing noisy ones in the training phase. To achieve this, we devise a\ngaussian mixture model-based label filtering module that distinguishes noisy\nlabels from clean labels. Additionally, we develop a label cleansing module to\ngenerate pseudo low-noise labels for identified noisy samples. The preserved\nclean labels and pseudo-labels are then used jointly to supervise the network.\nValidated on a clinical liver tumor dataset and a public cardiac diagnosis\ndataset, our method can effectively suppress the interference from noisy labels\nand achieve prominent segmentation performance.\n","authors":["Jiahua Dong","Yue Zhang","Qiuli Wang","Ruofeng Tong","Shihong Ying","Shaolin Gong","Xuanpu Zhang","Lanfen Lin","Yen-Wei Chen","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.05024v2.pdf","comment":"31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17686v1","updated":"2024-09-26T09:51:11Z","published":"2024-09-26T09:51:11Z","title":"MoGenTS: Motion Generation based on Spatial-Temporal Joint Modeling","summary":" Motion generation from discrete quantization offers many advantages over\ncontinuous regression, but at the cost of inevitable approximation errors.\nPrevious methods usually quantize the entire body pose into one code, which not\nonly faces the difficulty in encoding all joints within one vector but also\nloses the spatial relationship between different joints. Differently, in this\nwork we quantize each individual joint into one vector, which i) simplifies the\nquantization process as the complexity associated with a single joint is\nmarkedly lower than that of the entire pose; ii) maintains a spatial-temporal\nstructure that preserves both the spatial relationships among joints and the\ntemporal movement patterns; iii) yields a 2D token map, which enables the\napplication of various 2D operations widely used in 2D images. Grounded in the\n2D motion quantization, we build a spatial-temporal modeling framework, where\n2D joint VQVAE, temporal-spatial 2D masking technique, and spatial-temporal 2D\nattention are proposed to take advantage of spatial-temporal signals among the\n2D tokens. Extensive experiments demonstrate that our method significantly\noutperforms previous methods across different datasets, with a $26.6\\%$\ndecrease of FID on HumanML3D and a $29.9\\%$ decrease on KIT-ML.\n","authors":["Weihao Yuan","Weichao Shen","Yisheng He","Yuan Dong","Xiaodong Gu","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17686v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17682v1","updated":"2024-09-26T09:48:24Z","published":"2024-09-26T09:48:24Z","title":"Dark Miner: Defend against unsafe generation for text-to-image diffusion\n models","summary":" Text-to-image diffusion models have been demonstrated with unsafe generation\ndue to unfiltered large-scale training data, such as violent, sexual, and\nshocking images, necessitating the erasure of unsafe concepts. Most existing\nmethods focus on modifying the generation probabilities conditioned on the\ntexts containing unsafe descriptions. However, they fail to guarantee safe\ngeneration for unseen texts in the training phase, especially for the prompts\nfrom adversarial attacks. In this paper, we re-analyze the erasure task and\npoint out that existing methods cannot guarantee the minimization of the total\nprobabilities of unsafe generation. To tackle this problem, we propose Dark\nMiner. It entails a recurring three-stage process that comprises mining,\nverifying, and circumventing. It greedily mines embeddings with maximum\ngeneration probabilities of unsafe concepts and reduces unsafe generation more\neffectively. In the experiments, we evaluate its performance on two\ninappropriate concepts, two objects, and two styles. Compared with 6 previous\nstate-of-the-art methods, our method achieves better erasure and defense\nresults in most cases, especially under 4 state-of-the-art attacks, while\npreserving the model's native generation capability. Our code will be available\non GitHub.\n","authors":["Zheling Meng","Bo Peng","Xiaochuan Jin","Yue Jiang","Jing Dong","Wei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2409.17682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17680v1","updated":"2024-09-26T09:43:50Z","published":"2024-09-26T09:43:50Z","title":"Event-based Stereo Depth Estimation: A Survey","summary":" Stereopsis has widespread appeal in robotics as it is the predominant way by\nwhich living beings perceive depth to navigate our 3D world. Event cameras are\nnovel bio-inspired sensors that detect per-pixel brightness changes\nasynchronously, with very high temporal resolution and high dynamic range,\nenabling machine perception in high-speed motion and broad illumination\nconditions. The high temporal precision also benefits stereo matching, making\ndisparity (depth) estimation a popular research area for event cameras ever\nsince its inception. Over the last 30 years, the field has evolved rapidly,\nfrom low-latency, low-power circuit design to current deep learning (DL)\napproaches driven by the computer vision community. The bibliography is vast\nand difficult to navigate for non-experts due its highly interdisciplinary\nnature. Past surveys have addressed distinct aspects of this topic, in the\ncontext of applications, or focusing only on a specific class of techniques,\nbut have overlooked stereo datasets. This survey provides a comprehensive\noverview, covering both instantaneous stereo and long-term methods suitable for\nsimultaneous localization and mapping (SLAM), along with theoretical and\nempirical comparisons. It is the first to extensively review DL methods as well\nas stereo datasets, even providing practical suggestions for creating new\nbenchmarks to advance the field. The main advantages and challenges faced by\nevent-based stereo depth estimation are also discussed. Despite significant\nprogress, challenges remain in achieving optimal performance in not only\naccuracy but also efficiency, a cornerstone of event-based computing. We\nidentify several gaps and propose future research directions. We hope this\nsurvey inspires future research in this area, by serving as an accessible entry\npoint for newcomers, as well as a practical guide for seasoned researchers in\nthe community.\n","authors":["Suman Ghosh","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2409.17680v1.pdf","comment":"28 pages, 20 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.17675v1","updated":"2024-09-26T09:34:33Z","published":"2024-09-26T09:34:33Z","title":"EM-Net: Efficient Channel and Frequency Learning with Mamba for 3D\n Medical Image Segmentation","summary":" Convolutional neural networks have primarily led 3D medical image\nsegmentation but may be limited by small receptive fields. Transformer models\nexcel in capturing global relationships through self-attention but are\nchallenged by high computational costs at high resolutions. Recently, Mamba, a\nstate space model, has emerged as an effective approach for sequential\nmodeling. Inspired by its success, we introduce a novel Mamba-based 3D medical\nimage segmentation model called EM-Net. It not only efficiently captures\nattentive interaction between regions by integrating and selecting channels,\nbut also effectively utilizes frequency domain to harmonize the learning of\nfeatures across varying scales, while accelerating training speed.\nComprehensive experiments on two challenging multi-organ datasets with other\nstate-of-the-art (SOTA) algorithms show that our method exhibits better\nsegmentation accuracy while requiring nearly half the parameter size of SOTA\nmodels and 2x faster training speed.\n","authors":["Ao Chang","Jiajun Zeng","Ruobing Huang","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2409.17675v1.pdf","comment":"10 pages, 3 figures, accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.17674v1","updated":"2024-09-26T09:33:20Z","published":"2024-09-26T09:33:20Z","title":"Self-Supervised Learning of Deviation in Latent Representation for\n Co-speech Gesture Video Generation","summary":" Gestures are pivotal in enhancing co-speech communication. While recent works\nhave mostly focused on point-level motion transformation or fully supervised\nmotion representations through data-driven approaches, we explore the\nrepresentation of gestures in co-speech, with a focus on self-supervised\nrepresentation and pixel-level motion deviation, utilizing a diffusion model\nwhich incorporates latent motion features. Our approach leverages\nself-supervised deviation in latent representation to facilitate hand gestures\ngeneration, which are crucial for generating realistic gesture videos. Results\nof our first experiment demonstrate that our method enhances the quality of\ngenerated videos, with an improvement from 2.7 to 4.5% for FGD, DIV, and FVD,\nand 8.1% for PSNR, 2.5% for SSIM over the current state-of-the-art methods.\n","authors":["Huan Yang","Jiahui Chen","Chaofan Ding","Runhua Shi","Siyu Xiong","Qingqi Hong","Xiaoqi Mo","Xinhan Di"],"pdf_url":"https://arxiv.org/pdf/2409.17674v1.pdf","comment":"5 pages, 5 figures, conference"},{"id":"http://arxiv.org/abs/2404.09486v2","updated":"2024-09-26T09:31:48Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Benchmarking Multimodal Large Language Models for Code\n Generation with Visually Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/likaixin2000/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Zhiyong Huang","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17671v1","updated":"2024-09-26T09:30:37Z","published":"2024-09-26T09:30:37Z","title":"Leveraging Anthropometric Measurements to Improve Human Mesh Estimation\n and Ensure Consistent Body Shapes","summary":" The basic body shape of a person does not change within a single video.\nHowever, most SOTA human mesh estimation (HME) models output a slightly\ndifferent body shape for each video frame, which results in inconsistent body\nshapes for the same person. In contrast, we leverage anthropometric\nmeasurements like tailors are already obtaining from humans for centuries. We\ncreate a model called A2B that converts such anthropometric measurements to\nbody shape parameters of human mesh models. Moreover, we find that finetuned\nSOTA 3D human pose estimation (HPE) models outperform HME models regarding the\nprecision of the estimated keypoints. We show that applying inverse kinematics\n(IK) to the results of such a 3D HPE model and combining the resulting body\npose with the A2B body shape leads to superior and consistent human meshes for\nchallenging datasets like ASPset or fit3D, where we can lower the MPJPE by over\n30 mm compared to SOTA HME models. Further, replacing HME models estimates of\nthe body shape parameters with A2B model results not only increases the\nperformance of these HME models, but also leads to consistent body shapes.\n","authors":["Katja Ludwig","Julian Lorenz","Daniel Kienzle","Tuan Bui","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2409.17671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2402.18411v3","updated":"2024-09-26T09:00:00Z","published":"2024-02-28T15:31:45Z","title":"Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal\n Transport","summary":" Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images\nsharing the same category across diverse domains without relying on labeled\ndata. Prior approaches have typically decomposed the UCIR problem into two\ndistinct tasks: intra-domain representation learning and cross-domain feature\nalignment. However, these segregated strategies overlook the potential\nsynergies between these tasks. This paper introduces ProtoOT, a novel Optimal\nTransport formulation explicitly tailored for UCIR, which integrates\nintra-domain feature representation learning and cross-domain alignment into a\nunified framework. ProtoOT leverages the strengths of the K-means clustering\nmethod to effectively manage distribution imbalances inherent in UCIR. By\nutilizing K-means for generating initial prototypes and approximating class\nmarginal distributions, we modify the constraints in Optimal Transport\naccordingly, significantly enhancing its performance in UCIR scenarios.\nFurthermore, we incorporate contrastive learning into the ProtoOT framework to\nfurther improve representation learning. This encourages local semantic\nconsistency among features with similar semantics, while also explicitly\nenforcing separation between features and unmatched prototypes, thereby\nenhancing global discriminativeness. ProtoOT surpasses existing\nstate-of-the-art methods by a notable margin across benchmark datasets.\nNotably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 18.17%,\nand on Office-Home, it demonstrates a P@15 improvement of 3.83%.\n","authors":["Bin Li","Ye Shi","Qian Yu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18411v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18576v5","updated":"2024-09-26T08:57:49Z","published":"2023-11-30T14:15:39Z","title":"Fixed-length Dense Descriptor for Efficient Fingerprint Matching","summary":" In fingerprint matching, fixed-length descriptors generally offer greater\nefficiency compared to minutiae set, but the recognition accuracy is not as\ngood as that of the latter. Although much progress has been made in deep\nlearning based fixed-length descriptors recently, they often fall short when\ndealing with incomplete or partial fingerprints, diverse fingerprint poses, and\nsignificant background noise. In this paper, we propose a three-dimensional\nrepresentation called Fixed-length Dense Descriptor (FDD) for efficient\nfingerprint matching. FDD features great spatial properties, enabling it to\ncapture the spatial relationships of the original fingerprints, thereby\nenhancing interpretability and robustness. Our experiments on various\nfingerprint datasets reveal that FDD outperforms other fixed-length\ndescriptors, especially in matching fingerprints of different areas,\ncross-modal fingerprint matching, and fingerprint matching with background\nnoise.\n","authors":["Zhiyu Pan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v5.pdf","comment":"Accepted by WIFS 2024"},{"id":"http://arxiv.org/abs/2409.17649v1","updated":"2024-09-26T08:55:44Z","published":"2024-09-26T08:55:44Z","title":"Provable Performance Guarantees of Copy Detection Patterns","summary":" Copy Detection Patterns (CDPs) are crucial elements in modern security\napplications, playing a vital role in safeguarding industries such as food,\npharmaceuticals, and cosmetics. Current performance evaluations of CDPs\npredominantly rely on empirical setups using simplistic metrics like Hamming\ndistances or Pearson correlation. These methods are often inadequate due to\ntheir sensitivity to distortions, degradation, and their limitations to\nstationary statistics of printing and imaging. Additionally, machine\nlearning-based approaches suffer from distribution biases and fail to\ngeneralize to unseen counterfeit samples. Given the critical importance of CDPs\nin preventing counterfeiting, including the counterfeit vaccines issue\nhighlighted during the COVID-19 pandemic, there is an urgent need for provable\nperformance guarantees across various criteria. This paper aims to establish a\ntheoretical framework to derive optimal criteria for the analysis,\noptimization, and future development of CDP authentication technologies,\nensuring their reliability and effectiveness in diverse security scenarios.\n","authors":["Joakim Tutt","Slava Voloshynovskiy"],"pdf_url":"https://arxiv.org/pdf/2409.17649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17647v1","updated":"2024-09-26T08:51:29Z","published":"2024-09-26T08:51:29Z","title":"MECD: Unlocking Multi-Event Causal Discovery in Video Reasoning","summary":" Video causal reasoning aims to achieve a high-level understanding of video\ncontent from a causal perspective. However, current video reasoning tasks are\nlimited in scope, primarily executed in a question-answering paradigm and\nfocusing on short videos containing only a single event and simple causal\nrelationships, lacking comprehensive and structured causality analysis for\nvideos with multiple events. To fill this gap, we introduce a new task and\ndataset, Multi-Event Causal Discovery (MECD). It aims to uncover the causal\nrelationships between events distributed chronologically across long videos.\nGiven visual segments and textual descriptions of events, MECD requires\nidentifying the causal associations between these events to derive a\ncomprehensive, structured event-level video causal diagram explaining why and\nhow the final result event occurred. To address MECD, we devise a novel\nframework inspired by the Granger Causality method, using an efficient\nmask-based event prediction model to perform an Event Granger Test, which\nestimates causality by comparing the predicted result event when premise events\nare masked versus unmasked. Furthermore, we integrate causal inference\ntechniques such as front-door adjustment and counterfactual inference to\naddress challenges in MECD like causality confounding and illusory causality.\nExperiments validate the effectiveness of our framework in providing causal\nrelationships in multi-event videos, outperforming GPT-4o and VideoLLaVA by\n5.7% and 4.1%, respectively.\n","authors":["Tieyuan Chen","Huabin Liu","Tianyao He","Yihang Chen","Chaofan Gan","Xiao Ma","Cheng Zhong","Yang Zhang","Yingxue Wang","Hui Lin","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2409.17647v1.pdf","comment":"Accepted at NeurIPS 2024 as a spotlight paper"},{"id":"http://arxiv.org/abs/2409.15246v2","updated":"2024-09-26T08:48:03Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems play a crucial role in achieving Sustainable\nDevelopment Goals by collecting and analyzing vital global data through\nsatellite networks. These systems are essential for tasks like mapping,\ndisaster monitoring, and resource management, but they face challenges in\nprocessing and transmitting large volumes of EO data, especially in specialized\nfields such as agriculture and real-time disaster response. Domain-adapted\nLarge Language Models (LLMs) provide a promising solution by facilitating data\nfusion between extensive EO data and semantic EO data. By improving integration\nand interpretation of diverse datasets, LLMs address the challenges of\nprocessing specialized information in agriculture and disaster response\napplications. This fusion enhances the accuracy and relevance of transmitted\ndata. This paper presents a framework for semantic communication in EO\nsatellite networks, aimed at improving data transmission efficiency and overall\nsystem performance through cognitive processing techniques. The proposed system\nemploys Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic\nData Augmentation (SA) to focus on relevant information while minimizing\ncommunication overhead. By integrating cognitive semantic processing and\ninter-satellite links, the framework enhances the analysis and transmission of\nmultispectral satellite imagery, improving object detection, pattern\nrecognition, and real-time decision-making. The introduction of Cognitive\nSemantic Augmentation (CSA) allows satellites to process and transmit semantic\ninformation, boosting adaptability to changing environments and application\nneeds. This end-to-end architecture is tailored for next-generation satellite\nnetworks, such as those supporting 6G, and demonstrates significant\nimprovements in efficiency and accuracy.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v2.pdf","comment":"18 pages, 10 figures, magazine"},{"id":"http://arxiv.org/abs/2409.17634v1","updated":"2024-09-26T08:31:27Z","published":"2024-09-26T08:31:27Z","title":"P4Q: Learning to Prompt for Quantization in Visual-language Models","summary":" Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence\nin various visual and multimodal tasks, yet the deployment of VLMs on\ndownstream application platforms remains challenging due to their prohibitive\nrequirements of training samples and computing resources. Fine-tuning and\nquantization of VLMs can substantially reduce the sample and computation costs,\nwhich are in urgent need. There are two prevailing paradigms in quantization,\nQuantization-Aware Training (QAT) can effectively quantize large-scale VLMs but\nincur a huge training cost, while low-bit Post-Training Quantization (PTQ)\nsuffers from a notable performance drop. We propose a method that balances\nfine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which\nwe design a lightweight architecture to leverage contrastive loss supervision\nto enhance the recognition performance of a PTQ model. Our method can\neffectively reduce the gap between image features and text features caused by\nlow-bit quantization, based on learnable prompts to reorganize textual\nrepresentations and a low-bit adapter to realign the distributions of image and\ntext features. We also introduce a distillation loss based on cosine similarity\npredictions to distill the quantized model using a full-precision teacher.\nExtensive experimental results demonstrate that our P4Q method outperforms\nprior arts, even achieving comparable results to its full-precision\ncounterparts. For instance, our 8-bit P4Q can theoretically compress the\nCLIP-ViT/B-32 by 4 $\\times$ while achieving 66.94\\% Top-1 accuracy,\noutperforming the learnable prompt fine-tuned full-precision model by 2.24\\%\nwith negligible additional parameters on the ImageNet dataset.\n","authors":["Huixin Sun","Runqi Wang","Yanjing Li","Xianbin Cao","Xiaolong Jiang","Yao Hu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16723v2","updated":"2024-09-26T08:28:48Z","published":"2024-09-25T08:22:00Z","title":"EAGLE: Towards Efficient Arbitrary Referring Visual Prompts\n Comprehension for Multimodal Large Language Models","summary":" Recently, Multimodal Large Language Models (MLLMs) have sparked great\nresearch interests owing to their exceptional content-reasoning and\ninstruction-following capabilities. To effectively instruct an MLLM, in\naddition to conventional language expressions, the practice of referring to\nobjects by painting with brushes on images has emerged as a prevalent tool\n(referred to as \"referring visual prompts\") due to its efficacy in aligning the\nuser's intention with specific image regions. To accommodate the most common\nreferring visual prompts, namely points, boxes, and masks, existing approaches\ninitially utilize specialized feature encoding modules to capture the semantics\nof the highlighted areas indicated by these prompts. Subsequently, these\nencoded region features are adapted to MLLMs through fine-tuning on a\nmeticulously curated multimodal instruction dataset. However, such designs\nsuffer from redundancy in architecture. Moreover, they face challenges in\neffectively generalizing when encountering a diverse range of arbitrary\nreferring visual prompts in real-life scenarios. To address the above issues,\nwe propose EAGLE, a novel MLLM that empowers comprehension of arbitrary\nreferring visual prompts with less training efforts than existing approaches.\nSpecifically, our EAGLE maintains the innate format of the referring visual\nprompts as colored patches rendered on the given image for conducting the\ninstruction tuning. Our approach embeds referring visual prompts as spatial\nconcepts conveying specific spatial areas comprehensible to the MLLM, with the\nsemantic comprehension of these regions originating from the MLLM itself.\nBesides, we also propose a Geometry-Agnostic Learning paradigm (GAL) to further\ndisentangle the MLLM's region-level comprehension with the specific formats of\nreferring visual prompts. Extensive experiments are conducted to prove the\neffectiveness of our proposed method.\n","authors":["Jiacheng Zhang","Yang Jiao","Shaoxiang Chen","Jingjing Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.16723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17629v1","updated":"2024-09-26T08:23:04Z","published":"2024-09-26T08:23:04Z","title":"Hand-object reconstruction via interaction-aware graph attention\n mechanism","summary":" Estimating the poses of both a hand and an object has become an important\narea of research due to the growing need for advanced vision computing. The\nprimary challenge involves understanding and reconstructing how hands and\nobjects interact, such as contact and physical plausibility. Existing\napproaches often adopt a graph neural network to incorporate spatial\ninformation of hand and object meshes. However, these approaches have not fully\nexploited the potential of graphs without modification of edges within and\nbetween hand- and object-graphs. We propose a graph-based refinement method\nthat incorporates an interaction-aware graph-attention mechanism to account for\nhand-object interactions. Using edges, we establish connections among closely\ncorrelated nodes, both within individual graphs and across different graphs.\nExperiments demonstrate the effectiveness of our proposed method with notable\nimprovements in the realm of physical plausibility.\n","authors":["Taeyun Woo","Tae-Kyun Kim","Jinah Park"],"pdf_url":"https://arxiv.org/pdf/2409.17629v1.pdf","comment":"7 pages, Accepted by ICIP 2024"},{"id":"http://arxiv.org/abs/2405.17251v2","updated":"2024-09-26T08:22:52Z","published":"2024-05-27T15:07:04Z","title":"GenWarp: Single Image to Novel Views with Semantic-Preserving Generative\n Warping","summary":" Generating novel views from a single image remains a challenging task due to\nthe complexity of 3D scenes and the limited diversity in the existing\nmulti-view datasets to train a model on. Recent research combining large-scale\ntext-to-image (T2I) models with monocular depth estimation (MDE) has shown\npromise in handling in-the-wild images. In these methods, an input view is\ngeometrically warped to novel views with estimated depth maps, then the warped\nimage is inpainted by T2I models. However, they struggle with noisy depth maps\nand loss of semantic details when warping an input view to novel viewpoints. In\nthis paper, we propose a novel approach for single-shot novel view synthesis, a\nsemantic-preserving generative warping framework that enables T2I generative\nmodels to learn where to warp and where to generate, through augmenting\ncross-view attention with self-attention. Our approach addresses the\nlimitations of existing methods by conditioning the generative model on source\nview images and incorporating geometric warping signals. Qualitative and\nquantitative evaluations demonstrate that our model outperforms existing\nmethods in both in-domain and out-of-domain scenarios. Project page is\navailable at https://GenWarp-NVS.github.io/.\n","authors":["Junyoung Seo","Kazumi Fukuda","Takashi Shibuya","Takuya Narihira","Naoki Murata","Shoukang Hu","Chieh-Hsin Lai","Seungryong Kim","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2405.17251v2.pdf","comment":"Accepted to NeurIPS 2024 / Project page:\n https://GenWarp-NVS.github.io"},{"id":"http://arxiv.org/abs/2309.11531v2","updated":"2024-09-26T08:20:59Z","published":"2023-09-20T10:50:28Z","title":"EPTQ: Enhanced Post-Training Quantization via Hessian-guided\n Network-wise Optimization","summary":" Quantization is a key method for deploying deep neural networks on edge\ndevices with limited memory and computation resources. Recent improvements in\nPost-Training Quantization (PTQ) methods were achieved by an additional local\noptimization process for learning the weight quantization rounding policy.\nHowever, a gap exists when employing network-wise optimization with small\nrepresentative datasets. In this paper, we propose a new method for enhanced\nPTQ (EPTQ) that employs a network-wise quantization optimization process, which\nbenefits from considering cross-layer dependencies during optimization. EPTQ\nenables network-wise optimization with a small representative dataset using a\nnovel sample-layer attention score based on a label-free Hessian matrix upper\nbound. The label-free approach makes our method suitable for the PTQ scheme. We\ngive a theoretical analysis for the said bound and use it to construct a\nknowledge distillation loss that guides the optimization to focus on the more\nsensitive layers and samples. In addition, we leverage the Hessian upper bound\nto improve the weight quantization parameters selection by focusing on the more\nsensitive elements in the weight tensors. Empirically, by employing EPTQ we\nachieve state-of-the-art results on various models, tasks, and datasets,\nincluding ImageNet classification, COCO object detection, and Pascal-VOC for\nsemantic segmentation.\n","authors":["Ofir Gordon","Elad Cohen","Hai Victor Habi","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2309.11531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14220v2","updated":"2024-09-26T08:13:43Z","published":"2024-09-21T18:52:07Z","title":"Masks and Boxes: Combining the Best of Both Worlds for Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) involves identifying and consistently tracking\nobjects across video sequences. Traditional tracking-by-detection methods,\nwhile effective, often require extensive tuning and lack generalizability. On\nthe other hand, segmentation mask-based methods are more generic but struggle\nwith tracking management, making them unsuitable for MOT. We propose a novel\napproach, McByte, which incorporates a temporally propagated segmentation mask\nas a strong association cue within a tracking-by-detection framework. By\ncombining bounding box and mask information, McByte enhances robustness and\ngeneralizability without per-sequence tuning. Evaluated on four benchmark\ndatasets - DanceTrack, MOT17, SoccerNet-tracking 2022, and KITTI-tracking -\nMcByte demonstrates performance gain in all cases examined. At the same time,\nit outperforms existing mask-based methods. Implementation code will be\nprovided upon acceptance.\n","authors":["Tomasz Stanczyk","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2409.14220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13818v3","updated":"2024-09-26T08:07:16Z","published":"2024-08-25T12:22:50Z","title":"HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images\n Using Deep Learning","summary":" The current standard for detecting human epidermal growth factor receptor 2\n(HER2) status in breast cancer patients relies on HER2 amplification,\nidentified through fluorescence in situ hybridization (FISH) or\nimmunohistochemistry (IHC). However, hematoxylin and eosin (H\\&E) tumor stains\nare more widely available, and accurately predicting HER2 status using H\\&E\ncould reduce costs and expedite treatment selection. Deep Learning algorithms\nfor H&E have shown effectiveness in predicting various cancer features and\nclinical outcomes, including moderate success in HER2 status prediction. In\nthis work, we employed a customized weak supervision classification technique\ncombined with MoCo-v2 contrastive learning to predict HER2 status. We trained\nour pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The\nCancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale\nSchool of Medicine are publicly available. Our pipeline achieved an Area Under\nthe Curve (AUC) of 0.85 across four different test folds. Additionally, we\ntested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2\nscore of 2+ and included corresponding HER2 status and FISH test results. These\ncases are considered equivocal for IHC, requiring an expensive FISH test on\ntheir IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81\non these challenging H&E slides. Reducing the need for FISH test can have\nsignificant implications in cancer treatment equity for underserved\npopulations.\n","authors":["Ardhendu Sekhar","Vrinda Goel","Garima Jain","Abhijeet Patil","Ravi Kant Gupta","Tripti Bameta","Swapnil Rane","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2408.13818v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v1","updated":"2024-09-26T08:03:19Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic dataset that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06190v2","updated":"2024-09-26T07:56:50Z","published":"2024-08-12T14:40:38Z","title":"FruitNeRF: A Unified Neural Radiance Field based Fruit Counting\n Framework","summary":" We introduce FruitNeRF, a unified novel fruit counting framework that\nleverages state-of-the-art view synthesis methods to count any fruit type\ndirectly in 3D. Our framework takes an unordered set of posed images captured\nby a monocular camera and segments fruit in each image. To make our system\nindependent of the fruit type, we employ a foundation model that generates\nbinary segmentation masks for any fruit. Utilizing both modalities, RGB and\nsemantic, we train a semantic neural radiance field. Through uniform volume\nsampling of the implicit Fruit Field, we obtain fruit-only point clouds. By\napplying cascaded clustering on the extracted point cloud, our approach\nachieves precise fruit count.The use of neural radiance fields provides\nsignificant advantages over conventional methods such as object tracking or\noptical flow, as the counting itself is lifted into 3D. Our method prevents\ndouble counting fruit and avoids counting irrelevant fruit.We evaluate our\nmethodology using both real-world and synthetic datasets. The real-world\ndataset consists of three apple trees with manually counted ground truths, a\nbenchmark apple dataset with one row and ground truth fruit location, while the\nsynthetic dataset comprises various fruit types including apple, plum, lemon,\npear, peach, and mango.Additionally, we assess the performance of fruit\ncounting using the foundation model compared to a U-Net.\n","authors":["Lukas Meyer","Andreas Gilson","Ute Schmid","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2408.06190v2.pdf","comment":"Project Page: https://meyerls.github.io/fruit_nerf/"},{"id":"http://arxiv.org/abs/2409.17610v1","updated":"2024-09-26T07:55:57Z","published":"2024-09-26T07:55:57Z","title":"ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context\n Information in Multi-Turn Multimodal Medical Dialogue","summary":" The rocketing prosperity of large language models (LLMs) in recent years has\nboosted the prevalence of vision-language models (VLMs) in the medical sector.\nIn our online medical consultation scenario, a doctor responds to the texts and\nimages provided by a patient in multiple rounds to diagnose her/his health\ncondition, forming a multi-turn multimodal medical dialogue format. Unlike\nhigh-quality images captured by professional equipment in traditional medical\nvisual question answering (Med-VQA), the images in our case are taken by\npatients' mobile phones. These images have poor quality control, with issues\nsuch as excessive background elements and the lesion area being significantly\noff-center, leading to degradation of vision-language alignment in the model\ntraining phase. In this paper, we propose ZALM3, a Zero-shot strategy to\nimprove vision-language ALignment in Multi-turn Multimodal Medical dialogue.\nSince we observe that the preceding text conversations before an image can\ninfer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to\nsummarize the keywords from the preceding context and a visual grounding model\nto extract the RoIs. The updated images eliminate unnecessary background noise\nand provide more effective vision-language alignment. To better evaluate our\nproposed method, we design a new subjective assessment metric for multi-turn\nunimodal/multimodal medical dialogue to provide a fine-grained performance\ncomparison. Our experiments across three different clinical departments\nremarkably demonstrate the efficacy of ZALM3 with statistical significance.\n","authors":["Zhangpu Li","Changhong Zou","Suxue Ma","Zhicheng Yang","Chen Du","Youbao Tang","Zhenjie Cao","Ning Zhang","Jui-Hsin Lai","Ruei-Sung Lin","Yuan Ni","Xingzhi Sun","Jing Xiao","Kai Zhang","Mei Han"],"pdf_url":"https://arxiv.org/pdf/2409.17610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17608v1","updated":"2024-09-26T07:48:20Z","published":"2024-09-26T07:48:20Z","title":"Appearance Blur-driven AutoEncoder and Motion-guided Memory Module for\n Video Anomaly Detection","summary":" Video anomaly detection (VAD) often learns the distribution of normal samples\nand detects the anomaly through measuring significant deviations, but the\nundesired generalization may reconstruct a few anomalies thus suppressing the\ndeviations. Meanwhile, most VADs cannot cope with cross-dataset validation for\nnew target domains, and few-shot methods must laboriously rely on model-tuning\nfrom the target domain to complete domain adaptation. To address these\nproblems, we propose a novel VAD method with a motion-guided memory module to\nachieve cross-dataset validation with zero-shot. First, we add Gaussian blur to\nthe raw appearance images, thereby constructing the global pseudo-anomaly,\nwhich serves as the input to the network. Then, we propose multi-scale residual\nchannel attention to deblur the pseudo-anomaly in normal samples. Next, memory\nitems are obtained by recording the motion features in the training phase,\nwhich are used to retrieve the motion features from the raw information in the\ntesting phase. Lastly, our method can ignore the blurred real anomaly through\nattention and rely on motion memory items to increase the normality gap between\nnormal and abnormal motion. Extensive experiments on three benchmark datasets\ndemonstrate the effectiveness of the proposed method. Compared with\ncross-domain methods, our method achieves competitive performance without\nadaptation during testing.\n","authors":["Jiahao Lyu","Minghua Zhao","Jing Hu","Xuewen Huang","Shuangli Du","Cheng Shi","Zhiyong Lv"],"pdf_url":"https://arxiv.org/pdf/2409.17608v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.03944v2","updated":"2024-09-26T07:47:50Z","published":"2024-07-22T03:56:27Z","title":"Improving Fast Adversarial Training Paradigm: An Example Taxonomy\n Perspective","summary":" While adversarial training is an effective defense method against adversarial\nattacks, it notably increases the training cost. To this end, fast adversarial\ntraining (FAT) is presented for efficient training and has become a hot\nresearch topic. However, FAT suffers from catastrophic overfitting, which leads\nto a performance drop compared with multi-step adversarial training. However,\nthe cause of catastrophic overfitting remains unclear and lacks exploration. In\nthis paper, we present an example taxonomy in FAT, which identifies that\ncatastrophic overfitting is caused by the imbalance between the inner and outer\noptimization in FAT. Furthermore, we investigated the impact of varying degrees\nof training loss, revealing a correlation between training loss and\ncatastrophic overfitting. Based on these observations, we redesign the loss\nfunction in FAT with the proposed dynamic label relaxation to concentrate the\nloss range and reduce the impact of misclassified examples. Meanwhile, we\nintroduce batch momentum initialization to enhance the diversity to prevent\ncatastrophic overfitting in an efficient manner. Furthermore, we also propose\nCatastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate\ntraining strategy for examples based on their loss degree. Our proposed method,\nnamed example taxonomy aware FAT (ETA), establishes an improved paradigm for\nFAT. Experiment results demonstrate our ETA achieves state-of-the-art\nperformance. Comprehensive experiments on four standard datasets demonstrate\nthe competitiveness of our proposed method.\n","authors":["Jie Gui","Chengze Jiang","Minjing Dong","Kun Tong","Xinli Shi","Yuan Yan Tang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03944v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.17605v1","updated":"2024-09-26T07:43:12Z","published":"2024-09-26T07:43:12Z","title":"Good Data Is All Imitation Learning Needs","summary":" In this paper, we address the limitations of traditional teacher-student\nmodels, imitation learning, and behaviour cloning in the context of\nAutonomous/Automated Driving Systems (ADS), where these methods often struggle\nwith incomplete coverage of real-world scenarios. To enhance the robustness of\nsuch models, we introduce the use of Counterfactual Explanations (CFEs) as a\nnovel data augmentation technique for end-to-end ADS. CFEs, by generating\ntraining samples near decision boundaries through minimal input modifications,\nlead to a more comprehensive representation of expert driver strategies,\nparticularly in safety-critical scenarios. This approach can therefore help\nimprove the model's ability to handle rare and challenging driving events, such\nas anticipating darting out pedestrians, ultimately leading to safer and more\ntrustworthy decision-making for ADS. Our experiments in the CARLA simulator\ndemonstrate that CF-Driver outperforms the current state-of-the-art method,\nachieving a higher driving score and lower infraction rates. Specifically,\nCF-Driver attains a driving score of 84.2, surpassing the previous best model\nby 15.02 percentage points. These results highlight the effectiveness of\nincorporating CFEs in training end-to-end ADS. To foster further research, the\nCF-Driver code is made publicly available.\n","authors":["Amir Samadi","Konstantinos Koufos","Kurt Debattista","Mehrdad Dianati"],"pdf_url":"https://arxiv.org/pdf/2409.17605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17601v1","updated":"2024-09-26T07:35:23Z","published":"2024-09-26T07:35:23Z","title":"TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for\n Multimodal Contrastive Learning","summary":" Pre-trained large models for multimodal contrastive learning, such as CLIP,\nhave been widely recognized in the industry as highly susceptible to\ndata-poisoned backdoor attacks. This poses significant risks to downstream\nmodel training. In response to such potential threats, finetuning offers a\nsimpler and more efficient defense choice compared to retraining large models\nwith augmented data. In the supervised learning domain, fine-tuning defense\nstrategies can achieve excellent defense performance. However, in the\nunsupervised and semi-supervised domain, we find that when CLIP faces some\ncomplex attack techniques, the existing fine-tuning defense strategy,\nCleanCLIP, has some limitations on defense performance. The synonym\nsubstitution of its text-augmentation is insufficient to enhance the text\nfeature space. To compensate for this weakness, we improve it by proposing a\nfine-grained \\textbf{T}ext \\textbf{A}lignment \\textbf{C}leaner (TA-Cleaner) to\ncut off feature connections of backdoor triggers. We randomly select a few\nsamples for positive and negative subtext generation at each epoch of\nCleanCLIP, and align the subtexts to the images to strengthen the text\nself-supervision. We evaluate the effectiveness of our TA-Cleaner against six\nattack algorithms and conduct comprehensive zero-shot classification tests on\nImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves\nstate-of-the-art defensiveness among finetuning-based defense techniques. Even\nwhen faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms\nCleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\\% and 63.88\\%,\nrespectively.\n","authors":["Yuan Xun","Siyuan Liang","Xiaojun Jia","Xinwei Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17597v1","updated":"2024-09-26T07:24:09Z","published":"2024-09-26T07:24:09Z","title":"Unifying Dimensions: A Linear Adaptive Approach to Lightweight Image\n Super-Resolution","summary":" Window-based transformers have demonstrated outstanding performance in\nsuper-resolution tasks due to their adaptive modeling capabilities through\nlocal self-attention (SA). However, they exhibit higher computational\ncomplexity and inference latency than convolutional neural networks. In this\npaper, we first identify that the adaptability of the Transformers is derived\nfrom their adaptive spatial aggregation and advanced structural design, while\ntheir high latency results from the computational costs and memory layout\ntransformations associated with the local SA. To simulate this aggregation\napproach, we propose an effective convolution-based linear focal separable\nattention (FSA), allowing for long-range dynamic modeling with linear\ncomplexity. Additionally, we introduce an effective dual-branch structure\ncombined with an ultra-lightweight information exchange module (IEM) to enhance\nthe aggregation of information by the Token Mixer. Finally, with respect to the\nstructure, we modify the existing spatial-gate-based feedforward neural\nnetworks by incorporating a self-gate mechanism to preserve high-dimensional\nchannel information, enabling the modeling of more complex relationships. With\nthese advancements, we construct a convolution-based Transformer framework\nnamed the linear adaptive mixer network (LAMNet). Extensive experiments\ndemonstrate that LAMNet achieves better performance than existing SA-based\nTransformer methods while maintaining the computational efficiency of\nconvolutional neural networks, which can achieve a \\(3\\times\\) speedup of\ninference time. The code will be publicly available at:\nhttps://github.com/zononhzy/LAMNet.\n","authors":["Zhenyu Hu","Wanjie Sun"],"pdf_url":"https://arxiv.org/pdf/2409.17597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17937v3","updated":"2024-09-26T07:23:49Z","published":"2024-03-26T17:59:58Z","title":"Efficient Video Object Segmentation via Modulated Cross-Attention Memory","summary":" Recently, transformer-based approaches have shown promising results for\nsemi-supervised video object segmentation. However, these approaches typically\nstruggle on long videos due to increased GPU memory demands, as they frequently\nexpand the memory bank every few frames. We propose a transformer-based\napproach, named MAVOS, that introduces an optimized and dynamic long-term\nmodulated cross-attention (MCA) memory to model temporal smoothness without\nrequiring frequent memory expansion. The proposed MCA effectively encodes both\nlocal and global features at various levels of granularity while efficiently\nmaintaining consistent speed regardless of the video length. Extensive\nexperiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017,\ndemonstrate the effectiveness of our proposed contributions leading to\nreal-time inference and markedly reduced memory demands without any degradation\nin segmentation accuracy on long videos. Compared to the best existing\ntransformer-based approach, our MAVOS increases the speed by 7.6x, while\nsignificantly reducing the GPU memory by 87% with comparable segmentation\nperformance on short and long video datasets. Notably on the LVOS dataset, our\nMAVOS achieves a J&F score of 63.3% while operating at 37 frames per second\n(FPS) on a single V100 GPU. Our code and models will be publicly available at:\nhttps://github.com/Amshaker/MAVOS.\n","authors":["Abdelrahman Shaker","Syed Talal Wasim","Martin Danelljan","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17937v3.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2409.17589v1","updated":"2024-09-26T07:12:04Z","published":"2024-09-26T07:12:04Z","title":"Improving Fast Adversarial Training via Self-Knowledge Guidance","summary":" Adversarial training has achieved remarkable advancements in defending\nagainst adversarial attacks. Among them, fast adversarial training (FAT) is\ngaining attention for its ability to achieve competitive robustness with fewer\ncomputing resources. Existing FAT methods typically employ a uniform strategy\nthat optimizes all training data equally without considering the influence of\ndifferent examples, which leads to an imbalanced optimization. However, this\nimbalance remains unexplored in the field of FAT. In this paper, we conduct a\ncomprehensive study of the imbalance issue in FAT and observe an obvious class\ndisparity regarding their performances. This disparity could be embodied from a\nperspective of alignment between clean and robust accuracy. Based on the\nanalysis, we mainly attribute the observed misalignment and disparity to the\nimbalanced optimization in FAT, which motivates us to optimize different\ntraining data adaptively to enhance robustness. Specifically, we take disparity\nand misalignment into consideration. First, we introduce self-knowledge guided\nregularization, which assigns differentiated regularization weights to each\nclass based on its training state, alleviating class disparity. Additionally,\nwe propose self-knowledge guided label relaxation, which adjusts label\nrelaxation according to the training accuracy, alleviating the misalignment and\nimproving robustness. By combining these methods, we formulate the\nSelf-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge\nduring training to enhance the adversarial robustness without compromising\ntraining efficiency. Extensive experiments on four standard datasets\ndemonstrate that the SKG-FAT improves the robustness and preserves competitive\nclean accuracy, outperforming the state-of-the-art methods.\n","authors":["Chengze Jiang","Junkai Wang","Minjing Dong","Jie Gui","Xinli Shi","Yuan Cao","Yuan Yan Tang","James Tin-Yau Kwok"],"pdf_url":"https://arxiv.org/pdf/2409.17589v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2409.17576v1","updated":"2024-09-26T06:46:40Z","published":"2024-09-26T06:46:40Z","title":"ID$^3$: Identity-Preserving-yet-Diversified Diffusion Models for\n Synthetic Face Recognition","summary":" Synthetic face recognition (SFR) aims to generate synthetic face datasets\nthat mimic the distribution of real face data, which allows for training face\nrecognition models in a privacy-preserving manner. Despite the remarkable\npotential of diffusion models in image generation, current diffusion-based SFR\nmodels struggle with generalization to real-world faces. To address this\nlimitation, we outline three key objectives for SFR: (1) promoting diversity\nacross identities (inter-class diversity), (2) ensuring diversity within each\nidentity by injecting various facial attributes (intra-class diversity), and\n(3) maintaining identity consistency within each identity group (intra-class\nidentity preservation). Inspired by these goals, we introduce a\ndiffusion-fueled SFR model termed $\\text{ID}^3$. $\\text{ID}^3$ employs an\nID-preserving loss to generate diverse yet identity-consistent facial\nappearances. Theoretically, we show that minimizing this loss is equivalent to\nmaximizing the lower bound of an adjusted conditional log-likelihood over\nID-preserving data. This equivalence motivates an ID-preserving sampling\nalgorithm, which operates over an adjusted gradient vector field, enabling the\ngeneration of fake face recognition datasets that approximate the distribution\nof real-world faces. Extensive experiments across five challenging benchmarks\nvalidate the advantages of $\\text{ID}^3$.\n","authors":["Shen Li","Jianqing Xu","Jiaying Wu","Miao Xiong","Ailin Deng","Jiazhen Ji","Yuge Huang","Wenjie Feng","Shouhong Ding","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2409.17576v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.12598v2","updated":"2024-09-26T06:31:25Z","published":"2024-08-22T17:59:01Z","title":"ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor\n Reconstruction","summary":" Neural implicit reconstruction via volume rendering has demonstrated its\neffectiveness in recovering dense 3D surfaces. However, it is non-trivial to\nsimultaneously recover meticulous geometry and preserve smoothness across\nregions with differing characteristics. To address this issue, previous methods\ntypically employ geometric priors, which are often constrained by the\nperformance of the prior models. In this paper, we propose ND-SDF, which learns\na Normal Deflection field to represent the angular deviation between the scene\nnormal and the prior normal. Unlike previous methods that uniformly apply\ngeometric priors on all samples, introducing significant bias in accuracy, our\nproposed normal deflection field dynamically learns and adapts the utilization\nof samples based on their specific characteristics, thereby improving both the\naccuracy and effectiveness of the model. Our method not only obtains smooth\nweakly textured regions such as walls and floors but also preserves the\ngeometric details of complex structures. In addition, we introduce a novel ray\nsampling strategy based on the deflection angle to facilitate the unbiased\nrendering process, which significantly improves the quality and accuracy of\nintricate surfaces, especially on thin structures. Consistent improvements on\nvarious challenging datasets demonstrate the superiority of our method.\n","authors":["Ziyu Tang","Weicai Ye","Yifan Wang","Di Huang","Hujun Bao","Tong He","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17566v1","updated":"2024-09-26T06:28:05Z","published":"2024-09-26T06:28:05Z","title":"Flexiffusion: Segment-wise Neural Architecture Search for Flexible\n Denoising Schedule","summary":" Diffusion models are cutting-edge generative models adept at producing\ndiverse, high-quality images. Despite their effectiveness, these models often\nrequire significant computational resources owing to their numerous sequential\ndenoising steps and the significant inference cost of each step. Recently,\nNeural Architecture Search (NAS) techniques have been employed to automatically\nsearch for faster generation processes. However, NAS for diffusion is\ninherently time-consuming as it requires estimating thousands of diffusion\nmodels to search for the optimal one. In this paper, we introduce Flexiffusion,\na novel training-free NAS paradigm designed to accelerate diffusion models by\nconcurrently optimizing generation steps and network structures. Specifically,\nwe partition the generation process into isometric step segments, each\nsequentially composed of a full step, multiple partial steps, and several null\nsteps. The full step computes all network blocks, while the partial step\ninvolves part of the blocks, and the null step entails no computation.\nFlexiffusion autonomously explores flexible step combinations for each segment,\nsubstantially reducing search costs and enabling greater acceleration compared\nto the state-of-the-art (SOTA) method for diffusion models. Our searched models\nreported speedup factors of $2.6\\times$ and $1.5\\times$ for the original\nLDM-4-G and the SOTA, respectively. The factors for Stable Diffusion V1.5 and\nthe SOTA are $5.1\\times$ and $2.0\\times$. We also verified the performance of\nFlexiffusion on multiple datasets, and positive experiment results indicate\nthat Flexiffusion can effectively reduce redundancy in diffusion models.\n","authors":["Hongtao Huang","Xiaojun Chang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.17566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17564v1","updated":"2024-09-26T06:27:15Z","published":"2024-09-26T06:27:15Z","title":"General Compression Framework for Efficient Transformer Object Tracking","summary":" Transformer-based trackers have established a dominant role in the field of\nvisual object tracking. While these trackers exhibit promising performance,\ntheir deployment on resource-constrained devices remains challenging due to\ninefficiencies. To improve the inference efficiency and reduce the computation\ncost, prior approaches have aimed to either design lightweight trackers or\ndistill knowledge from larger teacher models into more compact student\ntrackers. However, these solutions often sacrifice accuracy for speed. Thus, we\npropose a general model compression framework for efficient transformer object\ntracking, named CompressTracker, to reduce the size of a pre-trained tracking\nmodel into a lightweight tracker with minimal performance degradation. Our\napproach features a novel stage division strategy that segments the transformer\nlayers of the teacher model into distinct stages, enabling the student model to\nemulate each corresponding teacher stage more effectively. Additionally, we\nalso design a unique replacement training technique that involves randomly\nsubstituting specific stages in the student model with those from the teacher\nmodel, as opposed to training the student model in isolation. Replacement\ntraining enhances the student model's ability to replicate the teacher model's\nbehavior. To further forcing student model to emulate teacher model, we\nincorporate prediction guidance and stage-wise feature mimicking to provide\nadditional supervision during the teacher model's compression process. Our\nframework CompressTracker is structurally agnostic, making it compatible with\nany transformer architecture. We conduct a series of experiment to verify the\neffectiveness and generalizability of CompressTracker. Our CompressTracker-4\nwith 4 transformer layers, which is compressed from OSTrack, retains about 96%\nperformance on LaSOT (66.1% AUC) while achieves 2.17x speed up.\n","authors":["Lingyi Hong","Jinglun Li","Xinyu Zhou","Shilin Yan","Pinxue Guo","Kaixun Jiang","Zhaoyu Chen","Shuyong Gao","Wei Zhang","Hong Lu","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17560v1","updated":"2024-09-26T06:12:08Z","published":"2024-09-26T06:12:08Z","title":"Dynamic Subframe Splitting and Spatio-Temporal Motion Entangled Sparse\n Attention for RGB-E Tracking","summary":" Event-based bionic camera asynchronously captures dynamic scenes with high\ntemporal resolution and high dynamic range, offering potential for the\nintegration of events and RGB under conditions of illumination degradation and\nfast motion. Existing RGB-E tracking methods model event characteristics\nutilising attention mechanism of Transformer before integrating both\nmodalities. Nevertheless, these methods involve aggregating the event stream\ninto a single event frame, lacking the utilisation of the temporal information\ninherent in the event stream.Moreover, the traditional attention mechanism is\nwell-suited for dense semantic features, while the attention mechanism for\nsparse event features require revolution. In this paper, we propose a dynamic\nevent subframe splitting strategy to split the event stream into more\nfine-grained event clusters, aiming to capture spatio-temporal features that\ncontain motion cues. Based on this, we design an event-based sparse attention\nmechanism to enhance the interaction of event features in temporal and spatial\ndimensions. The experimental results indicate that our method outperforms\nexisting state-of-the-art methods on the FE240 and COESOT datasets, providing\nan effective processing manner for the event data.\n","authors":["Pengcheng Shao","Tianyang Xu","Xuefeng Zhu","Xiaojun Wu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2409.17560v1.pdf","comment":"15 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2409.16225v2","updated":"2024-09-26T06:04:21Z","published":"2024-09-24T16:38:41Z","title":"VideoPatchCore: An Effective Method to Memorize Normality for Video\n Anomaly Detection","summary":" Video anomaly detection (VAD) is a crucial task in video analysis and\nsurveillance within computer vision. Currently, VAD is gaining attention with\nmemory techniques that store the features of normal frames. The stored features\nare utilized for frame reconstruction, identifying an abnormality when a\nsignificant difference exists between the reconstructed and input frames.\nHowever, this approach faces several challenges due to the simultaneous\noptimization required for both the memory and encoder-decoder model. These\nchallenges include increased optimization difficulty, complexity of\nimplementation, and performance variability depending on the memory size. To\naddress these challenges,we propose an effective memory method for VAD, called\nVideoPatchCore. Inspired by PatchCore, our approach introduces a structure that\nprioritizes memory optimization and configures three types of memory tailored\nto the characteristics of video data. This method effectively addresses the\nlimitations of existing memory-based methods, achieving good performance\ncomparable to state-of-the-art methods. Furthermore, our method requires no\ntraining and is straightforward to implement, making VAD tasks more accessible.\nOur code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore.\n","authors":["Sunghyun Ahn","Youngwan Jo","Kijung Lee","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2409.16225v2.pdf","comment":"Accepted to ACCV 2024"},{"id":"http://arxiv.org/abs/2409.17555v1","updated":"2024-09-26T05:57:35Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code will be\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v1.pdf","comment":"Accepted to NeurIPS 2024. The source code will be available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2406.06911v3","updated":"2024-09-26T05:47:36Z","published":"2024-06-11T03:09:37Z","title":"AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising","summary":" Diffusion models have garnered significant interest from the community for\ntheir great generative ability across various applications. However, their\ntypical multi-step sequential-denoising nature gives rise to high cumulative\nlatency, thereby precluding the possibilities of parallel computation. To\naddress this, we introduce AsyncDiff, a universal and plug-and-play\nacceleration scheme that enables model parallelism across multiple devices. Our\napproach divides the cumbersome noise prediction model into multiple\ncomponents, assigning each to a different device. To break the dependency chain\nbetween these components, it transforms the conventional sequential denoising\ninto an asynchronous process by exploiting the high similarity between hidden\nstates in consecutive diffusion steps. Consequently, each component is\nfacilitated to compute in parallel on separate devices. The proposed strategy\nsignificantly reduces inference latency while minimally impacting the\ngenerative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff\nachieves a 2.7x speedup with negligible degradation and a 4.0x speedup with\nonly a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our\nexperiments also demonstrate that AsyncDiff can be readily applied to video\ndiffusion models with encouraging performances. The code is available at\nhttps://github.com/czg1225/AsyncDiff.\n","authors":["Zigeng Chen","Xinyin Ma","Gongfan Fang","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06911v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2312.05284v4","updated":"2024-09-26T05:41:56Z","published":"2023-12-08T12:48:53Z","title":"SlimSAM: 0.1% Data Makes Segment Anything Slim","summary":" Current approaches for compressing the Segment Anything Model (SAM) yield\ncommendable results, yet necessitate extensive data to train a new network from\nscratch. Employing conventional pruning techniques can remarkably reduce data\nrequirements but would suffer from a degradation in performance. To address\nthis challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM\ncompression method that achieves superior performance with extremely less\ntraining data. The essence of SlimSAM is encapsulated in the alternate slimming\nframework which effectively enhances knowledge inheritance under severely\nlimited training data availability and exceptional pruning ratio. Diverging\nfrom prior techniques, our framework progressively compresses the model by\nalternately pruning and distilling distinct, decoupled sub-structures.\nDisturbed Taylor pruning is also proposed to address the misalignment between\nthe pruning objective and training target, thereby boosting the\npost-distillation after pruning. SlimSAM yields significant performance\nimprovements while demanding over 10 times less training data than any other\nexisting compression methods. Even when compared to the original SAM, SlimSAM\nachieves approaching performance while reducing parameter counts to merely 1.4%\n(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training\ndata. The code is available at http://github.com/czg1225/SlimSAM.\n","authors":["Zigeng Chen","Gongfan Fang","Xinyin Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.05284v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17547v1","updated":"2024-09-26T05:33:30Z","published":"2024-09-26T05:33:30Z","title":"Triple Point Masking","summary":" Existing 3D mask learning methods encounter performance bottlenecks under\nlimited data, and our objective is to overcome this limitation. In this paper,\nwe introduce a triple point masking scheme, named TPM, which serves as a\nscalable framework for pre-training of masked autoencoders to achieve\nmulti-mask learning for 3D point clouds. Specifically, we augment the baselines\nwith two additional mask choices (i.e., medium mask and low mask) as our core\ninsight is that the recovery process of an object can manifest in diverse ways.\nPrevious high-masking schemes focus on capturing the global representation but\nlack the fine-grained recovery capability, so that the generated pre-trained\nweights tend to play a limited role in the fine-tuning process. With the\nsupport of the proposed TPM, available methods can exhibit more flexible and\naccurate completion capabilities, enabling the potential autoencoder in the\npre-training stage to consider multiple representations of a single 3D object.\nIn addition, an SVM-guided weight selection module is proposed to fill the\nencoder parameters for downstream networks with the optimal weight during the\nfine-tuning stage, maximizing linear accuracy and facilitating the acquisition\nof intricate representations for new objects. Extensive experiments show that\nthe four baselines equipped with the proposed TPM achieve comprehensive\nperformance improvements on various downstream tasks.\n","authors":["Jiaming Liu","Linghe Kong","Yue Wu","Maoguo Gong","Hao Li","Qiguang Miao","Wenping Ma","Can Qin"],"pdf_url":"https://arxiv.org/pdf/2409.17547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05769v2","updated":"2024-09-26T05:10:23Z","published":"2024-05-09T13:45:04Z","title":"Exploring Text-Guided Single Image Editing for Remote Sensing Images","summary":" Artificial intelligence generative content (AIGC) has significantly impacted\nimage generation in the field of remote sensing. However, the equally important\narea of remote sensing image (RSI) editing has not received sufficient\nattention. Deep learning based editing methods generally involve two sequential\nstages: generation and editing. During the generation stage, consistency in\ncontent and details between the original and edited images must be maintained,\nwhile in the editing stage, controllability and accuracy of the edits should be\nensured. For natural images, these challenges can be tackled by training\ngenerative backbones on large-scale benchmark datasets and using text guidance\nbased on vision-language models (VLMs). However, these previously effective\napproaches become less viable for RSIs due to two reasons: First, existing\ngenerative RSI benchmark datasets do not fully capture the diversity of remote\nsensing scenarios, particularly in terms of variations in sensors, object\ntypes, and resolutions. Consequently, the generalization capacity of the\ntrained backbone model is often inadequate for universal editing tasks on RSIs.\nSecond, the large spatial resolution of RSIs exacerbates the problem in VLMs\nwhere a single text semantic corresponds to multiple image semantics, leading\nto the introduction of incorrect semantics when using text to guide RSI\nediting. To solve above problems, this paper proposes a text-guided RSI editing\nmethod that is controllable but stable, and can be trained using only a single\nimage. It adopts a multi-scale training approach to preserve consistency\nwithout the need for training on extensive benchmark datasets, while leveraging\nRSI pre-trained VLMs and prompt ensembling (PE) to ensure accuracy and\ncontrollability in the text-guided editing process.\n","authors":["Fangzhou Han","Lingyu Si","Hongwei Dong","Lamei Zhang","Hao Chen","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2405.05769v2.pdf","comment":"14 pages, 14 figures, submitted to IEEE Transactions on Geoscience\n and Remote Sensing"},{"id":"http://arxiv.org/abs/2409.17533v1","updated":"2024-09-26T04:40:38Z","published":"2024-09-26T04:40:38Z","title":"CAMOT: Camera Angle-aware Multi-Object Tracking","summary":" This paper proposes CAMOT, a simple camera angle estimator for multi-object\ntracking to tackle two problems: 1) occlusion and 2) inaccurate distance\nestimation in the depth direction. Under the assumption that multiple objects\nare located on a flat plane in each video frame, CAMOT estimates the camera\nangle using object detection. In addition, it gives the depth of each object,\nenabling pseudo-3D MOT. We evaluated its performance by adding it to various 2D\nMOT methods on the MOT17 and MOT20 datasets and confirmed its effectiveness.\nApplying CAMOT to ByteTrack, we obtained 63.8% HOTA, 80.6% MOTA, and 78.5% IDF1\nin MOT17, which are state-of-the-art results. Its computational cost is\nsignificantly lower than the existing deep-learning-based depth estimators for\ntracking.\n","authors":["Felix Limanta","Kuniaki Uto","Koichi Shinoda"],"pdf_url":"https://arxiv.org/pdf/2409.17533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17531v1","updated":"2024-09-26T04:36:19Z","published":"2024-09-26T04:36:19Z","title":"SimVG: A Simple Framework for Visual Grounding with Decoupled\n Multi-modal Fusion","summary":" Visual grounding is a common vision task that involves grounding descriptive\nsentences to the corresponding regions of an image. Most existing methods use\nindependent image-text encoding and apply complex hand-crafted modules or\nencoder-decoder architectures for modal interaction and query reasoning.\nHowever, their performance significantly drops when dealing with complex\ntextual expressions. This is because the former paradigm only utilizes limited\ndownstream data to fit the multi-modal feature fusion. Therefore, it is only\neffective when the textual expressions are relatively simple. In contrast,\ngiven the wide diversity of textual expressions and the uniqueness of\ndownstream training data, the existing fusion module, which extracts multimodal\ncontent from a visual-linguistic context, has not been fully investigated. In\nthis paper, we present a simple yet robust transformer-based framework, SimVG,\nfor visual grounding. Specifically, we decouple visual-linguistic feature\nfusion from downstream tasks by leveraging existing multimodal pre-trained\nmodels and incorporating additional object tokens to facilitate deep\nintegration of downstream and pre-training tasks. Furthermore, we design a\ndynamic weight-balance distillation method in the multi-branch synchronous\nlearning process to enhance the representation capability of the simpler\nbranch. This branch only consists of a lightweight MLP, which simplifies the\nstructure and improves reasoning speed. Experiments on six widely used VG\ndatasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the\nsuperiority of SimVG. Finally, the proposed method not only achieves\nimprovements in efficiency and convergence speed but also attains new\nstate-of-the-art performance on these benchmarks. Codes and models will be\navailable at \\url{https://github.com/Dmmm1997/SimVG}.\n","authors":["Ming Dai","Lingfeng Yang","Yihao Xu","Zhenhua Feng","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17531v1.pdf","comment":"21pages, 11figures, NeurIPS2024"},{"id":"http://arxiv.org/abs/2409.17526v1","updated":"2024-09-26T04:27:44Z","published":"2024-09-26T04:27:44Z","title":"Drone Stereo Vision for Radiata Pine Branch Detection and Distance\n Measurement: Integrating SGBM and Segmentation Models","summary":" Manual pruning of radiata pine trees presents significant safety risks due to\ntheir substantial height and the challenging terrains in which they thrive. To\naddress these risks, this research proposes the development of a drone-based\npruning system equipped with specialized pruning tools and a stereo vision\ncamera, enabling precise detection and trimming of branches. Deep learning\nalgorithms, including YOLO and Mask R-CNN, are employed to ensure accurate\nbranch detection, while the Semi-Global Matching algorithm is integrated to\nprovide reliable distance estimation. The synergy between these techniques\nfacilitates the precise identification of branch locations and enables\nefficient, targeted pruning. Experimental results demonstrate that the combined\nimplementation of YOLO and SGBM enables the drone to accurately detect branches\nand measure their distances from the drone. This research not only improves the\nsafety and efficiency of pruning operations but also makes a significant\ncontribution to the advancement of drone technology in the automation of\nagricultural and forestry practices, laying a foundational framework for\nfurther innovations in environmental management.\n","authors":["Yida Lin","Bing Xue","Mengjie Zhang","Sam Schofield","Richard Green"],"pdf_url":"https://arxiv.org/pdf/2409.17526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17524v1","updated":"2024-09-26T04:23:17Z","published":"2024-09-26T04:23:17Z","title":"JoyType: A Robust Design for Multilingual Visual Text Creation","summary":" Generating images with accurately represented text, especially in non-Latin\nlanguages, poses a significant challenge for diffusion models. Existing\napproaches, such as the integration of hint condition diagrams via auxiliary\nnetworks (e.g., ControlNet), have made strides towards addressing this issue.\nHowever, diffusion models often fall short in tasks requiring controlled text\ngeneration, such as specifying particular fonts or producing text in small\nfonts. In this paper, we introduce a novel approach for multilingual visual\ntext creation, named JoyType, designed to maintain the font style of text\nduring the image generation process. Our methodology begins with assembling a\ntraining dataset, JoyType-1M, comprising 1 million pairs of data. Each pair\nincludes an image, its description, and glyph instructions corresponding to the\nfont style within the image. We then developed a text control network, Font\nControlNet, tasked with extracting font style information to steer the image\ngeneration. To further enhance our model's ability to maintain font style,\nnotably in generating small-font text, we incorporated a multi-layer OCR-aware\nloss into the diffusion process. This enhancement allows JoyType to direct text\nrendering using low-level descriptors. Our evaluations, based on both visual\nand accuracy metrics, demonstrate that JoyType significantly outperforms\nexisting state-of-the-art methods. Additionally, JoyType can function as a\nplugin, facilitating the creation of varied image styles in conjunction with\nother stable diffusion models on HuggingFace and CivitAI. Our project is\nopen-sourced on https://jdh-algo.github.io/JoyType/.\n","authors":["Chao Li","Chen Jiang","Xiaolong Liu","Jun Zhao","Guoxin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17524v1.pdf","comment":"Under Review at AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17523v1","updated":"2024-09-26T04:17:27Z","published":"2024-09-26T04:17:27Z","title":"EAGLE: Egocentric AGgregated Language-video Engine","summary":" The rapid evolution of egocentric video analysis brings new insights into\nunderstanding human activities and intentions from a first-person perspective.\nDespite this progress, the fragmentation in tasks like action recognition,\nprocedure learning, and moment retrieval, \\etc, coupled with inconsistent\nannotations and isolated model development, hinders a holistic interpretation\nof video content. In response, we introduce the EAGLE (Egocentric AGgregated\nLanguage-video Engine) model and the EAGLE-400K dataset to provide a unified\nframework that integrates various egocentric video understanding tasks.\nEAGLE-400K, the \\textit{first} large-scale instruction-tuning dataset tailored\nfor egocentric video, features 400K diverse samples to enhance a broad spectrum\nof tasks from activity recognition to procedure knowledge learning. Moreover,\nEAGLE, a strong video multimodal large language model (MLLM), is designed to\neffectively capture both spatial and temporal information. In addition, we\npropose a set of evaluation metrics designed to facilitate a thorough\nassessment of MLLM for egocentric video understanding. Our extensive\nexperiments demonstrate EAGLE's superior performance over existing models,\nhighlighting its ability to balance task-specific understanding with holistic\nvideo interpretation. With EAGLE, we aim to pave the way for research\nopportunities and practical applications in real-world scenarios.\n","authors":["Jing Bi","Yunlong Tang","Luchuan Song","Ali Vosoughi","Nguyen Nguyen","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17523v1.pdf","comment":"Accepted by ACMMM 24"},{"id":"http://arxiv.org/abs/2409.17519v1","updated":"2024-09-26T04:02:20Z","published":"2024-09-26T04:02:20Z","title":"Robotic Environmental State Recognition with Pre-Trained Vision-Language\n Models and Black-Box Optimization","summary":" In order for robots to autonomously navigate and operate in diverse\nenvironments, it is essential for them to recognize the state of their\nenvironment. On the other hand, the environmental state recognition has\ntraditionally involved distinct methods tailored to each state to be\nrecognized. In this study, we perform a unified environmental state recognition\nfor robots through the spoken language with pre-trained large-scale\nvision-language models. We apply Visual Question Answering and Image-to-Text\nRetrieval, which are tasks of Vision-Language Models. We show that with our\nmethod, it is possible to recognize not only whether a room door is\nopen/closed, but also whether a transparent door is open/closed and whether\nwater is running in a sink, without training neural networks or manual\nprogramming. In addition, the recognition accuracy can be improved by selecting\nappropriate texts from the set of prepared texts based on black-box\noptimization. For each state recognition, only the text set and its weighting\nneed to be changed, eliminating the need to prepare multiple different models\nand programs, and facilitating the management of source code and computer\nresource. We experimentally demonstrate the effectiveness of our method and\napply it to the recognition behavior on a mobile robot, Fetch.\n","authors":["Kento Kawaharazuka","Yoshiki Obinata","Naoaki Kanazawa","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2409.17519v1.pdf","comment":"Accepted at Advanced Robotics, website -\n https://haraduka.github.io/vlm-bbo/"},{"id":"http://arxiv.org/abs/2404.05705v2","updated":"2024-09-26T03:58:11Z","published":"2024-04-08T17:42:08Z","title":"Learning 3D-Aware GANs from Unposed Images with Template Feature Field","summary":" Collecting accurate camera poses of training images has been shown to well\nserve the learning of 3D-aware generative adversarial networks (GANs) yet can\nbe quite expensive in practice. This work targets learning 3D-aware GANs from\nunposed images, for which we propose to perform on-the-fly pose estimation of\ntraining images with a learned template feature field (TeFF). Concretely, in\naddition to a generative radiance field as in previous approaches, we ask the\ngenerator to also learn a field from 2D semantic features while sharing the\ndensity from the radiance field. Such a framework allows us to acquire a\ncanonical 3D feature template leveraging the dataset mean discovered by the\ngenerative model, and further efficiently estimate the pose parameters on real\ndata. Experimental results on various challenging datasets demonstrate the\nsuperiority of our approach over state-of-the-art alternatives from both the\nqualitative and the quantitative perspectives.\n","authors":["Xinya Chen","Hanlei Guo","Yanrui Bin","Shangzhan Zhang","Yuanbo Yang","Yue Wang","Yujun Shen","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2404.05705v2.pdf","comment":"https://XDimlab.github.io/TeFF"},{"id":"http://arxiv.org/abs/2409.17512v1","updated":"2024-09-26T03:47:34Z","published":"2024-09-26T03:47:34Z","title":"SCOMatch: Alleviating Overtrusting in Open-set Semi-supervised Learning","summary":" Open-set semi-supervised learning (OSSL) leverages practical open-set\nunlabeled data, comprising both in-distribution (ID) samples from seen classes\nand out-of-distribution (OOD) samples from unseen classes, for semi-supervised\nlearning (SSL). Prior OSSL methods initially learned the decision boundary\nbetween ID and OOD with labeled ID data, subsequently employing self-training\nto refine this boundary. These methods, however, suffer from the tendency to\novertrust the labeled ID data: the scarcity of labeled data caused the\ndistribution bias between the labeled samples and the entire ID data, which\nmisleads the decision boundary to overfit. The subsequent self-training\nprocess, based on the overfitted result, fails to rectify this problem. In this\npaper, we address the overtrusting issue by treating OOD samples as an\nadditional class, forming a new SSL process.\n Specifically, we propose SCOMatch, a novel OSSL method that 1) selects\nreliable OOD samples as new labeled data with an OOD memory queue and a\ncorresponding update strategy and 2) integrates the new SSL process into the\noriginal task through our Simultaneous Close-set and Open-set self-training.\nSCOMatch refines the decision boundary of ID and OOD classes across the entire\ndataset, thereby leading to improved results. Extensive experimental results\nshow that SCOMatch significantly outperforms the state-of-the-art methods on\nvarious benchmarks. The effectiveness is further verified through ablation\nstudies and visualization.\n","authors":["Zerun Wang","Liuyu Xiang","Lang Huang","Jiafeng Mao","Ling Xiao","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2409.17512v1.pdf","comment":"ECCV 2024 accepted"},{"id":"http://arxiv.org/abs/2409.17510v1","updated":"2024-09-26T03:40:12Z","published":"2024-09-26T03:40:12Z","title":"NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human\n Connectomes","summary":" Although modern imaging technologies allow us to study connectivity between\ntwo distinct brain regions in-vivo, an in-depth understanding of how anatomical\nstructure supports brain function and how spontaneous functional fluctuations\nemerge remarkable cognition is still elusive. Meanwhile, tremendous efforts\nhave been made in the realm of machine learning to establish the nonlinear\nmapping between neuroimaging data and phenotypic traits. However, the absence\nof neuroscience insight in the current approaches poses significant challenges\nin understanding cognitive behavior from transient neural activities. To\naddress this challenge, we put the spotlight on the coupling mechanism of\nstructural connectivity (SC) and functional connectivity (FC) by formulating\nsuch network neuroscience question into an expressive graph representation\nlearning problem for high-order topology. Specifically, we introduce the\nconcept of topological detour to characterize how a ubiquitous instance of FC\n(direct link) is supported by neural pathways (detour) physically wired by SC,\nwhich forms a cyclic loop interacted by brain structure and function. In the\nclich\\'e of machine learning, the multi-hop detour pathway underlying SC-FC\ncoupling allows us to devise a novel multi-head self-attention mechanism within\nTransformer to capture multi-modal feature representation from paired graphs of\nSC and FC. Taken together, we propose a biological-inspired deep model, coined\nas NeuroPath, to find putative connectomic feature representations from the\nunprecedented amount of neuroimages, which can be plugged into various\ndownstream applications such as task recognition and disease diagnosis. We have\nevaluated NeuroPath on large-scale public datasets including HCP and UK Biobank\nunder supervised and zero-shot learning, where the state-of-the-art performance\nby our NeuroPath indicates great potential in network neuroscience.\n","authors":["Ziquan Wei","Tingting Dan","Jiaqi Ding","Paul J Laurienti","Guorong Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17510v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.00591v4","updated":"2024-09-26T03:34:23Z","published":"2024-08-01T14:20:47Z","title":"Regional quality estimation for echocardiography using deep learning","summary":" Automatic estimation of cardiac ultrasound image quality can be beneficial\nfor guiding operators and ensuring the accuracy of clinical measurements.\nPrevious work often fails to distinguish the view correctness of the\nechocardiogram from the image quality. Additionally, previous studies only\nprovide a global image quality value, which limits their practical utility. In\nthis work, we developed and compared three methods to estimate image quality:\n1) classic pixel-based metrics like the generalized contrast-to-noise ratio\n(gCNR) on myocardial segments as region of interest and left ventricle lumen as\nbackground, obtained using a U-Net segmentation 2) local image coherence\nderived from a U-Net model that predicts coherence from B-Mode images 3) a deep\nconvolutional network that predicts the quality of each region directly in an\nend-to-end fashion. We evaluate each method against manual regional image\nquality annotations by three experienced cardiologists. The results indicate\npoor performance of the gCNR metric, with Spearman correlation to the\nannotations of rho = 0.24. The end-to-end learning model obtains the best\nresult, rho = 0.69, comparable to the inter-observer correlation, rho = 0.63.\nFinally, the coherence-based method, with rho = 0.58, outperformed the\nclassical metrics and is more generic than the end-to-end approach. The image\nquality prediction tool is available as an open source Python library at\nhttps://github.com/GillesVanDeVyver/arqee.\n","authors":["Gilles Van De Vyver","Svein-Erik Måsøy","Håvard Dalen","Bjørnar Leangen Grenne","Espen Holte","Sindre Hellum Olaisen","John Nyberg","Andreas Østvik","Lasse Løvstakken","Erik Smistad"],"pdf_url":"https://arxiv.org/pdf/2408.00591v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17508v1","updated":"2024-09-26T03:33:26Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization, recent advances primarily focus on improving the LLM\ncomponents, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector. Extensive ablation experiments\nvalidate the effectiveness of introducing CMoE under any configuration, with up\nto an average 8% performance gains. We further provide interpretation analysis\nof the tug-of-war problem from the perspective of gradient optimization and\nparameter statistics. Compared to previous state-of-the-art medical MLLMs,\nUni-Med achieves competitive or superior evaluation metrics on diverse tasks.\nCode, data and model will be soon available at GitHub.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21341v2","updated":"2024-09-26T03:31:28Z","published":"2024-07-31T05:15:24Z","title":"High-throughput 3D shape completion of potato tubers on a harvester","summary":" Potato yield is an important metric for farmers to further optimize their\ncultivation practices. Potato yield can be estimated on a harvester using an\nRGB-D camera that can estimate the three-dimensional (3D) volume of individual\npotato tubers. A challenge, however, is that the 3D shape derived from RGB-D\nimages is only partially completed, underestimating the actual volume. To\naddress this issue, we developed a 3D shape completion network, called CoRe++,\nwhich can complete the 3D shape from RGB-D images. CoRe++ is a deep learning\nnetwork that consists of a convolutional encoder and a decoder. The encoder\ncompresses RGB-D images into latent vectors that are used by the decoder to\ncomplete the 3D shape using the deep signed distance field network (DeepSDF).\nTo evaluate our CoRe++ network, we collected partial and complete 3D point\nclouds of 339 potato tubers on an operational harvester in Japan. On the 1425\nRGB-D images in the test set (representing 51 unique potato tubers), our\nnetwork achieved a completion accuracy of 2.8 mm on average. For volumetric\nestimation, the root mean squared error (RMSE) was 22.6 ml, and this was better\nthan the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml).\nWe found that the RMSE can be further reduced to 18.2 ml when performing the 3D\nshape completion in the center of the RGB-D image. With an average 3D shape\ncompletion time of 10 milliseconds per tuber, we can conclude that CoRe++ is\nboth fast and accurate enough to be implemented on an operational harvester for\nhigh-throughput potato yield estimation. Our method can also be applied to\nother tuber, fruit and vegetable crops, thereby enabling versatile, accurate\nand real-time yield monitoring in precision agriculture. Our code, network\nweights and dataset are publicly available at\nhttps://github.com/UTokyo-FieldPhenomics-Lab/corepp.git.\n","authors":["Pieter M. Blok","Federico Magistri","Cyrill Stachniss","Haozhou Wang","James Burridge","Wei Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21341v2.pdf","comment":"20 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.17503v1","updated":"2024-09-26T03:21:21Z","published":"2024-09-26T03:21:21Z","title":"Shape-intensity knowledge distillation for robust medical image\n segmentation","summary":" Many medical image segmentation methods have achieved impressive results.\nYet, most existing methods do not take into account the shape-intensity prior\ninformation. This may lead to implausible segmentation results, in particular\nfor images of unseen datasets. In this paper, we propose a novel approach to\nincorporate joint shape-intensity prior information into the segmentation\nnetwork. Specifically, we first train a segmentation network (regarded as the\nteacher network) on class-wise averaged training images to extract valuable\nshape-intensity information, which is then transferred to a student\nsegmentation network with the same network architecture as the teacher via\nknowledge distillation. In this way, the student network regarded as the final\nsegmentation model can effectively integrate the shape-intensity prior\ninformation, yielding more accurate segmentation results. Despite its\nsimplicity, experiments on five medical image segmentation tasks of different\nmodalities demonstrate that the proposed Shape-Intensity Knowledge Distillation\n(SIKD) consistently improves several baseline models (including recent MaxStyle\nand SAMed) under intra-dataset evaluation, and significantly improves the\ncross-dataset generalization ability. The code is available at\nhttps://github.com/whdong-whu/SIKD.\n","authors":["Wenhui Dong","Bo Du","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2401.04585v2","updated":"2024-09-26T02:53:15Z","published":"2024-01-09T14:42:49Z","title":"EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization\n of Diffusion Models","summary":" Diffusion models have achieved great success in image generation tasks\nthrough iterative noise estimation. However, the heavy denoising process and\ncomplex neural networks hinder their low-latency applications in real-world\nscenarios. Quantization can effectively reduce model complexity, and\npost-training quantization (PTQ), which does not require fine-tuning, is highly\npromising for compressing and accelerating diffusion models. Unfortunately, we\nfind that due to the highly dynamic distribution of activations in different\ndenoising steps, existing PTQ methods for diffusion models suffer from\ndistribution mismatch issues at both calibration sample level and\nreconstruction output level, which makes the performance far from satisfactory,\nespecially in low-bit cases. In this paper, we propose Enhanced Distribution\nAlignment for Post-Training Quantization of Diffusion Models (EDA-DM) to\naddress the above issues. Specifically, at the calibration sample level, we\nselect calibration samples based on the density and variety in the latent\nspace, thus facilitating the alignment of their distribution with the overall\nsamples; and at the reconstruction output level, we modify the loss of block\nreconstruction with the losses of layers, aligning the outputs of quantized\nmodel and full-precision model at different network granularity. Extensive\nexperiments demonstrate that EDA-DM significantly outperforms the existing PTQ\nmethods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and\ndifferent datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO).\n","authors":["Xuewen Liu","Zhikai Li","Junrui Xiao","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2401.04585v2.pdf","comment":"Code: http://github.com/BienLuky/EDA-DM"},{"id":"http://arxiv.org/abs/2409.17487v1","updated":"2024-09-26T02:49:51Z","published":"2024-09-26T02:49:51Z","title":"Learning Quantized Adaptive Conditions for Diffusion Models","summary":" The curvature of ODE trajectories in diffusion models hinders their ability\nto generate high-quality images in a few number of function evaluations (NFE).\nIn this paper, we propose a novel and effective approach to reduce trajectory\ncurvature by utilizing adaptive conditions. By employing a extremely\nlight-weight quantized encoder, our method incurs only an additional 1% of\ntraining parameters, eliminates the need for extra regularization terms, yet\nachieves significantly better sample quality. Our approach accelerates ODE\nsampling while preserving the downstream task image editing capabilities of SDE\ntechniques. Extensive experiments verify that our method can generate high\nquality results under extremely limited sampling costs. With only 6 NFE, we\nachieve 5.14 FID on CIFAR-10, 6.91 FID on FFHQ 64x64 and 3.10 FID on AFHQv2.\n","authors":["Yuchen Liang","Yuchuan Tian","Lei Yu","Huao Tang","Jie Hu","Xiangzhong Fang","Hanting Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17486v1","updated":"2024-09-26T02:48:15Z","published":"2024-09-26T02:48:15Z","title":"Global-Local Medical SAM Adaptor Based on Full Adaption","summary":" Emerging of visual language models, such as the segment anything model (SAM),\nhave made great breakthroughs in the field of universal semantic segmentation\nand significantly aid the improvements of medical image segmentation, in\nparticular with the help of Medical SAM adaptor (Med-SA). However, Med-SA still\ncan be improved, as it fine-tunes SAM in a partial adaption manner. To resolve\nthis problem, we present a novel global medical SAM adaptor (GMed-SA) with full\nadaption, which can adapt SAM globally. We further combine GMed-SA and Med-SA\nto propose a global-local medical SAM adaptor (GLMed-SA) to adapt SAM both\nglobally and locally. Extensive experiments have been performed on the\nchallenging public 2D melanoma segmentation dataset. The results show that\nGLMed-SA outperforms several state-of-the-art semantic segmentation methods on\nvarious evaluation metrics, demonstrating the superiority of our methods.\n","authors":["Meng Wang","Yarong Feng","Yongwei Tang","Tian Zhang","Yuxin Liang","Chao Lv"],"pdf_url":"https://arxiv.org/pdf/2409.17486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17485v1","updated":"2024-09-26T02:47:41Z","published":"2024-09-26T02:47:41Z","title":"Revisiting Deep Ensemble Uncertainty for Enhanced Medical Anomaly\n Detection","summary":" Medical anomaly detection (AD) is crucial in pathological identification and\nlocalization. Current methods typically rely on uncertainty estimation in deep\nensembles to detect anomalies, assuming that ensemble learners should agree on\nnormal samples while exhibiting disagreement on unseen anomalies in the output\nspace. However, these methods may suffer from inadequate disagreement on\nanomalies or diminished agreement on normal samples. To tackle these issues, we\npropose D2UE, a Diversified Dual-space Uncertainty Estimation framework for\nmedical anomaly detection. To effectively balance agreement and disagreement\nfor anomaly detection, we propose Redundancy-Aware Repulsion (RAR), which uses\na similarity kernel that remains invariant to both isotropic scaling and\northogonal transformations, explicitly promoting diversity in learners' feature\nspace. Moreover, to accentuate anomalous regions, we develop Dual-Space\nUncertainty (DSU), which utilizes the ensemble's uncertainty in input and\noutput spaces. In input space, we first calculate gradients of reconstruction\nerror with respect to input images. The gradients are then integrated with\nreconstruction outputs to estimate uncertainty for inputs, enabling effective\nanomaly discrimination even when output space disagreement is minimal. We\nconduct a comprehensive evaluation of five medical benchmarks with different\nbackbones. Experimental results demonstrate the superiority of our method to\nstate-of-the-art methods and the effectiveness of each component in our\nframework. Our code is available at https://github.com/Rubiscol/D2UE.\n","authors":["Yi Gu","Yi Lin","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17485v1.pdf","comment":"Early accepted by MICCAI2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.18110v1","updated":"2024-09-26T17:52:57Z","published":"2024-09-26T17:52:57Z","title":"Open-World Evaluation for Retrieving Diverse Perspectives","summary":" We study retrieving a set of documents that covers various perspectives on a\ncomplex and contentious question (e.g., will ChatGPT do more harm than good?).\nWe curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS),\nwhere each example consists of a question and diverse perspectives associated\nwith the question, sourced from survey questions and debate websites. On this\ndata, retrievers paired with a corpus are evaluated to surface a document set\nthat contains diverse perspectives. Our framing diverges from most retrieval\ntasks in that document relevancy cannot be decided by simple string matches to\nreferences. Instead, we build a language model based automatic evaluator that\ndecides whether each retrieved document contains a perspective. This allows us\nto evaluate the performance of three different types of corpus (Wikipedia, web\nsnapshot, and corpus constructed on the fly with retrieved pages from the\nsearch engine) paired with retrievers. Retrieving diverse documents remains\nchallenging, with the outputs from existing retrievers covering all\nperspectives on only 33.74% of the examples. We further study the impact of\nquery expansion and diversity-focused reranking approaches and analyze\nretriever sycophancy. Together, we lay the foundation for future studies in\nretrieval diversity handling complex queries.\n","authors":["Hung-Ting Chen","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2409.18110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18024v1","updated":"2024-09-26T16:32:10Z","published":"2024-09-26T16:32:10Z","title":"Report on the Workshop on Simulations for Information Access (Sim4IA\n 2024) at SIGIR 2024","summary":" This paper is a report of the Workshop on Simulations for Information Access\n(Sim4IA) workshop at SIGIR 2024. The workshop had two keynotes, a panel\ndiscussion, nine lightning talks, and two breakout sessions. Key takeaways were\nuser simulation's importance in academia and industry, the possible bridging of\nonline and offline evaluation, and the issues of organizing a companion shared\ntask around user simulations for information access. We report on how we\norganized the workshop, provide a brief overview of what happened at the\nworkshop, and summarize the main topics and findings of the workshop and future\nwork.\n","authors":["Timo Breuer","Christin Katharina Kreutz","Norbert Fuhr","Krisztian Balog","Philipp Schaer","Nolwenn Bernard","Ingo Frommholz","Marcel Gohsen","Kaixin Ji","Gareth J. F. Jones","Jüri Keller","Jiqun Liu","Martin Mladenov","Gabriella Pasi","Johanne Trippas","Xi Wang","Saber Zerhoudi","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.18024v1.pdf","comment":"Preprint of a SIGIR Forum submission for Vol. 58 No. 2 - December\n 2024"},{"id":"http://arxiv.org/abs/2409.18003v1","updated":"2024-09-26T16:12:33Z","published":"2024-09-26T16:12:33Z","title":"Enhancing Tourism Recommender Systems for Sustainable City Trips Using\n Retrieval-Augmented Generation","summary":" Tourism Recommender Systems (TRS) have traditionally focused on providing\npersonalized travel suggestions, often prioritizing user preferences without\nconsidering broader sustainability goals. Integrating sustainability into TRS\nhas become essential with the increasing need to balance environmental impact,\nlocal community interests, and visitor satisfaction. This paper proposes a\nnovel approach to enhancing TRS for sustainable city trips using Large Language\nModels (LLMs) and a modified Retrieval-Augmented Generation (RAG) pipeline. We\nenhance the traditional RAG system by incorporating a sustainability metric\nbased on a city's popularity and seasonal demand during the prompt augmentation\nphase. This modification, called Sustainability Augmented Reranking (SAR),\nensures the system's recommendations align with sustainability goals.\nEvaluations using popular open-source LLMs, such as Llama-3.1-Instruct-8B and\nMistral-Instruct-7B, demonstrate that the SAR-enhanced approach consistently\nmatches or outperforms the baseline (without SAR) across most metrics,\nhighlighting the benefits of incorporating sustainability into TRS.\n","authors":["Ashmi Banerjee","Adithi Satish","Wolfgang Wörndl"],"pdf_url":"https://arxiv.org/pdf/2409.18003v1.pdf","comment":"Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM\n Conference on Recommender Systems (RecSys 2024)"},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17769v1","updated":"2024-09-26T12:07:46Z","published":"2024-09-26T12:07:46Z","title":"Value Identification in Multistakeholder Recommender Systems for\n Humanities and Historical Research: The Case of the Digital Archive\n Monasterium.net","summary":" Recommender systems remain underutilized in humanities and historical\nresearch, despite their potential to enhance the discovery of cultural records.\nThis paper offers an initial value identification of the multiple stakeholders\nthat might be impacted by recommendations in Monasterium.net, a digital archive\nfor historical legal documents. Specifically, we discuss the diverse values and\nobjectives of its stakeholders, such as editors, aggregators, platform owners,\nresearchers, publishers, and funding agencies. These in-depth insights into the\npotentially conflicting values of stakeholder groups allow designing and\nadapting recommender systems to enhance their usefulness for humanities and\nhistorical research. Additionally, our findings will support deeper engagement\nwith additional stakeholders to refine value models and evaluation metrics for\nrecommender systems in the given domains. Our conclusions are embedded in and\napplicable to other digital archives and a broader cultural heritage context.\n","authors":["Florian Atzenhofer-Baumgartner","Bernhard C. Geiger","Georg Vogeler","Dominik Kowald"],"pdf_url":"https://arxiv.org/pdf/2409.17769v1.pdf","comment":"To be presented at: NORMalize 2024: The Second Workshop on the\n Normative Design and Evaluation of Recommender Systems, October 18, 2024,\n co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024),\n Bari, Italy"},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17730v1","updated":"2024-09-26T11:00:19Z","published":"2024-09-26T11:00:19Z","title":"Autoregressive Generation Strategies for Top-K Sequential\n Recommendations","summary":" The goal of modern sequential recommender systems is often formulated in\nterms of next-item prediction. In this paper, we explore the applicability of\ngenerative transformer-based models for the Top-K sequential recommendation\ntask, where the goal is to predict items a user is likely to interact with in\nthe \"near future\".\n We explore commonly used autoregressive generation strategies, including\ngreedy decoding, beam search, and temperature sampling, to evaluate their\nperformance for the Top-K sequential recommendation task. In addition, we\npropose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA)\ngeneration strategies based on multi-sequence generation with temperature\nsampling and subsequent aggregation.\n Experiments on diverse datasets give valuable insights regarding commonly\nused strategies' applicability and show that suggested approaches improve\nperformance on longer time horizons compared to widely-used Top-K prediction\napproach and single-sequence autoregressive generation strategies.\n","authors":["Anna Volodkevich","Danil Gusak","Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2409.17730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17711v1","updated":"2024-09-26T10:27:19Z","published":"2024-09-26T10:27:19Z","title":"Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation","summary":" News recommendation is a challenging task that involves personalization based\non the interaction history and preferences of each user. Recent works have\nleveraged the power of pretrained language models (PLMs) to directly rank news\nitems by using inference approaches that predominately fall into three\ncategories: pointwise, pairwise, and listwise learning-to-rank. While pointwise\nmethods offer linear inference complexity, they fail to capture crucial\ncomparative information between items that is more effective for ranking tasks.\nConversely, pairwise and listwise approaches excel at incorporating these\ncomparisons but suffer from practical limitations: pairwise approaches are\neither computationally expensive or lack theoretical guarantees, and listwise\nmethods often perform poorly in practice. In this paper, we propose a novel\nframework for PLM-based news recommendation that integrates both pointwise\nrelevance prediction and pairwise comparisons in a scalable manner. We present\na rigorous theoretical analysis of our framework, establishing conditions under\nwhich our approach guarantees improved performance. Extensive experiments show\nthat our approach outperforms the state-of-the-art methods on the MIND and\nAdressa news recommendation datasets.\n","authors":["Nithish Kannen","Yao Ma","Gerrit J. J. van den Burg","Jean Baptiste Faddoul"],"pdf_url":"https://arxiv.org/pdf/2409.17711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15953v2","updated":"2024-09-26T10:22:34Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing sequences of interactions between users and items, sequential\nrecommendation models can learn user intent and make predictions about the next\nitem. Next to item interactions, most systems also have interactions with what\nwe call non-item pages: these pages are not related to specific items but still\ncan provide insights of the user's interests, as, for example, navigation\npages.\n We therefore propose a general way to include these non-item pages in\nsequential recommendation models to enhance next-item prediction. First, we\ndemonstrate the influence of non-item pages on following interactions with the\nhypotheses testing framework HypTrails and propose methods for representing\nnon-item pages in sequential recommendation models. Subsequently, we adapt\npopular sequential recommender models to integrate non-item pages and\ninvestigate their performance with different item representation strategies as\nwell as their ability to handle noisy data. To show the general capabilities of\nthe models to integrate non-item pages, we create a synthetic dataset for a\ncontrolled setting and then evaluate the improvements from including non-item\npages on two real-world datasets.\n Our results show that non-item pages are a valuable source of information,\nand incorporating them in sequential recommendation models increases the\nperformance of next-item prediction across all analyzed model architectures.\n","authors":["Elisabeth Fischer","Albin Zehe","Andreas Hotho","Daniel Schlör"],"pdf_url":"https://arxiv.org/pdf/2408.15953v2.pdf","comment":"37 pages, 19 figures; Submitted to ACM TORS"},{"id":"http://arxiv.org/abs/2409.17580v1","updated":"2024-09-26T06:53:29Z","published":"2024-09-26T06:53:29Z","title":"Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case\n Study","summary":" Extracting meaningful insights from large and complex datasets poses\nsignificant challenges, particularly in ensuring the accuracy and relevance of\nretrieved information. Traditional data retrieval methods such as sequential\nsearch and index-based retrieval often fail when handling intricate and\ninterconnected data structures, resulting in incomplete or misleading outputs.\nTo overcome these limitations, we introduce Structured-GraphRAG, a versatile\nframework designed to enhance information retrieval across structured datasets\nin natural language queries. Structured-GraphRAG utilizes multiple knowledge\ngraphs, which represent data in a structured format and capture complex\nrelationships between entities, enabling a more nuanced and comprehensive\nretrieval of information. This graph-based approach reduces the risk of errors\nin language model outputs by grounding responses in a structured format,\nthereby enhancing the reliability of results. We demonstrate the effectiveness\nof Structured-GraphRAG by comparing its performance with that of a recently\npublished method using traditional retrieval-augmented generation. Our findings\nshow that Structured-GraphRAG significantly improves query processing\nefficiency and reduces response times. While our case study focuses on soccer\ndata, the framework's design is broadly applicable, offering a powerful tool\nfor data analysis and enhancing language model applications across various\nstructured domains.\n","authors":["Zahra Sepasdar","Sushant Gautam","Cise Midoglu","Michael A. Riegler","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2409.17580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05013v2","updated":"2024-09-26T06:19:34Z","published":"2024-06-07T15:23:53Z","title":"CHIQ: Contextual History Enhancement for Improving Query Rewriting in\n Conversational Search","summary":" In this paper, we study how open-source large language models (LLMs) can be\neffectively deployed for improving query rewriting in conversational search,\nespecially for ambiguous queries. We introduce CHIQ, a two-step method that\nleverages the capabilities of LLMs to resolve ambiguities in the conversation\nhistory before query rewriting. This approach contrasts with prior studies that\npredominantly use closed-source LLMs to directly generate search queries from\nconversation history. We demonstrate on five well-established benchmarks that\nCHIQ leads to state-of-the-art results across most settings, showing highly\ncompetitive performances with systems leveraging closed-source LLMs. Our study\nprovides a first step towards leveraging open-source LLMs in conversational\nsearch, as a competitive alternative to the prevailing reliance on commercial\nLLMs. Data, models, and source code will be publicly available upon acceptance\nat https://github.com/fengranMark/CHIQ.\n","authors":["Fengran Mo","Abbas Ghaddar","Kelong Mao","Mehdi Rezagholizadeh","Boxing Chen","Qun Liu","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2406.05013v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.15763v2","updated":"2024-09-26T05:43:08Z","published":"2024-09-24T05:39:53Z","title":"IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through\n Semantic Comprehension in Retrieval-Augmented Generation Scenarios","summary":" In Retrieval-Augmented Generation (RAG) tasks using Large Language Models\n(LLMs), the quality of retrieved information is critical to the final output.\nThis paper introduces the IRSC benchmark for evaluating the performance of\nembedding models in multilingual RAG tasks. The benchmark encompasses five\nretrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval,\nkeyword retrieval, and summary retrieval. Our research addresses the current\nlack of comprehensive testing and effective comparison methods for embedding\nmodels in RAG scenarios. We introduced new metrics: the Similarity of Semantic\nComprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI),\nand evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our\ncontributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and\n3) insights into the cross-lingual limitations of embedding models. The IRSC\nbenchmark aims to enhance the understanding and development of accurate\nretrieval systems in RAG tasks. All code and datasets are available at:\nhttps://github.com/Jasaxion/IRSC_Benchmark\n","authors":["Hai Lin","Shaoxiong Zhan","Junyou Su","Haitao Zheng","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10743v4","updated":"2024-09-26T03:38:59Z","published":"2023-12-17T15:28:06Z","title":"A Unified Framework for Multi-Domain CTR Prediction via Large Language\n Models","summary":" Click-Through Rate (CTR) prediction is a crucial task in online\nrecommendation platforms as it involves estimating the probability of user\nengagement with advertisements or items by clicking on them. Given the\navailability of various services like online shopping, ride-sharing, food\ndelivery, and professional services on commercial platforms, recommendation\nsystems in these platforms are required to make CTR predictions across multiple\ndomains rather than just a single domain. However, multi-domain click-through\nrate (MDCTR) prediction remains a challenging task in online recommendation due\nto the complex mutual influence between domains. Traditional MDCTR models\ntypically encode domains as discrete identifiers, ignoring rich semantic\ninformation underlying. Consequently, they can hardly generalize to new\ndomains. Besides, existing models can be easily dominated by some specific\ndomains, which results in significant performance drops in the other domains\n(i.e. the \"seesaw phenomenon\"). In this paper, we propose a novel solution\nUni-CTR to address the above challenges. Uni-CTR leverages a backbone Large\nLanguage Model (LLM) to learn layer-wise semantic representations that capture\ncommonalities between domains. Uni-CTR also uses several domain-specific\nnetworks to capture the characteristics of each domain. Note that we design a\nmasked loss strategy so that these domain-specific networks are decoupled from\nbackbone LLM. This allows domain-specific networks to remain unchanged when\nincorporating new or removing domains, thereby enhancing the flexibility and\nscalability of the system significantly. Experimental results on three public\ndatasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models\nsignificantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in\nzero-shot prediction. We have applied Uni-CTR in industrial scenarios,\nconfirming its efficiency.\n","authors":["Zichuan Fu","Xiangyang Li","Chuhan Wu","Yichao Wang","Kuicai Dong","Xiangyu Zhao","Mengchen Zhao","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2312.10743v4.pdf","comment":"Accept By ACM TRANSACTIONS ON INFORMATION SYSTEMS(TOIS)"},{"id":"http://arxiv.org/abs/2409.17476v1","updated":"2024-09-26T02:24:03Z","published":"2024-09-26T02:24:03Z","title":"Improving the Shortest Plank: Vulnerability-Aware Adversarial Training\n for Robust Recommender System","summary":" Recommender systems play a pivotal role in mitigating information overload in\nvarious fields. Nonetheless, the inherent openness of these systems introduces\nvulnerabilities, allowing attackers to insert fake users into the system's\ntraining data to skew the exposure of certain items, known as poisoning\nattacks. Adversarial training has emerged as a notable defense mechanism\nagainst such poisoning attacks within recommender systems. Existing adversarial\ntraining methods apply perturbations of the same magnitude across all users to\nenhance system robustness against attacks. Yet, in reality, we find that\nattacks often affect only a subset of users who are vulnerable. These\nperturbations of indiscriminate magnitude make it difficult to balance\neffective protection for vulnerable users without degrading recommendation\nquality for those who are not affected. To address this issue, our research\ndelves into understanding user vulnerability. Considering that poisoning\nattacks pollute the training data, we note that the higher degree to which a\nrecommender system fits users' training data correlates with an increased\nlikelihood of users incorporating attack information, indicating their\nvulnerability. Leveraging these insights, we introduce the Vulnerability-aware\nAdversarial Training (VAT), designed to defend against poisoning attacks in\nrecommender systems. VAT employs a novel vulnerability-aware function to\nestimate users' vulnerability based on the degree to which the system fits\nthem. Guided by this estimation, VAT applies perturbations of adaptive\nmagnitude to each user, not only reducing the success ratio of attacks but also\npreserving, and potentially enhancing, the quality of recommendations.\nComprehensive experiments confirm VAT's superior defensive capabilities across\ndifferent recommendation models and against various types of attacks.\n","authors":["Kaike Zhang","Qi Cao","Yunfan Wu","Fei Sun","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2409.17476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17460v1","updated":"2024-09-26T01:38:05Z","published":"2024-09-26T01:38:05Z","title":"Towards More Relevant Product Search Ranking Via Large Language Models:\n An Empirical Study","summary":" Training Learning-to-Rank models for e-commerce product search ranking can be\nchallenging due to the lack of a gold standard of ranking relevance. In this\npaper, we decompose ranking relevance into content-based and engagement-based\naspects, and we propose to leverage Large Language Models (LLMs) for both label\nand feature generation in model training, primarily aiming to improve the\nmodel's predictive capability for content-based relevance. Additionally, we\nintroduce different sigmoid transformations on the LLM outputs to polarize\nrelevance scores in labeling, enhancing the model's ability to balance\ncontent-based and engagement-based relevances and thus prioritize highly\nrelevant items overall. Comprehensive online tests and offline evaluations are\nalso conducted for the proposed design. Our work sheds light on advanced\nstrategies for integrating LLMs into e-commerce product search ranking model\ntraining, offering a pathway to more effective and balanced models with\nimproved ranking relevance.\n","authors":["Qi Liu","Atul Singh","Jingbo Liu","Cun Mu","Zheng Yan"],"pdf_url":"https://arxiv.org/pdf/2409.17460v1.pdf","comment":"To be published in CIKM 2024 GenAIECommerce Workshop"},{"id":"http://arxiv.org/abs/2409.17456v1","updated":"2024-09-26T01:18:29Z","published":"2024-09-26T01:18:29Z","title":"Long or Short or Both? An Exploration on Lookback Time Windows of\n Behavioral Features in Product Search Ranking","summary":" Customer shopping behavioral features are core to product search ranking\nmodels in eCommerce. In this paper, we investigate the effect of lookback time\nwindows when aggregating these features at the (query, product) level over\nhistory. By studying the pros and cons of using long and short time windows, we\npropose a novel approach to integrating these historical behavioral features of\ndifferent time windows. In particular, we address the criticality of using\nquery-level vertical signals in ranking models to effectively aggregate all\ninformation from different behavioral features. Anecdotal evidence for the\nproposed approach is also provided using live product search traffic on\nWalmart.com.\n","authors":["Qi Liu","Atul Singh","Jingbo Liu","Cun Mu","Zheng Yan","Jan Pedersen"],"pdf_url":"https://arxiv.org/pdf/2409.17456v1.pdf","comment":"Published in ACM SIGIR Workshop on eCommerce 2024"},{"id":"http://arxiv.org/abs/2409.17436v1","updated":"2024-09-26T00:08:46Z","published":"2024-09-26T00:08:46Z","title":"Minimizing Live Experiments in Recommender Systems: User Simulation to\n Evaluate Preference Elicitation Policies","summary":" Evaluation of policies in recommender systems typically involves A/B testing\nusing live experiments on real users to assess a new policy's impact on\nrelevant metrics. This ``gold standard'' comes at a high cost, however, in\nterms of cycle time, user cost, and potential user retention. In developing\npolicies for ``onboarding'' new users, these costs can be especially\nproblematic, since on-boarding occurs only once. In this work, we describe a\nsimulation methodology used to augment (and reduce) the use of live\nexperiments. We illustrate its deployment for the evaluation of ``preference\nelicitation'' algorithms used to onboard new users of the YouTube Music\nplatform. By developing counterfactually robust user behavior models, and a\nsimulation service that couples such models with production infrastructure, we\nare able to test new algorithms in a way that reliably predicts their\nperformance on key metrics when deployed live. We describe our domain, our\nsimulation models and platform, results of experiments and deployment, and\nsuggest future steps needed to further realistic simulation as a powerful\ncomplement to live experiments.\n","authors":["Chih-Wei Hsu","Martin Mladenov","Ofer Meshi","James Pine","Hubert Pham","Shane Li","Xujian Liang","Anton Polishko","Li Yang","Ben Scheetz","Craig Boutilier"],"pdf_url":"https://arxiv.org/pdf/2409.17436v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2310.13387v2","updated":"2024-09-26T17:55:48Z","published":"2023-10-20T09:56:07Z","title":"Assumption violations in causal discovery and the robustness of score\n matching","summary":" When domain knowledge is limited and experimentation is restricted by\nethical, financial, or time constraints, practitioners turn to observational\ncausal discovery methods to recover the causal structure, exploiting the\nstatistical properties of their data. Because causal discovery without further\nassumptions is an ill-posed problem, each algorithm comes with its own set of\nusually untestable assumptions, some of which are hard to meet in real\ndatasets. Motivated by these considerations, this paper extensively benchmarks\nthe empirical performance of recent causal discovery methods on observational\ni.i.d. data generated under different background conditions, allowing for\nviolations of the critical assumptions required by each selected approach. Our\nexperimental findings show that score matching-based methods demonstrate\nsurprising performance in the false positive and false negative rate of the\ninferred graph in these challenging scenarios, and we provide theoretical\ninsights into their performance. This work is also the first effort to\nbenchmark the stability of causal discovery algorithms with respect to the\nvalues of their hyperparameters. Finally, we hope this paper will set a new\nstandard for the evaluation of causal discovery methods and can serve as an\naccessible entry point for practitioners interested in the field, highlighting\nthe empirical implications of different algorithm choices.\n","authors":["Francesco Montagna","Atalanti A. Mastakouri","Elias Eulig","Nicoletta Noceti","Lorenzo Rosasco","Dominik Janzing","Bryon Aragam","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2310.13387v2.pdf","comment":"37th Conference on Neural Information Processing Systems (NeurIPS\n 2023)"},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18102v1","updated":"2024-09-26T17:45:10Z","published":"2024-09-26T17:45:10Z","title":"MALPOLON: A Framework for Deep Species Distribution Modeling","summary":" This paper describes a deep-SDM framework, MALPOLON. Written in Python and\nbuilt upon the PyTorch library, this framework aims to facilitate training and\ninferences of deep species distribution models (deep-SDM) and sharing for users\nwith only general Python language skills (e.g., modeling ecologists) who are\ninterested in testing deep learning approaches to build new SDMs. More advanced\nusers can also benefit from the framework's modularity to run more specific\nexperiments by overriding existing classes while taking advantage of\npress-button examples to train neural networks on multiple classification tasks\nusing custom or provided raw and pre-processed datasets. The framework is\nopen-sourced on GitHub and PyPi along with extensive documentation and examples\nof use in various scenarios. MALPOLON offers straightforward installation,\nYAML-based configuration, parallel computing, multi-GPU utilization, baseline\nand foundational models for benchmarking, and extensive\ntutorials/documentation, aiming to enhance accessibility and performance\nscalability for ecologists and researchers.\n","authors":["Theo Larcher","Lukas Picek","Benjamin Deneu","Titouan Lorieul","Maximilien Servajean","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2409.18102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18100v1","updated":"2024-09-26T17:44:29Z","published":"2024-09-26T17:44:29Z","title":"Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine\n Segmentation","summary":" Self-supervised pretraining (SSP) has shown promising results in learning\nfrom large unlabeled datasets and, thus, could be useful for automated\ncardiovascular magnetic resonance (CMR) short-axis cine segmentation. However,\ninconsistent reports of the benefits of SSP for segmentation have made it\ndifficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP\nmethods for CMR cine segmentation.\n To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were\nused for unlabeled pretraining with four SSP methods; SimCLR, positional\ncontrastive learning, DINO, and masked image modeling (MIM). Subsets of varying\nnumbers of subjects were used for supervised fine-tuning of 2D models for each\nSSP method, as well as to train a 2D baseline model from scratch. The\nfine-tuned models were compared to the baseline using the 3D Dice similarity\ncoefficient (DSC) in a test dataset of 140 subjects.\n The SSP methods showed no performance gains with the largest supervised\nfine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects\n(231 2D slices) are available for supervised training, SSP using MIM (DSC =\n0.86) improves over training from scratch (DSC = 0.82).\n This study found that SSP is valuable for CMR cine segmentation when labeled\ntraining data is scarce, but does not aid state-of-the-art deep learning\nmethods when ample labeled data is available. Moreover, the choice of SSP\nmethod is important. The code is publicly available at:\nhttps://github.com/q-cardIA/ssp-cmr-cine-segmentation\n","authors":["Rob A. J. de Mooij","Josien P. W. Pluim","Cian M. Scannell"],"pdf_url":"https://arxiv.org/pdf/2409.18100v1.pdf","comment":"Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at\n MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.04406v2","updated":"2024-09-26T17:38:26Z","published":"2024-09-06T16:56:06Z","title":"Quantum Kernel Methods under Scrutiny: A Benchmarking Study","summary":" Since the entry of kernel theory in the field of quantum machine learning,\nquantum kernel methods (QKMs) have gained increasing attention with regard to\nboth probing promising applications and delivering intriguing research\ninsights. Two common approaches for computing the underlying Gram matrix have\nemerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs).\nBenchmarking these methods is crucial to gain robust insights and to understand\ntheir practical utility. In this work, we present a comprehensive large-scale\nstudy examining QKMs based on FQKs and PQKs across a manifold of design\nchoices. Our investigation encompasses both classification and regression tasks\nfor five dataset families and 64 datasets, systematically comparing the use of\nFQKs and PQKs quantum support vector machines and kernel ridge regression. This\nresulted in over 20,000 models that were trained and optimized using a\nstate-of-the-art hyperparameter search to ensure robust and comprehensive\ninsights. We delve into the importance of hyperparameters on model performance\nscores and support our findings through rigorous correlation analyses. In this,\nwe also closely inspect two data encoding strategies. Moreover, we provide an\nin-depth analysis addressing the design freedom of PQKs and explore the\nunderlying principles responsible for learning. Our goal is not to identify the\nbest-performing model for a specific task but to uncover the mechanisms that\nlead to effective QKMs and reveal universal patterns.\n","authors":["Jan Schnabel","Marco Roth"],"pdf_url":"https://arxiv.org/pdf/2409.04406v2.pdf","comment":"18 pages main text including 12 figures and 1 table, appendix 14\n pages with 19 figures and 1 table; restructure result section and prune\n appendix"},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18061v1","updated":"2024-09-26T17:01:41Z","published":"2024-09-26T17:01:41Z","title":"Optimal Protocols for Continual Learning via Statistical Physics and\n Control Theory","summary":" Artificial neural networks often struggle with catastrophic forgetting when\nlearning multiple tasks sequentially, as training on new tasks degrades the\nperformance on previously learned ones. Recent theoretical work has addressed\nthis issue by analysing learning curves in synthetic frameworks under\npredefined training protocols. However, these protocols relied on heuristics\nand lacked a solid theoretical foundation assessing their optimality. In this\npaper, we fill this gap combining exact equations for training dynamics,\nderived using statistical physics techniques, with optimal control methods. We\napply this approach to teacher-student models for continual learning and\nmulti-task problems, obtaining a theory for task-selection protocols maximising\nperformance while minimising forgetting. Our theoretical analysis offers\nnon-trivial yet interpretable strategies for mitigating catastrophic\nforgetting, shedding light on how optimal learning protocols can modulate\nestablished effects, such as the influence of task similarity on forgetting.\nFinally, we validate our theoretical findings on real-world data.\n","authors":["Francesco Mori","Stefano Sarao Mannelli","Francesca Mignacco"],"pdf_url":"https://arxiv.org/pdf/2409.18061v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.18051v1","updated":"2024-09-26T16:55:31Z","published":"2024-09-26T16:55:31Z","title":"Inverse Reinforcement Learning with Multiple Planning Horizons","summary":" In this work, we study an inverse reinforcement learning (IRL) problem where\nthe experts are planning under a shared reward function but with different,\nunknown planning horizons. Without the knowledge of discount factors, the\nreward function has a larger feasible solution set, which makes it harder for\nexisting IRL approaches to identify a reward function. To overcome this\nchallenge, we develop algorithms that can learn a global multi-agent reward\nfunction with agent-specific discount factors that reconstruct the expert\npolicies. We characterize the feasible solution space of the reward function\nand discount factors for both algorithms and demonstrate the generalizability\nof the learned reward function across multiple domains.\n","authors":["Jiayu Yao","Weiwei Pan","Finale Doshi-Velez","Barbara E Engelhardt"],"pdf_url":"https://arxiv.org/pdf/2409.18051v1.pdf","comment":"Accepted at RLC 2024"},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2408.11974v2","updated":"2024-09-26T16:48:34Z","published":"2024-08-21T20:14:54Z","title":"Two-Timescale Gradient Descent Ascent Algorithms for Nonconvex Minimax\n Optimization","summary":" We provide a unified analysis of two-timescale gradient descent ascent\n(TTGDA) for solving structured nonconvex minimax optimization problems in the\nform of $\\min_\\textbf{x} \\max_{\\textbf{y} \\in Y} f(\\textbf{x}, \\textbf{y})$,\nwhere the objective function $f(\\textbf{x}, \\textbf{y})$ is nonconvex in\n$\\textbf{x}$ and concave in $\\textbf{y}$, and the constraint set $Y \\subseteq\n\\mathbb{R}^n$ is convex and bounded. In the convex-concave setting, the\nsingle-timescale gradient descent ascent (GDA) algorithm is widely used in\napplications and has been shown to have strong convergence guarantees. In more\ngeneral settings, however, it can fail to converge. Our contribution is to\ndesign TTGDA algorithms that are effective beyond the convex-concave setting,\nefficiently finding a stationary point of the function $\\Phi(\\cdot) :=\n\\max_{\\textbf{y} \\in Y} f(\\cdot, \\textbf{y})$. We also establish theoretical\nbounds on the complexity of solving both smooth and nonsmooth nonconvex-concave\nminimax optimization problems. To the best of our knowledge, this is the first\nsystematic analysis of TTGDA for nonconvex minimax optimization, shedding light\non its superior performance in training generative adversarial networks (GANs)\nand in other real-world application problems.\n","authors":["Tianyi Lin","Chi Jin","Michael. I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2408.11974v2.pdf","comment":"A preliminary version [arXiv:1906.00331] of this paper, with a subset\n of the results that are presented here, was presented at ICML 2020; 44 Pages,\n 10 Figures"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.16626v2","updated":"2024-09-26T16:41:27Z","published":"2024-09-25T05:11:58Z","title":"Ascend HiFloat8 Format for Deep Learning","summary":" This preliminary white paper proposes a novel 8-bit floating-point data\nformat HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered\nprecision. For normal value encoding, it provides 7 exponent values with 3-bit\nmantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with\n1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7\nextra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades).\nMeanwhile, HiF8 encodes all the special values except that positive zero and\nnegative zero are represented by only one bit-pattern. Thanks to the better\nbalance between precision and dynamic range, HiF8 can be simultaneously used in\nboth forward and backward passes of AI training. In this paper, we will\ndescribe the definition and rounding methods of HiF8, as well as the tentative\ntraining and inference solutions. To demonstrate the efficacy of HiF8, massive\nsimulation results on various neural networks, including traditional neural\nnetworks and large language models (LLMs), will also be presented.\n","authors":["Yuanyong Luo","Zhongxing Zhang","Richard Wu","Hu Liu","Ying Jin","Kai Zheng","Minmin Wang","Zhanying He","Guipeng Hu","Luyao Chen","Tianchi Hu","Junsong Wang","Minqi Chen","Mikhaylov Dmitry","Korviakov Vladimir","Bobrin Maxim","Yuhao Hu","Guanfu Chen","Zeyi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.16626v2.pdf","comment":"13 Pages, 4 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2409.18032v1","updated":"2024-09-26T16:38:48Z","published":"2024-09-26T16:38:48Z","title":"FlowBench: A Large Scale Benchmark for Flow Simulation over Complex\n Geometries","summary":" Simulating fluid flow around arbitrary shapes is key to solving various\nengineering problems. However, simulating flow physics across complex\ngeometries remains numerically challenging and computationally\nresource-intensive, particularly when using conventional PDE solvers. Machine\nlearning methods offer attractive opportunities to create fast and adaptable\nPDE solvers. However, benchmark datasets to measure the performance of such\nmethods are scarce, especially for flow physics across complex geometries. We\nintroduce FlowBench, a dataset for neural simulators with over 10K samples,\nwhich is currently larger than any publicly available flow physics dataset.\nFlowBench contains flow simulation data across complex geometries\n(\\textit{parametric vs. non-parametric}), spanning a range of flow conditions\n(\\textit{Reynolds number and Grashoff number}), capturing a diverse array of\nflow phenomena (\\textit{steady vs. transient; forced vs. free convection}), and\nfor both 2D and 3D. FlowBench contains over 10K data samples, with each sample\nthe outcome of a fully resolved, direct numerical simulation using a\nwell-validated simulator framework designed for modeling transport phenomena in\ncomplex geometries. For each sample, we include velocity, pressure, and\ntemperature field data at 3 different resolutions and several summary\nstatistics features of engineering relevance (such as coefficients of lift and\ndrag, and Nusselt numbers). %Additionally, we include masks and signed distance\nfields for each shape. We envision that FlowBench will enable evaluating the\ninterplay between complex geometry, coupled flow phenomena, and data\nsufficiency on the performance of current, and future, neural PDE solvers. We\nenumerate several evaluation metrics to help rank order the performance of\nneural PDE solvers. We benchmark the performance of several baseline methods\nincluding FNO, CNO, WNO, and DeepONet.\n","authors":["Ronak Tali","Ali Rabeh","Cheng-Hau Yang","Mehdi Shadkhah","Samundra Karki","Abhisek Upadhyaya","Suriya Dhakshinamoorthy","Marjan Saadati","Soumik Sarkar","Adarsh Krishnamurthy","Chinmay Hegde","Aditya Balu","Baskar Ganapathysubramanian"],"pdf_url":"https://arxiv.org/pdf/2409.18032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15126v5","updated":"2024-09-26T16:38:32Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) is crucial in various fields such as materials\nscience, chemistry, and pharmacology to name a few. Conventional MD software\nstruggles with the balance between time cost and prediction accuracy, which\nrestricts its wider application. Recently, data-driven approaches based on deep\ngenerative models have been devised for time-coarsened dynamics, which aim at\nlearning dynamics of diverse molecular systems over a long timestep, enjoying\nboth universality and efficiency. Nevertheless, most current methods are\ndesigned solely to learn from the data distribution regardless of the\nunderlying Boltzmann distribution, and the physics priors such as energies and\nforces are constantly overlooked. In this work, we propose a conditional\ngenerative model called Force-guided Bridge Matching (FBM), which learns\nfull-atom time-coarsened dynamics and targets the Boltzmann-constrained\ndistribution. With the guidance of our delicately-designed intermediate force\nfield, FBM leverages favourable physics priors into the generation process,\ngiving rise to enhanced simulations. Experiments on two datasets consisting of\npeptides verify our superiority in terms of comprehensive metrics and\ndemonstrate transferability to unseen systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18013v1","updated":"2024-09-26T16:22:08Z","published":"2024-09-26T16:22:08Z","title":"Spatiotemporal Learning on Cell-embedded Graphs","summary":" Data-driven simulation of physical systems has recently kindled significant\nattention, where many neural models have been developed. In particular,\nmesh-based graph neural networks (GNNs) have demonstrated significant potential\nin predicting spatiotemporal dynamics across arbitrary geometric domains.\nHowever, the existing node-edge message passing mechanism in GNNs limits the\nmodel's representation learning ability. In this paper, we proposed a\ncell-embedded GNN model (aka CeGNN) to learn spatiotemporal dynamics with\nlifted performance. Specifically, we introduce a learnable cell attribution to\nthe node-edge message passing process, which better captures the spatial\ndependency of regional features. Such a strategy essentially upgrades the local\naggregation scheme from the first order (e.g., from edge to node) to a higher\norder (e.g., from volume to edge and then to node), which takes advantage of\nvolumetric information in message passing. Meanwhile, a novel feature-enhanced\nblock is designed to further improve the performance of CeGNN and relieve the\nover-smoothness problem, via treating the latent features as basis functions.\nThe extensive experiments on various PDE systems and one real-world dataset\ndemonstrate that CeGNN achieves superior performance compared with other\nbaseline models, particularly reducing the prediction error with up to 1 orders\nof magnitude on several PDE systems.\n","authors":["Yuan Mi","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2409.18013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18000v1","updated":"2024-09-26T16:09:19Z","published":"2024-09-26T16:09:19Z","title":"Safe Time-Varying Optimization based on Gaussian Processes with\n Spatio-Temporal Kernel","summary":" Ensuring safety is a key aspect in sequential decision making problems, such\nas robotics or process control. The complexity of the underlying systems often\nmakes finding the optimal decision challenging, especially when the\nsafety-critical system is time-varying. Overcoming the problem of optimizing an\nunknown time-varying reward subject to unknown time-varying safety constraints,\nwe propose TVSafeOpt, a new algorithm built on Bayesian optimization with a\nspatio-temporal kernel. The algorithm is capable of safely tracking a\ntime-varying safe region without the need for explicit change detection.\nOptimality guarantees are also provided for the algorithm when the optimization\nproblem becomes stationary. We show that TVSafeOpt compares favorably against\nSafeOpt on synthetic data, both regarding safety and optimality. Evaluation on\na realistic case study with gas compressors confirms that TVSafeOpt ensures\nsafety when solving time-varying optimization problems with unknown reward and\nsafety functions.\n","authors":["Jialin Li","Marta Zagorowska","Giulia De Pasquale","Alisa Rupenyan","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2409.18000v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17996v1","updated":"2024-09-26T16:07:24Z","published":"2024-09-26T16:07:24Z","title":"PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless\n Imaging","summary":" Lensless cameras offer significant advantages in size, weight, and cost\ncompared to traditional lens-based systems. Without a focusing lens, lensless\ncameras rely on computational algorithms to recover the scenes from multiplexed\nmeasurements. However, current algorithms struggle with inaccurate forward\nimaging models and insufficient priors to reconstruct high-quality images. To\novercome these limitations, we introduce a novel two-stage approach for\nconsistent and photorealistic lensless image reconstruction. The first stage of\nour approach ensures data consistency by focusing on accurately reconstructing\nthe low-frequency content with a spatially varying deconvolution method that\nadjusts to changes in the Point Spread Function (PSF) across the camera's field\nof view. The second stage enhances photorealism by incorporating a generative\nprior from pre-trained diffusion models. By conditioning on the low-frequency\ncontent retrieved in the first stage, the diffusion model effectively\nreconstructs the high-frequency details that are typically lost in the lensless\nimaging process, while also maintaining image fidelity. Our method achieves a\nsuperior balance between data fidelity and visual quality compared to existing\nmethods, as demonstrated with two popular lensless systems, PhlatCam and\nDiffuserCam. Project website: https://phocolens.github.io/.\n","authors":["Xin Cai","Zhiyuan You","Hailong Zhang","Wentao Liu","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.17996v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17995v1","updated":"2024-09-26T16:07:20Z","published":"2024-09-26T16:07:20Z","title":"Joint Localization and Planning using Diffusion","summary":" Diffusion models have been successfully applied to robotics problems such as\nmanipulation and vehicle path planning. In this work, we explore their\napplication to end-to-end navigation -- including both perception and planning\n-- by considering the problem of jointly performing global localization and\npath planning in known but arbitrary 2D environments. In particular, we\nintroduce a diffusion model which produces collision-free paths in a global\nreference frame given an egocentric LIDAR scan, an arbitrary map, and a desired\ngoal position. To this end, we implement diffusion in the space of paths in\nSE(2), and describe how to condition the denoising process on both obstacles\nand sensor observations. In our evaluation, we show that the proposed\nconditioning techniques enable generalization to realistic maps of considerably\ndifferent appearance than the training environment, demonstrate our model's\nability to accurately describe ambiguous solutions, and run extensive\nsimulation experiments showcasing our model's use as a real-time, end-to-end\nlocalization and planning stack.\n","authors":["L. Lao Beyer","S. Karaman"],"pdf_url":"https://arxiv.org/pdf/2409.17995v1.pdf","comment":"7 pages, 9 figures. Submitted to ICRA 2025, under review"},{"id":"http://arxiv.org/abs/2405.15618v2","updated":"2024-09-26T16:05:30Z","published":"2024-05-24T15:04:36Z","title":"MLPs Learn In-Context on Regression and Classification Tasks","summary":" In-context learning (ICL), the remarkable ability to solve a task from only\ninput exemplars, is often assumed to be a unique hallmark of Transformer\nmodels. By examining commonly employed synthetic ICL tasks, we demonstrate that\nmulti-layer perceptrons (MLPs) can also learn in-context. Moreover, MLPs, and\nthe closely related MLP-Mixer models, learn in-context competitively with\nTransformers given the same compute budget in this setting. We further show\nthat MLPs outperform Transformers on a series of classical tasks from\npsychology designed to test relational reasoning, which are closely related to\nin-context classification. These results underscore a need for studying\nin-context learning beyond attention-based architectures, while also\nchallenging strong prior arguments about MLPs' limited ability to solve\nrelational tasks. Altogether, our results highlight the unexpected competence\nof MLPs, and support the growing interest in all-MLP alternatives to\ntask-specific architectures.\n","authors":["William L. Tong","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2405.15618v2.pdf","comment":"30 pages, 10 figures, code available at\n https://github.com/wtong98/mlp-icl"},{"id":"http://arxiv.org/abs/2409.17992v1","updated":"2024-09-26T16:02:25Z","published":"2024-09-26T16:02:25Z","title":"LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged\n Robots","summary":" Reinforcement Learning (RL) has shown its remarkable and generalizable\ncapability in legged locomotion through sim-to-real transfer. However, while\nadaptive methods like domain randomization are expected to make policy more\nrobust to diverse environments, such comprehensiveness potentially detracts\nfrom the policy's performance in any specific environment according to the No\nFree Lunch theorem, leading to a suboptimal solution once deployed in the real\nworld. To address this issue, we propose a lifelong policy adaptation framework\nnamed LoopSR, which utilizes a transformer-based encoder to project real-world\ntrajectories into a latent space, and accordingly reconstruct the real-world\nenvironments back in simulation for further improvement. Autoencoder\narchitecture and contrastive learning methods are adopted to better extract the\ncharacteristics of real-world dynamics. The simulation parameters for continual\ntraining are derived by combining predicted parameters from the decoder with\nretrieved parameters from the simulation trajectory dataset. By leveraging the\ncontinual training, LoopSR achieves superior data efficiency compared with\nstrong baselines, with only a limited amount of data to yield eminent\nperformance in both sim-to-sim and sim-to-real experiments.\n","authors":["Peilin Wu","Weiji Xie","Jiahang Cao","Hang Lai","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17992v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.17991v1","updated":"2024-09-26T16:02:13Z","published":"2024-09-26T16:02:13Z","title":"Dimension-independent learning rates for high-dimensional classification\n problems","summary":" We study the problem of approximating and estimating classification functions\nthat have their decision boundary in the $RBV^2$ space. Functions of $RBV^2$\ntype arise naturally as solutions of regularized neural network learning\nproblems and neural networks can approximate these functions without the curse\nof dimensionality. We modify existing results to show that every $RBV^2$\nfunction can be approximated by a neural network with bounded weights.\nThereafter, we prove the existence of a neural network with bounded weights\napproximating a classification function. And we leverage these bounds to\nquantify the estimation rates. Finally, we present a numerical study that\nanalyzes the effect of different regularity conditions on the decision\nboundaries.\n","authors":["Andres Felipe Lerma-Pineda","Philipp Petersen","Simon Frieder","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2409.17991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17986v1","updated":"2024-09-26T15:56:40Z","published":"2024-09-26T15:56:40Z","title":"Supra-Laplacian Encoding for Transformer on Dynamic Graphs","summary":" Fully connected Graph Transformers (GT) have rapidly become prominent in the\nstatic graph community as an alternative to Message-Passing models, which\nsuffer from a lack of expressivity, oversquashing, and under-reaching. However,\nin a dynamic context, by interconnecting all nodes at multiple snapshots with\nself-attention, GT loose both structural and temporal information. In this\nwork, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs\n(SLATE), a new spatio-temporal encoding to leverage the GT architecture while\nkeeping spatio-temporal information. Specifically, we transform Discrete Time\nDynamic Graphs into multi-layer graphs and take advantage of the spectral\nproperties of their associated supra-Laplacian matrix. Our second contribution\nexplicitly model nodes' pairwise relationships with a cross-attention\nmechanism, providing an accurate edge representation for dynamic link\nprediction. SLATE outperforms numerous state-of-the-art methods based on\nMessage-Passing Graph Neural Networks combined with recurrent models (e.g\nLSTM), and Dynamic Graph Transformers, on 9 datasets. Code and instructions to\nreproduce our results will be open-sourced.\n","authors":["Yannis Karmim","Marc Lafon","Raphaël Fournier S'niehotta","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2409.17986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15244v2","updated":"2024-09-26T15:56:30Z","published":"2024-03-22T14:40:29Z","title":"A Stochastic Quasi-Newton Method for Non-convex Optimization with\n Non-uniform Smoothness","summary":" Classical convergence analyses for optimization algorithms rely on the\nwidely-adopted uniform smoothness assumption. However, recent experimental\nstudies have demonstrated that many machine learning problems exhibit\nnon-uniform smoothness, meaning the smoothness factor is a function of the\nmodel parameter instead of a universal constant. In particular, it has been\nobserved that the smoothness grows with respect to the gradient norm along the\ntraining trajectory. Motivated by this phenomenon, the recently introduced\n$(L_0, L_1)$-smoothness is a more general notion, compared to traditional\n$L$-smoothness, that captures such positive relationship between smoothness and\ngradient norm. Under this type of non-uniform smoothness, existing literature\nhas designed stochastic first-order algorithms by utilizing gradient clipping\ntechniques to obtain the optimal $\\mathcal{O}(\\epsilon^{-3})$ sample complexity\nfor finding an $\\epsilon$-approximate first-order stationary solution.\nNevertheless, the studies of quasi-Newton methods are still lacking.\nConsidering higher accuracy and more robustness for quasi-Newton methods, in\nthis paper we propose a fast stochastic quasi-Newton method when there exists\nnon-uniformity in smoothness. Leveraging gradient clipping and variance\nreduction, our algorithm can achieve the best-known\n$\\mathcal{O}(\\epsilon^{-3})$ sample complexity and enjoys convergence speedup\nwith simple hyperparameter tuning. Our numerical experiments show that our\nproposed algorithm outperforms the state-of-the-art approaches.\n","authors":["Zhenyu Sun","Ermin Wei"],"pdf_url":"https://arxiv.org/pdf/2403.15244v2.pdf","comment":"Paper accepted by CDC 2024"},{"id":"http://arxiv.org/abs/2409.17985v1","updated":"2024-09-26T15:55:59Z","published":"2024-09-26T15:55:59Z","title":"Hypergame Theory for Decentralized Resource Allocation in Multi-user\n Semantic Communications","summary":" Semantic communications (SC) is an emerging communication paradigm in which\nwireless devices can send only relevant information from a source of data while\nrelying on computing resources to regenerate missing data points. However, the\ndesign of a multi-user SC system becomes more challenging because of the\ncomputing and communication overhead required for coordination. Existing\nsolutions for learning the semantic language and performing resource allocation\noften fail to capture the computing and communication tradeoffs involved in\nmultiuser SC. To address this gap, a novel framework for decentralized\ncomputing and communication resource allocation in multiuser SC systems is\nproposed. The challenge of efficiently allocating communication and computing\nresources (for reasoning) in a decentralized manner to maximize the quality of\ntask experience for the end users is addressed through the application of\nStackelberg hyper game theory. Leveraging the concept of second-level hyper\ngames, novel analytical formulations are developed to model misperceptions of\nthe users about each other's communication and control strategies. Further,\nequilibrium analysis of the learned resource allocation protocols examines the\nconvergence of the computing and communication strategies to a local\nStackelberg equilibria, considering misperceptions. Simulation results show\nthat the proposed Stackelberg hyper game results in efficient usage of\ncommunication and computing resources while maintaining a high quality of\nexperience for the users compared to state-of-the-art that does not account for\nthe misperceptions.\n","authors":["Christo Kurisummoottil Thomas","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2409.17985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15059v2","updated":"2024-09-26T15:53:10Z","published":"2024-05-23T21:17:20Z","title":"Message-Passing Monte Carlo: Generating low-discrepancy point sets via\n Graph Neural Networks","summary":" Discrepancy is a well-known measure for the irregularity of the distribution\nof a point set. Point sets with small discrepancy are called low-discrepancy\nand are known to efficiently fill the space in a uniform manner.\nLow-discrepancy points play a central role in many problems in science and\nengineering, including numerical integration, computer vision, machine\nperception, computer graphics, machine learning, and simulation. In this work,\nwe present the first machine learning approach to generate a new class of\nlow-discrepancy point sets named Message-Passing Monte Carlo (MPMC) points.\nMotivated by the geometric nature of generating low-discrepancy point sets, we\nleverage tools from Geometric Deep Learning and base our model on Graph Neural\nNetworks. We further provide an extension of our framework to higher\ndimensions, which flexibly allows the generation of custom-made points that\nemphasize the uniformity in specific dimensions that are primarily important\nfor the particular problem at hand. Finally, we demonstrate that our proposed\nmodel achieves state-of-the-art performance superior to previous methods by a\nsignificant margin. In fact, MPMC points are empirically shown to be either\noptimal or near-optimal with respect to the discrepancy for low dimension and\nsmall number of points, i.e., for which the optimal discrepancy can be\ndetermined. Code for generating MPMC points can be found at\nhttps://github.com/tk-rusch/MPMC.\n","authors":["T. Konstantin Rusch","Nathan Kirk","Michael M. Bronstein","Christiane Lemieux","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2405.15059v2.pdf","comment":"Published in Proceedings of the National Academy of Sciences (PNAS):\n https://www.pnas.org/doi/10.1073/pnas.2409913121"},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17972v1","updated":"2024-09-26T15:47:42Z","published":"2024-09-26T15:47:42Z","title":"BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and\n Adaptive Disambiguate based Efficient Tree Search","summary":" Large Language Models (LLMs) have exhibited exceptional performance across a\nbroad range of tasks and domains. However, they still encounter difficulties in\nsolving mathematical problems due to the rigorous and logical nature of\nmathematics. Previous studies have employed techniques such as supervised\nfine-tuning (SFT), prompt engineering, and search-based methods to improve the\nmathematical problem-solving abilities of LLMs. Despite these efforts, their\nperformance remains suboptimal and demands substantial computational resources.\nTo address this issue, we propose a novel approach, BEATS, to enhance\nmathematical problem-solving abilities. Our method leverages newly designed\nprompts that guide the model to iteratively rewrite, advance by one step, and\ngenerate answers based on previous steps. Additionally, we introduce a new\nback-verification technique that uses LLMs to validate the correctness of the\ngenerated answers. Furthermore, we employ a pruning tree search to optimize\nsearch time while achieving strong performance. Notably, our method improves\nQwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the\nMATH benchmark.\n","authors":["Linzhuang Sun","Hao Liang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14500v2","updated":"2024-09-26T15:26:43Z","published":"2024-09-22T15:53:19Z","title":"TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with\n Tabular Node Features","summary":" Tabular machine learning is an important field for industry and science. In\nthis field, table rows are usually treated as independent data samples, but\nadditional information about relations between them is sometimes available and\ncan be used to improve predictive performance. Such information can be\nnaturally modeled with a graph, thus tabular machine learning may benefit from\ngraph machine learning methods. However, graph machine learning models are\ntypically evaluated on datasets with homogeneous node features, which have\nlittle in common with heterogeneous mixtures of numerical and categorical\nfeatures present in tabular datasets. Thus, there is a critical difference\nbetween the data used in tabular and graph machine learning studies, which does\nnot allow one to understand how successfully graph models can be transferred to\ntabular data. To bridge this gap, we propose a new benchmark of diverse graphs\nwith heterogeneous tabular node features and realistic prediction tasks. We use\nthis benchmark to evaluate a vast set of models, including simple methods\npreviously overlooked in the literature. Our experiments show that graph neural\nnetworks (GNNs) can indeed often bring gains in predictive performance for\ntabular data, but standard tabular models also can be adapted to work with\ngraph data by using simple feature preprocessing, which sometimes enables them\nto compete with and even outperform GNNs. Based on our empirical study, we\nprovide insights for researchers and practitioners in both tabular and graph\nmachine learning fields.\n","authors":["Gleb Bazhenov","Oleg Platonov","Liudmila Prokhorenkova"],"pdf_url":"https://arxiv.org/pdf/2409.14500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17937v1","updated":"2024-09-26T15:12:41Z","published":"2024-09-26T15:12:41Z","title":"Adaptive Stream Processing on Edge Devices through Active Inference","summary":" The current scenario of IoT is witnessing a constant increase on the volume\nof data, which is generated in constant stream, calling for novel architectural\nand logical solutions for processing it. Moving the data handling towards the\nedge of the computing spectrum guarantees better distribution of load and, in\nprinciple, lower latency and better privacy. However, managing such a structure\nis complex, especially when requirements, also referred to Service Level\nObjectives (SLOs), specified by applications' owners and infrastructure\nmanagers need to be ensured. Despite the rich number of proposals of Machine\nLearning (ML) based management solutions, researchers and practitioners yet\nstruggle to guarantee long-term prediction and control, and accurate\ntroubleshooting. Therefore, we present a novel ML paradigm based on Active\nInference (AIF) -- a concept from neuroscience that describes how the brain\nconstantly predicts and evaluates sensory information to decrease long-term\nsurprise. We implement it and evaluate it in a heterogeneous real stream\nprocessing use case, where an AIF-based agent continuously optimizes the\nfulfillment of three SLOs for three autonomous driving services running on\nmultiple devices. The agent used causal knowledge to gradually develop an\nunderstanding of how its actions are related to requirements fulfillment, and\nwhich configurations to favor. Through this approach, our agent requires up to\nthirty iterations to converge to the optimal solution, showing the capability\nof offering accurate results in a short amount of time. Furthermore, thanks to\nAIF and its causal structures, our method guarantees full transparency on the\ndecision making, making the interpretation of the results and the\ntroubleshooting effortless.\n","authors":["Boris Sedlak","Victor Casamayor Pujol","Andrea Morichetta","Praveen Kumar Donta","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2409.17937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17932v1","updated":"2024-09-26T15:08:52Z","published":"2024-09-26T15:08:52Z","title":"Sample compression unleashed : New generalization bounds for real valued\n losses","summary":" The sample compression theory provides generalization guarantees for\npredictors that can be fully defined using a subset of the training dataset and\na (short) message string, generally defined as a binary sequence. Previous\nworks provided generalization bounds for the zero-one loss, which is\nrestrictive, notably when applied to deep learning approaches. In this paper,\nwe present a general framework for deriving new sample compression bounds that\nhold for real-valued losses. We empirically demonstrate the tightness of the\nbounds and their versatility by evaluating them on different types of models,\ne.g., neural networks and decision forests, trained with the Pick-To-Learn\n(P2L) meta-algorithm, which transforms the training method of any\nmachine-learning predictor to yield sample-compressed predictors. In contrast\nto existing P2L bounds, ours are valid in the non-consistent case.\n","authors":["Mathieu Bazinet","Valentina Zantedeschi","Pascal Germain"],"pdf_url":"https://arxiv.org/pdf/2409.17932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17931v1","updated":"2024-09-26T15:08:38Z","published":"2024-09-26T15:08:38Z","title":"Intelligent Energy Management: Remaining Useful Life Prediction and\n Charging Automation System Comprised of Deep Learning and the Internet of\n Things","summary":" Remaining Useful Life (RUL) of battery is an important parameter to know the\nbattery's remaining life and need for recharge. The goal of this research\nproject is to develop machine learning-based models for the battery RUL\ndataset. Different ML models are developed to classify the RUL of the vehicle,\nand the IoT (Internet of Things) concept is simulated for automating the\ncharging system and managing any faults aligning. The graphs plotted depict the\nrelationship between various vehicle parameters using the Blynk IoT platform.\nResults show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent\nUnit (GRU), and hybrid model developed could classify RUL into three classes\nwith 99% more accuracy. The data is fed using the tkinter GUI for simulating\nartificial intelligence (AI)-based charging, and with a pyserial backend, data\ncan be entered into the Esp-32 microcontroller for making charge discharge\npossible with the model's predictions. Also, with an IoT system, the charging\ncan be disconnected, monitored, and analyzed for automation. The results show\nthat an accuracy of 99% can be obtained on models MLP, catboost model and\nsimilar accuracy on GRU model can be obtained, and finally relay-based\ntriggering can be made by prediction through the model used for automating the\ncharging and energy-saving mechanism. By showcasing an exemplary Blynk\nplatform-based monitoring and automation phenomenon, we further present\ninnovative ways of monitoring parameters and automating the system.\n","authors":["Biplov Paneru","Bishwash Paneru","DP Sharma Mainali"],"pdf_url":"https://arxiv.org/pdf/2409.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15228v3","updated":"2024-09-26T14:57:52Z","published":"2024-09-23T17:22:09Z","title":"A Comprehensive Framework for Evaluating API-oriented Code Generation in\n Large Language Models","summary":" Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as\npowerful tools for code generation, significantly enhancing productivity and\naccelerating software development. However, existing benchmarks primarily focus\non general code generation without considering API-oriented code generation,\ni.e., generating code that invokes APIs from specific libraries. Given the\ngrowing demand for API-oriented code generation, there is a pressing need for a\nsystematic and automated approach to evaluate LLM on API-oriented code\ngeneration. To address this gap, we propose AutoAPIEval, a lightweight and\nautomated framework designed to evaluate the capabilities of LLMs in\nAPI-oriented code generation. Our framework works with any library that\nprovides API documentation and focuses on two unit tasks: API recommendation\nand code example generation, along with four metrics to evaluate the generated\nAPIs and code examples, such as the proportion of incorrect API recommendations\nfor Task 1, and the proportion of code examples where no specific API is\ninvoked and uncompilable/unexecutable code examples for Task 2. In addition, we\nconducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder)\nand Java Runtime Environment 8 to demonstrate the framework's effectiveness.\nOur findings reveal substantial variability in LLM performance across tasks,\nwith ChatGPT adhering better to instructions, while sharing similar\neffectiveness in code example generation with its counterparts (i.e., MagiCoder\nand DeekSeek Coder). We also identify key factors associated with code quality,\nsuch as API popularity and model confidence, and build classifiers that achieve\nhigh accuracy in detecting incorrect API recommendations and erroneous code\nexamples. Retrieval-augmented generation enhances the quality of code generated\nby LLMs, though its effectiveness varies across different LLMs.\n","authors":["Yixi Wu","Pengfei He","Zehao Wang","Shaowei Wang","Yuan Tian","Tse-Hsun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.15228v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08201v2","updated":"2024-09-26T14:56:57Z","published":"2024-09-12T16:38:20Z","title":"Machine Learning for Two-Sample Testing under Right-Censored Data: A\n Simulation Study","summary":" The focus of this study is to evaluate the effectiveness of Machine Learning\n(ML) methods for two-sample testing with right-censored observations. To\nachieve this, we develop several ML-based methods with varying architectures\nand implement them as two-sample tests. Each method is an ensemble (stacking)\nthat combines predictions from classical two-sample tests. This paper presents\nthe results of training the proposed ML methods, examines their statistical\npower compared to classical two-sample tests, analyzes the null distribution of\nthe proposed methods when the null hypothesis is true, and evaluates the\nsignificance of the features incorporated into the proposed methods. In total,\nthis work covers 18 methods for two-sample testing under right-censored\nobservations, including the proposed methods and classical well-studied\ntwo-sample tests. All results from numerical experiments were obtained from a\nsynthetic dataset generated using the inverse transform sampling method and\nreplicated multiple times through Monte Carlo simulation. To test the\ntwo-sample problem with right-censored observations, one can use the proposed\ntwo-sample methods (scripts, dataset, and models are available on GitHub and\nHugging Face).\n","authors":["Petr Philonenko","Sergey Postovalov"],"pdf_url":"https://arxiv.org/pdf/2409.08201v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17906v1","updated":"2024-09-26T14:52:40Z","published":"2024-09-26T14:52:40Z","title":"Graph Reasoning with Large Language Models via Pseudo-code Prompting","summary":" Large language models (LLMs) have recently achieved remarkable success in\nvarious reasoning tasks in the field of natural language processing. This\nsuccess of LLMs has also motivated their use in graph-related tasks. Among\nothers, recent work has explored whether LLMs can solve graph problems such as\ncounting the number of connected components of a graph or computing the\nshortest path distance between two nodes. Although LLMs possess preliminary\ngraph reasoning abilities, they might still struggle to solve some seemingly\nsimple problems. In this paper, we investigate whether prompting via\npseudo-code instructions can improve the performance of LLMs in solving graph\nproblems. Our experiments demonstrate that using pseudo-code instructions\ngenerally improves the performance of all considered LLMs. The graphs,\npseudo-code prompts, and evaluation code are publicly available.\n","authors":["Konstantinos Skianis","Giannis Nikolentzos","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2409.17906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17902v1","updated":"2024-09-26T14:50:20Z","published":"2024-09-26T14:50:20Z","title":"Designing Short-Stage CDC-XPUFs: Balancing Reliability, Cost, and\n Security in IoT Devices","summary":" The rapid expansion of Internet of Things (IoT) devices demands robust and\nresource-efficient security solutions. Physically Unclonable Functions (PUFs),\nwhich generate unique cryptographic keys from inherent hardware variations,\noffer a promising approach. However, traditional PUFs like Arbiter PUFs (APUFs)\nand XOR Arbiter PUFs (XOR-PUFs) are susceptible to machine learning (ML) and\nreliability-based attacks. In this study, we investigate\nComponent-Differentially Challenged XOR-PUFs (CDC-XPUFs), a less explored\nvariant, to address these vulnerabilities. We propose an optimized CDC-XPUF\ndesign that incorporates a pre-selection strategy to enhance reliability and\nintroduces a novel lightweight architecture to reduce hardware overhead.\nRigorous testing demonstrates that our design significantly lowers resource\nconsumption, maintains strong resistance to ML attacks, and improves\nreliability, effectively mitigating reliability-based attacks. These results\nhighlight the potential of CDC-XPUFs as a secure and efficient candidate for\nwidespread deployment in resource-constrained IoT systems.\n","authors":["Gaoxiang Li","Yu Zhuang"],"pdf_url":"https://arxiv.org/pdf/2409.17902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17896v1","updated":"2024-09-26T14:47:14Z","published":"2024-09-26T14:47:14Z","title":"Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV\n Attitude Control Under Varying Wind Conditions","summary":" This paper evaluates and compares the performance of model-free and\nmodel-based reinforcement learning for the attitude control of fixed-wing\nunmanned aerial vehicles using PID as a reference point. The comparison focuses\non their ability to handle varying flight dynamics and wind disturbances in a\nsimulated environment. Our results show that the Temporal Difference Model\nPredictive Control agent outperforms both the PID controller and other\nmodel-free reinforcement learning methods in terms of tracking accuracy and\nrobustness over different reference difficulties, particularly in nonlinear\nflight regimes. Furthermore, we introduce actuation fluctuation as a key metric\nto assess energy efficiency and actuator wear, and we test two different\napproaches from the literature: action variation penalty and conditioning for\naction policy smoothness. We also evaluate all control methods when subject to\nstochastic turbulence and gusts separately, so as to measure their effects on\ntracking performance, observe their limitations and outline their implications\non the Markov decision process formalism.\n","authors":["David Olivares","Pierre Fournier","Pavan Vasishta","Julien Marzat"],"pdf_url":"https://arxiv.org/pdf/2409.17896v1.pdf","comment":"Published at ICINCO 2024"},{"id":"http://arxiv.org/abs/2409.17889v1","updated":"2024-09-26T14:38:54Z","published":"2024-09-26T14:38:54Z","title":"A multi-source data power load forecasting method using attention\n mechanism-based parallel cnn-gru","summary":" Accurate power load forecasting is crucial for improving energy efficiency\nand ensuring power supply quality. Considering the power load forecasting\nproblem involves not only dynamic factors like historical load variations but\nalso static factors such as climate conditions that remain constant over\nspecific periods. From the model-agnostic perspective, this paper proposes a\nparallel structure network to extract important information from both dynamic\nand static data. Firstly, based on complexity learning theory, it is\ndemonstrated that models integrated through parallel structures exhibit\nsuperior generalization abilities compared to individual base learners.\nAdditionally, the higher the independence between base learners, the stronger\nthe generalization ability of the parallel structure model. This suggests that\nthe structure of machine learning models inherently contains significant\ninformation. Building on this theoretical foundation, a parallel convolutional\nneural network (CNN)-gate recurrent unit (GRU) attention model (PCGA) is\nemployed to address the power load forecasting issue, aiming to effectively\nintegrate the influences of dynamic and static features. The CNN module is\nresponsible for capturing spatial characteristics from static data, while the\nGRU module captures long-term dependencies in dynamic time series data. The\nattention layer is designed to focus on key information from the\nspatial-temporal features extracted by the parallel CNN-GRU. To substantiate\nthe advantages of the parallel structure model in extracting and integrating\nmulti-source information, a series of experiments are conducted.\n","authors":["Chao Min","Yijia Wang","Bo Zhang","Xin Ma","Junyi Cui"],"pdf_url":"https://arxiv.org/pdf/2409.17889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.12060v2","updated":"2024-09-26T14:32:36Z","published":"2021-06-22T21:15:00Z","title":"Faster Randomized Methods for Orthogonality Constrained Problems","summary":" Recent literature has advocated the use of randomized methods for\naccelerating the solution of various matrix problems arising throughout data\nscience and computational science. One popular strategy for leveraging\nrandomization is to use it as a way to reduce problem size. However, methods\nbased on this strategy lack sufficient accuracy for some applications.\nRandomized preconditioning is another approach for leveraging randomization,\nwhich provides higher accuracy. The main challenge in using randomized\npreconditioning is the need for an underlying iterative method, thus randomized\npreconditioning so far have been applied almost exclusively to solving\nregression problems and linear systems. In this article, we show how to expand\nthe application of randomized preconditioning to another important set of\nproblems prevalent across data science: optimization problems with\n(generalized) orthogonality constraints. We demonstrate our approach, which is\nbased on the framework of Riemannian optimization and Riemannian\npreconditioning, on the problem of computing the dominant canonical\ncorrelations and on the Fisher linear discriminant analysis problem. For both\nproblems, we evaluate the effect of preconditioning on the computational costs\nand asymptotic convergence, and demonstrate empirically the utility of our\napproach.\n","authors":["Boris Shustin","Haim Avron"],"pdf_url":"https://arxiv.org/pdf/2106.12060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01807v2","updated":"2024-09-26T14:21:10Z","published":"2023-10-03T05:40:56Z","title":"Discrete, compositional, and symbolic representations through attractor\n dynamics","summary":" Symbolic systems are powerful frameworks for modeling cognitive processes as\nthey encapsulate the rules and relationships fundamental to many aspects of\nhuman reasoning and behavior. Central to these models are systematicity,\ncompositionality, and productivity, making them invaluable in both cognitive\nscience and artificial intelligence. However, certain limitations remain. For\ninstance, the integration of structured symbolic processes and latent\nsub-symbolic processes has been implemented at the computational level through\nfiat methods such as quantization or softmax sampling, which assume, rather\nthan derive, the operations underpinning discretization and symbolicization. In\nthis work, we introduce a novel neural stochastic dynamical systems model that\nintegrates attractor dynamics with symbolic representations to model cognitive\nprocesses akin to the probabilistic language of thought (PLoT). Our model\nsegments the continuous representational space into discrete basins, with\nattractor states corresponding to symbolic sequences, that reflect the\nsemanticity and compositionality characteristic of symbolic systems through\nunsupervised learning, rather than relying on pre-defined primitives. Moreover,\nlike PLoT, our model learns to sample a diverse distribution of attractor\nstates that reflect the mutual information between the input data and the\nsymbolic encodings. This approach establishes a unified framework that\nintegrates both symbolic and sub-symbolic processing through neural dynamics, a\nneuro-plausible substrate with proven expressivity in AI, offering a more\ncomprehensive model that mirrors the complex duality of cognitive operations.\n","authors":["Andrew Nam","Eric Elmoznino","Nikolay Malkin","James McClelland","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2310.01807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17872v1","updated":"2024-09-26T14:19:07Z","published":"2024-09-26T14:19:07Z","title":"A method for identifying causality in the response of nonlinear\n dynamical systems","summary":" Predicting the response of nonlinear dynamical systems subject to random,\nbroadband excitation is important across a range of scientific disciplines,\nsuch as structural dynamics and neuroscience. Building data-driven models\nrequires experimental measurements of the system input and output, but it can\nbe difficult to determine whether inaccuracies in the model stem from modelling\nerrors or noise. This paper presents a novel method to identify the causal\ncomponent of the input-output data from measurements of a system in the\npresence of output noise, as a function of frequency, without needing a high\nfidelity model. An output prediction, calculated using an available model, is\noptimally combined with noisy measurements of the output to predict the input\nto the system. The parameters of the algorithm balance the two output signals\nand are utilised to calculate a nonlinear coherence metric as a measure of\ncausality. This method is applicable to a broad class of nonlinear dynamical\nsystems. There are currently no solutions to this problem in the absence of a\ncomplete benchmark model.\n","authors":["Joseph Massingham","Ole Nielsen","Tore Butlin"],"pdf_url":"https://arxiv.org/pdf/2409.17872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17870v1","updated":"2024-09-26T14:17:58Z","published":"2024-09-26T14:17:58Z","title":"Efficient Arbitrary Precision Acceleration for Large Language Models on\n GPU Tensor Cores","summary":" Large language models (LLMs) have been widely applied but face challenges in\nefficient inference. While quantization methods reduce computational demands,\nultra-low bit quantization with arbitrary precision is hindered by limited GPU\nTensor Core support and inefficient memory management, leading to suboptimal\nacceleration. To address these challenges, we propose a comprehensive\nacceleration scheme for arbitrary precision LLMs. At its core, we introduce a\nnovel bipolar-INT data format that facilitates parallel computing and supports\nsymmetric quantization, effectively reducing data redundancy. Building on this,\nwe implement an arbitrary precision matrix multiplication scheme that\ndecomposes and recovers matrices at the bit level, enabling flexible precision\nwhile maximizing GPU Tensor Core utilization. Furthermore, we develop an\nefficient matrix preprocessing method that optimizes data layout for subsequent\ncomputations. Finally, we design a data recovery-oriented memory management\nsystem that strategically utilizes fast shared memory, significantly enhancing\nkernel execution speed and minimizing memory access latency. Experimental\nresults demonstrate our approach's effectiveness, with up to 13\\times speedup\nin matrix multiplication compared to NVIDIA's CUTLASS. When integrated into\nLLMs, we achieve up to 6.7\\times inference acceleration. These improvements\nsignificantly enhance LLM inference efficiency, enabling broader and more\nresponsive applications of LLMs.\n","authors":["Shaobo Ma","Chao Fang","Haikuo Shao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05208v3","updated":"2024-09-26T14:16:01Z","published":"2023-10-08T15:49:36Z","title":"ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot\n Coordination","summary":" Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement\nlearning (MARL) challenge that aims to train an ego agent to work with diverse,\nunseen partners during deployment. The significant difference between the\ndeployment-time partners' distribution and the training partners' distribution\ndetermined by the training algorithm makes ZSC a unique out-of-distribution\n(OOD) generalization challenge. The potential distribution gap between\nevaluation and deployment-time partners leads to inadequate evaluation, which\nis exacerbated by the lack of appropriate evaluation metrics. In this paper, we\npresent ZSC-Eval, the first evaluation toolkit and benchmark for ZSC\nalgorithms. ZSC-Eval consists of: 1) Generation of evaluation partner\ncandidates through behavior-preferring rewards to approximate deployment-time\npartners' distribution; 2) Selection of evaluation partners by Best-Response\nDiversity (BR-Div); 3) Measurement of generalization performance with various\nevaluation partners via the Best-Response Proximity (BR-Prox) metric. We use\nZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football\nenvironments and get novel empirical findings. We also conduct a human\nexperiment of current ZSC algorithms to verify the ZSC-Eval's consistency with\nhuman evaluation. ZSC-Eval is now available at\nhttps://github.com/sjtu-marl/ZSC-Eval.\n","authors":["Xihuai Wang","Shao Zhang","Wenhao Zhang","Wentao Dong","Jingxiao Chen","Ying Wen","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.05208v3.pdf","comment":"Accepted in NeurIPS 2024 Dataset and Benchmark Track"},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17858v1","updated":"2024-09-26T14:05:32Z","published":"2024-09-26T14:05:32Z","title":"How Feature Learning Can Improve Neural Scaling Laws","summary":" We develop a solvable model of neural scaling laws beyond the kernel limit.\nTheoretical analysis of this model shows how performance scales with model\nsize, training time, and the total amount of available data. We identify three\nscaling regimes corresponding to varying task difficulties: hard, easy, and\nsuper easy tasks. For easy and super-easy target functions, which lie in the\nreproducing kernel Hilbert space (RKHS) defined by the initial infinite-width\nNeural Tangent Kernel (NTK), the scaling exponents remain unchanged between\nfeature learning and kernel regime models. For hard tasks, defined as those\noutside the RKHS of the initial NTK, we demonstrate both analytically and\nempirically that feature learning can improve scaling with training time and\ncompute, nearly doubling the exponent for hard tasks. This leads to a different\ncompute optimal strategy to scale parameters and training time in the feature\nlearning regime. We support our finding that feature learning improves the\nscaling law for hard tasks but not for easy and super-easy tasks with\nexperiments of nonlinear MLPs fitting functions with power-law Fourier spectra\non the circle and CNNs learning vision tasks.\n","authors":["Blake Bordelon","Alexander Atanasov","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2409.17858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17852v1","updated":"2024-09-26T13:58:06Z","published":"2024-09-26T13:58:06Z","title":"AMARO: All Heavy-Atom Transferable Neural Network Potentials of Protein\n Thermodynamics","summary":" All-atom molecular simulations offer detailed insights into macromolecular\nphenomena, but their substantial computational cost hinders the exploration of\ncomplex biological processes. We introduce Advanced Machine-learning Atomic\nRepresentation Omni-force-field (AMARO), a new neural network potential (NNP)\nthat combines an O(3)-equivariant message-passing neural network architecture,\nTensorNet, with a coarse-graining map that excludes hydrogen atoms. AMARO\ndemonstrates the feasibility of training coarser NNP, without prior energy\nterms, to run stable protein dynamics with scalability and generalization\ncapabilities.\n","authors":["Antonio Mirarchi","Raul P. Pelaez","Guillem Simeon","Gianni De Fabritiis"],"pdf_url":"https://arxiv.org/pdf/2409.17852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17841v1","updated":"2024-09-26T13:45:36Z","published":"2024-09-26T13:45:36Z","title":"Machine Learning-based vs Deep Learning-based Anomaly Detection in\n Multivariate Time Series for Spacecraft Attitude Sensors","summary":" In the framework of Failure Detection, Isolation and Recovery (FDIR) on\nspacecraft, new AI-based approaches are emerging in the state of the art to\novercome the limitations commonly imposed by traditional threshold checking.\n The present research aims at characterizing two different approaches to the\nproblem of stuck values detection in multivariate time series coming from\nspacecraft attitude sensors. The analysis reveals the performance differences\nin the two approaches, while commenting on their interpretability and\ngeneralization to different scenarios.\n","authors":["R. Gallon","F. Schiemenz","A. Krstova","A. Menicucci","E. Gill"],"pdf_url":"https://arxiv.org/pdf/2409.17841v1.pdf","comment":"Accepted for the ESA SPAICE Conference 2024"},{"id":"http://arxiv.org/abs/2409.17836v1","updated":"2024-09-26T13:38:33Z","published":"2024-09-26T13:38:33Z","title":"Language Models as Zero-shot Lossless Gradient Compressors: Towards\n General Neural Parameter Prior Models","summary":" Despite the widespread use of statistical prior models in various fields,\nsuch models for neural network gradients have long been overlooked. The\ninherent challenge stems from their high-dimensional structures and complex\ninterdependencies, which complicate effective modeling. In this work, we\ndemonstrate the potential of large language models (LLMs) to act as gradient\npriors in a zero-shot setting. We examine the property by considering lossless\ngradient compression -- a critical application in distributed learning -- that\ndepends heavily on precise probability modeling. To achieve this, we introduce\nLM-GC, a novel method that integrates LLMs with arithmetic coding. Our\ntechnique converts plain gradients into text-like formats, enhancing token\nefficiency by up to 38 times compared to their plain representations. We ensure\nthat this data conversion maintains a close alignment with the structure of\nplain gradients and the symbols commonly recognized by LLMs. Our experiments\nindicate that LM-GC surpasses existing state-of-the-art lossless compression\nmethods, improving compression rates by 10\\% up to 17.2\\% across various\ndatasets and architectures. Additionally, our approach shows promising\ncompatibility with lossy compression techniques such as quantization and\nsparsification. These findings highlight the significant potential of LLMs as a\nmodel for effectively handling gradients. We will release the source code upon\npublication.\n","authors":["Hui-Po Wang","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2409.17836v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17833v1","updated":"2024-09-26T13:35:42Z","published":"2024-09-26T13:35:42Z","title":"Ordinary Differential Equations for Enhanced 12-Lead ECG Generation","summary":" In the realm of artificial intelligence, the generation of realistic training\ndata for supervised learning tasks presents a significant challenge. This is\nparticularly true in the synthesis of electrocardiograms (ECGs), where the\nobjective is to develop a synthetic 12-lead ECG model. The primary complexity\nof this task stems from accurately modeling the intricate biological and\nphysiological interactions among different ECG leads. Although mathematical\nprocess simulators have shed light on these dynamics, effectively incorporating\nthis understanding into generative models is not straightforward. In this work,\nwe introduce an innovative method that employs ordinary differential equations\n(ODEs) to enhance the fidelity of generating 12-lead ECG data. This approach\nintegrates a system of ODEs that represent cardiac dynamics directly into the\ngenerative model's optimization process, allowing for the production of\nbiologically plausible ECG training data that authentically reflects real-world\nvariability and inter-lead dependencies. We conducted an empirical analysis of\nthousands of ECGs and found that incorporating cardiac simulation insights into\nthe data generation process significantly improves the accuracy of heart\nabnormality classifiers trained on this synthetic 12-lead ECG data.\n","authors":["Yakir Yehuda","Kira Radinsky"],"pdf_url":"https://arxiv.org/pdf/2409.17833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00551v2","updated":"2024-09-26T13:34:41Z","published":"2024-06-01T20:46:40Z","title":"Strategic Linear Contextual Bandits","summary":" Motivated by the phenomenon of strategic agents gaming a recommender system\nto maximize the number of times they are recommended to users, we study a\nstrategic variant of the linear contextual bandit problem, where the arms can\nstrategically misreport privately observed contexts to the learner. We treat\nthe algorithm design problem as one of mechanism design under uncertainty and\npropose the Optimistic Grim Trigger Mechanism (OptGTM) that incentivizes the\nagents (i.e., arms) to report their contexts truthfully while simultaneously\nminimizing regret. We also show that failing to account for the strategic\nnature of the agents results in linear regret. However, a trade-off between\nmechanism design and regret minimization appears to be unavoidable. More\nbroadly, this work aims to provide insight into the intersection of online\nlearning and mechanism design.\n","authors":["Thomas Kleine Buening","Aadirupa Saha","Christos Dimitrakakis","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.00551v2.pdf","comment":"To appear at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.04769v2","updated":"2024-09-26T13:31:40Z","published":"2024-06-07T09:15:29Z","title":"Diffusion-based Generative Image Outpainting for Recovery of\n FOV-Truncated CT Images","summary":" Field-of-view (FOV) recovery of truncated chest CT scans is crucial for\naccurate body composition analysis, which involves quantifying skeletal muscle\nand subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables\ndisease prognostication. Here, we present a method for recovering truncated CT\nslices using generative image outpainting. We train a diffusion model and apply\nit to truncated CT slices generated by simulating a small FOV. Our model\nreliably recovers the truncated anatomy and outperforms the previous\nstate-of-the-art despite being trained on 87% less data.\n","authors":["Michelle Espranita Liman","Daniel Rueckert","Florian J. Fintelmann","Philip Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04769v2.pdf","comment":"Shared last authorship: Florian J. Fintelmann and Philip M\\\"uller"},{"id":"http://arxiv.org/abs/2409.17113v2","updated":"2024-09-26T13:30:51Z","published":"2024-09-25T17:27:02Z","title":"Characterizing stable regions in the residual stream of LLMs","summary":" We identify \"stable regions\" in the residual stream of Transformers, where\nthe model's output remains insensitive to small activation changes, but\nexhibits high sensitivity at region boundaries. These regions emerge during\ntraining and become more defined as training progresses or model size\nincreases. The regions appear to be much larger than previously studied\npolytopes. Our analysis suggests that these stable regions align with semantic\ndistinctions, where similar prompts cluster within regions, and activations\nfrom the same region lead to similar next token predictions. This work provides\na promising research direction for understanding the complexity of neural\nnetworks, shedding light on training dynamics, and advancing interpretability.\n","authors":["Jett Janiak","Jacek Karwowski","Chatrik Singh Mangat","Giorgi Giglemiani","Nora Petrova","Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2409.17113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14372v2","updated":"2024-09-26T13:23:54Z","published":"2024-05-23T09:48:48Z","title":"Learning Constrained Markov Decision Processes With Non-stationary\n Rewards and Constraints","summary":" In constrained Markov decision processes (CMDPs) with adversarial rewards and\nconstraints, a well-known impossibility result prevents any algorithm from\nattaining both sublinear regret and sublinear constraint violation, when\ncompeting against a best-in-hindsight policy that satisfies constraints on\naverage. In this paper, we show that this negative result can be eased in CMDPs\nwith non-stationary rewards and constraints, by providing algorithms whose\nperformances smoothly degrade as non-stationarity increases. Specifically, we\npropose algorithms attaining $\\tilde{\\mathcal{O}} (\\sqrt{T} + C)$ regret and\npositive constraint violation under bandit feedback, where $C$ is a corruption\nvalue measuring the environment non-stationarity. This can be $\\Theta(T)$ in\nthe worst case, coherently with the impossibility result for adversarial CMDPs.\nFirst, we design an algorithm with the desired guarantees when $C$ is known.\nThen, in the case $C$ is unknown, we show how to obtain the same results by\nembedding such an algorithm in a general meta-procedure. This is of independent\ninterest, as it can be applied to any non-stationary constrained online\nlearning setting.\n","authors":["Francesco Emanuele Stradi","Anna Lunghi","Matteo Castiglioni","Alberto Marchesi","Nicola Gatti"],"pdf_url":"https://arxiv.org/pdf/2405.14372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17825v1","updated":"2024-09-26T13:22:22Z","published":"2024-09-26T13:22:22Z","title":"Physics-aligned Schrödinger bridge","summary":" The reconstruction of physical fields from sparse measurements is pivotal in\nboth scientific research and engineering applications. Traditional methods are\nincreasingly supplemented by deep learning models due to their efficacy in\nextracting features from data. However, except for the low accuracy on complex\nphysical systems, these models often fail to comply with essential physical\nconstraints, such as governing equations and boundary conditions. To overcome\nthis limitation, we introduce a novel data-driven field reconstruction\nframework, termed the Physics-aligned Schr\\\"{o}dinger Bridge (PalSB). This\nframework leverages a diffusion Schr\\\"{o}dinger bridge mechanism that is\nspecifically tailored to align with physical constraints. The PalSB approach\nincorporates a dual-stage training process designed to address both local\nreconstruction mapping and global physical principles. Additionally, a\nboundary-aware sampling technique is implemented to ensure adherence to\nphysical boundary conditions. We demonstrate the effectiveness of PalSB through\nits application to three complex nonlinear systems: cylinder flow from Particle\nImage Velocimetry experiments, two-dimensional turbulence, and a\nreaction-diffusion system. The results reveal that PalSB not only achieves\nhigher accuracy but also exhibits enhanced compliance with physical constraints\ncompared to existing methods. This highlights PalSB's capability to generate\nhigh-quality representations of intricate physical interactions, showcasing its\npotential for advancing field reconstruction techniques.\n","authors":["Zeyu Li","Hongkun Dou","Shen Fang","Wang Han","Yue Deng","Lijun Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17808v1","updated":"2024-09-26T13:02:28Z","published":"2024-09-26T13:02:28Z","title":"Generative Modeling of Molecular Dynamics Trajectories","summary":" Molecular dynamics (MD) is a powerful technique for studying microscopic\nphenomena, but its computational cost has driven significant interest in the\ndevelopment of deep learning-based surrogate models. We introduce generative\nmodeling of molecular trajectories as a paradigm for learning flexible\nmulti-task surrogate models of MD from data. By conditioning on appropriately\nchosen frames of the trajectory, we show such generative models can be adapted\nto diverse tasks such as forward simulation, transition path sampling, and\ntrajectory upsampling. By alternatively conditioning on part of the molecular\nsystem and inpainting the rest, we also demonstrate the first steps towards\ndynamics-conditioned molecular design. We validate the full set of these\ncapabilities on tetrapeptide simulations and show that our model can produce\nreasonable ensembles of protein monomers. Altogether, our work illustrates how\ngenerative modeling can unlock value from MD data towards diverse downstream\ntasks that are not straightforward to address with existing methods or even MD\nitself. Code is available at https://github.com/bjing2016/mdgen.\n","authors":["Bowen Jing","Hannes Stärk","Tommi Jaakkola","Bonnie Berger"],"pdf_url":"https://arxiv.org/pdf/2409.17808v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17806v1","updated":"2024-09-26T12:59:09Z","published":"2024-09-26T12:59:09Z","title":"Continual learning with task specialist","summary":" Continual learning (CL) adapt the deep learning scenarios with timely updated\ndatasets. However, existing CL models suffer from the catastrophic forgetting\nissue, where new knowledge replaces past learning. In this paper, we propose\nContinual Learning with Task Specialists (CLTS) to address the issues of\ncatastrophic forgetting and limited labelled data in real-world datasets by\nperforming class incremental learning of the incoming stream of data. The model\nconsists of Task Specialists (T S) and Task Predictor (T P ) with pre-trained\nStable Diffusion (SD) module. Here, we introduce a new specialist to handle a\nnew task sequence and each T S has three blocks; i) a variational autoencoder\n(V AE) to learn the task distribution in a low dimensional latent space, ii) a\nK-Means block to perform data clustering and iii) Bootstrapping Language-Image\nPre-training (BLIP ) model to generate a small batch of captions from the input\ndata. These captions are fed as input to the pre-trained stable diffusion model\n(SD) for the generation of task samples. The proposed model does not store any\ntask samples for replay, instead uses generated samples from SD to train the T\nP module. A comparison study with four SOTA models conducted on three\nreal-world datasets shows that the proposed model outperforms all the selected\nbaselines\n","authors":["Indu Solomon","Aye Phyu Phyu Aung","Uttam Kumar","Senthilnath Jayavelu"],"pdf_url":"https://arxiv.org/pdf/2409.17806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17804v1","updated":"2024-09-26T12:57:47Z","published":"2024-09-26T12:57:47Z","title":"Enriched Functional Tree-Based Classifiers: A Novel Approach Leveraging\n Derivatives and Geometric Features","summary":" The positioning of this research falls within the scalar-on-function\nclassification literature, a field of significant interest across various\ndomains, particularly in statistics, mathematics, and computer science. This\nstudy introduces an advanced methodology for supervised classification by\nintegrating Functional Data Analysis (FDA) with tree-based ensemble techniques\nfor classifying high-dimensional time series. The proposed framework, Enriched\nFunctional Tree-Based Classifiers (EFTCs), leverages derivative and geometric\nfeatures, benefiting from the diversity inherent in ensemble methods to further\nenhance predictive performance and reduce variance. While our approach has been\ntested on the enrichment of Functional Classification Trees (FCTs), Functional\nK-NN (FKNN), Functional Random Forest (FRF), Functional XGBoost (FXGB), and\nFunctional LightGBM (FLGBM), it could be extended to other tree-based and\nnon-tree-based classifiers, with appropriate considerations emerging from this\ninvestigation. Through extensive experimental evaluations on seven real-world\ndatasets and six simulated scenarios, this proposal demonstrates fascinating\nimprovements over traditional approaches, providing new insights into the\napplication of FDA in complex, high-dimensional learning problems.\n","authors":["Fabrizio Maturo","Annamaria Porreca"],"pdf_url":"https://arxiv.org/pdf/2409.17804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17790v1","updated":"2024-09-26T12:37:22Z","published":"2024-09-26T12:37:22Z","title":"CASPFormer: Trajectory Prediction from BEV Images with Deformable\n Attention","summary":" Motion prediction is an important aspect for Autonomous Driving (AD) and\nAdvance Driver Assistance Systems (ADAS). Current state-of-the-art motion\nprediction methods rely on High Definition (HD) maps for capturing the\nsurrounding context of the ego vehicle. Such systems lack scalability in\nreal-world deployment as HD maps are expensive to produce and update in\nreal-time. To overcome this issue, we propose Context Aware Scene Prediction\nTransformer (CASPFormer), which can perform multi-modal motion prediction from\nrasterized Bird-Eye-View (BEV) images. Our system can be integrated with any\nupstream perception module that is capable of generating BEV images. Moreover,\nCASPFormer directly decodes vectorized trajectories without any postprocessing.\nTrajectories are decoded recurrently using deformable attention, as it is\ncomputationally efficient and provides the network with the ability to focus\nits attention on the important spatial locations of the BEV images. In\naddition, we also address the issue of mode collapse for generating multiple\nscene-consistent trajectories by incorporating learnable mode queries. We\nevaluate our model on the nuScenes dataset and show that it reaches\nstate-of-the-art across multiple metrics\n","authors":["Harsh Yadav","Maximilian Schaefer","Kun Zhao","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.17790v1.pdf","comment":"Under Review at ICPR 2024, Kolkata"},{"id":"http://arxiv.org/abs/2409.06364v2","updated":"2024-09-26T12:33:46Z","published":"2024-09-10T09:42:58Z","title":"What happens to diffusion model likelihood when your model is\n conditional?","summary":" Diffusion Models (DMs) iteratively denoise random samples to produce\nhigh-quality data. The iterative sampling process is derived from Stochastic\nDifferential Equations (SDEs), allowing a speed-quality trade-off chosen at\ninference. Another advantage of sampling with differential equations is exact\nlikelihood computation. These likelihoods have been used to rank unconditional\nDMs and for out-of-domain classification. Despite the many existing and\npossible uses of DM likelihoods, the distinct properties captured are unknown,\nespecially in conditional contexts such as Text-To-Image (TTI) or\nText-To-Speech synthesis (TTS). Surprisingly, we find that TTS DM likelihoods\nare agnostic to the text input. TTI likelihood is more expressive but cannot\ndiscern confounding prompts. Our results show that applying DMs to conditional\ntasks reveals inconsistencies and strengthens claims that the properties of DM\nlikelihood are unknown. This impact sheds light on the previously unknown\nnature of DM likelihoods. Although conditional DMs maximise likelihood, the\nlikelihood in question is not as sensitive to the conditioning input as one\nexpects. This investigation provides a new point-of-view on diffusion\nlikelihoods.\n","authors":["Mattias Cross","Anton Ragni"],"pdf_url":"https://arxiv.org/pdf/2409.06364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14590v2","updated":"2024-09-26T12:29:45Z","published":"2024-09-22T20:47:04Z","title":"Explainable AI needs formal notions of explanation correctness","summary":" The use of machine learning (ML) in critical domains such as medicine poses\nrisks and requires regulation. One requirement is that decisions of ML systems\nin high-risk applications should be human-understandable. The field of\n\"explainable artificial intelligence\" (XAI) seemingly addresses this need.\nHowever, in its current form, XAI is unfit to provide quality control for ML;\nit itself needs scrutiny. Popular XAI methods cannot reliably answer important\nquestions about ML models, their training data, or a given test input. We\nrecapitulate results demonstrating that popular XAI methods systematically\nattribute importance to input features that are independent of the prediction\ntarget. This limits their utility for purposes such as model and data\n(in)validation, model improvement, and scientific discovery. We argue that the\nfundamental reason for this limitation is that current XAI methods do not\naddress well-defined problems and are not evaluated against objective criteria\nof explanation correctness. Researchers should formally define the problems\nthey intend to solve first and then design methods accordingly. This will lead\nto notions of explanation correctness that can be theoretically verified and\nobjective metrics of explanation performance that can be assessed using\nground-truth data.\n","authors":["Stefan Haufe","Rick Wilming","Benedict Clark","Rustam Zhumagambetov","Danny Panknin","Ahcène Boubekki"],"pdf_url":"https://arxiv.org/pdf/2409.14590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17786v1","updated":"2024-09-26T12:29:13Z","published":"2024-09-26T12:29:13Z","title":"Predicting the Stay Length of Patients in Hospitals using Convolutional\n Gated Recurrent Deep Learning Model","summary":" Predicting hospital length of stay (LoS) stands as a critical factor in\nshaping public health strategies. This data serves as a cornerstone for\ngovernments to discern trends, patterns, and avenues for enhancing healthcare\ndelivery. In this study, we introduce a robust hybrid deep learning model, a\ncombination of Multi-layer Convolutional (CNNs) deep learning, Gated Recurrent\nUnits (GRU), and Dense neural networks, that outperforms 11 conventional and\nstate-of-the-art Machine Learning (ML) and Deep Learning (DL) methodologies in\naccurately forecasting inpatient hospital stay duration. Our investigation\ndelves into the implementation of this hybrid model, scrutinising variables\nlike geographic indicators tied to caregiving institutions, demographic markers\nencompassing patient ethnicity, race, and age, as well as medical attributes\nsuch as the CCS diagnosis code, APR DRG code, illness severity metrics, and\nhospital stay duration. Statistical evaluations reveal the pinnacle LoS\naccuracy achieved by our proposed model (CNN-GRU-DNN), which averages at 89%\nacross a 10-fold cross-validation test, surpassing LSTM, BiLSTM, GRU, and\nConvolutional Neural Networks (CNNs) by 19%, 18.2%, 18.6%, and 7%,\nrespectively. Accurate LoS predictions not only empower hospitals to optimise\nresource allocation and curb expenses associated with prolonged stays but also\npave the way for novel strategies in hospital stay management. This avenue\nholds promise for catalysing advancements in healthcare research and\ninnovation, inspiring a new era of precision-driven healthcare practices.\n","authors":["Mehdi Neshat","Michael Phipps","Chris A. Browne","Nicole T. Vargas","Seyedali Mirjalili"],"pdf_url":"https://arxiv.org/pdf/2409.17786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08757v4","updated":"2024-09-26T12:17:24Z","published":"2024-03-13T17:55:34Z","title":"Efficient Combinatorial Optimization via Heat Diffusion","summary":" Combinatorial optimization problems are widespread but inherently challenging\ndue to their discrete nature. The primary limitation of existing methods is\nthat they can only access a small fraction of the solution space at each\niteration, resulting in limited efficiency for searching the global optimal. To\novercome this challenge, diverging from conventional efforts of expanding the\nsolver's search scope, we focus on enabling information to actively propagate\nto the solver through heat diffusion. By transforming the target function while\npreserving its optima, heat diffusion facilitates information flow from distant\nregions to the solver, providing more efficient navigation. Utilizing heat\ndiffusion, we propose a framework for solving general combinatorial\noptimization problems. The proposed methodology demonstrates superior\nperformance across a range of the most challenging and widely encountered\ncombinatorial optimizations. Echoing recent advancements in harnessing\nthermodynamics for generative artificial intelligence, our study further\nreveals its significant potential in advancing combinatorial optimization.\n","authors":["Hengyuan Ma","Wenlian Lu","Jianfeng Feng"],"pdf_url":"https://arxiv.org/pdf/2403.08757v4.pdf","comment":"After the rebuttal version for NeurIPS 2024 (poster). Code is\n available in https://github.com/AwakerMhy/HeO"},{"id":"http://arxiv.org/abs/2309.16928v3","updated":"2024-09-26T12:09:22Z","published":"2023-09-29T02:04:24Z","title":"Learning to Receive Help: Intervention-Aware Concept Embedding Models","summary":" Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures\nby constructing and explaining their predictions using a set of high-level\nconcepts. A special property of these models is that they permit concept\ninterventions, wherein users can correct mispredicted concepts and thus improve\nthe model's performance. Recent work, however, has shown that intervention\nefficacy can be highly dependent on the order in which concepts are intervened\non and on the model's architecture and training hyperparameters. We argue that\nthis is rooted in a CBM's lack of train-time incentives for the model to be\nappropriately receptive to concept interventions. To address this, we propose\nIntervention-aware Concept Embedding models (IntCEMs), a novel CBM-based\narchitecture and training paradigm that improves a model's receptiveness to\ntest-time interventions. Our model learns a concept intervention policy in an\nend-to-end fashion from where it can sample meaningful intervention\ntrajectories at train-time. This conditions IntCEMs to effectively select and\nreceive concept interventions when deployed at test-time. Our experiments show\nthat IntCEMs significantly outperform state-of-the-art concept-interpretable\nmodels when provided with test-time concept interventions, demonstrating the\neffectiveness of our approach.\n","authors":["Mateo Espinosa Zarlenga","Katherine M. Collins","Krishnamurthy Dvijotham","Adrian Weller","Zohreh Shams","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2309.16928v3.pdf","comment":"Accepted as a spotlight at the Thirty-seventh Conference on Neural\n Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2408.10672v2","updated":"2024-09-26T11:42:31Z","published":"2024-08-20T09:17:11Z","title":"Neural Exploratory Landscape Analysis","summary":" Recent research in Meta-Black-Box Optimization (MetaBBO) have shown that\nmeta-trained neural networks can effectively guide the design of black-box\noptimizers, significantly reducing the need for expert tuning and delivering\nrobust performance across complex problem distributions. Despite their success,\na paradox remains: MetaBBO still rely on human-crafted Exploratory Landscape\nAnalysis features to inform the meta-level agent about the low-level\noptimization progress. To address the gap, this paper proposes Neural\nExploratory Landscape Analysis (NeurELA), a novel framework that dynamically\nprofiles landscape features through a two-stage, attention-based neural\nnetwork, executed in an entirely end-to-end fashion. NeurELA is pre-trained\nover a variety of MetaBBO algorithms using a multi-task neuroevolution\nstrategy. Extensive experiments show that NeurELA achieves consistently\nsuperior performance when integrated into different and even unseen MetaBBO\ntasks and can be efficiently fine-tuned for further performance boost. This\nadvancement marks a pivotal step in making MetaBBO algorithms more autonomous\nand broadly applicable.The source code of NeurELA can be accessed at\nhttps://anonymous.4open.science/r/Neur-ELA-303C.\n","authors":["Zeyuan Ma","Jiacheng Chen","Hongshu Guo","Yue-Jiao Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2001.07495v5","updated":"2024-09-26T11:42:25Z","published":"2020-01-21T13:05:31Z","title":"Unsupervisedly Learned Representations: Should the Quest be Over?","summary":" After four decades of research there still exists a Classification accuracy\ngap of about 20% between our best Unsupervisedly Learned Representations\nmethods and the accuracy rates achieved by intelligent animals. It thus may\nwell be that we are looking in the wrong direction. A possible solution to this\npuzzle is presented. We demonstrate that Reinforcement Learning can learn\nrepresentations which achieve the same accuracy as that of animals. Our main\nmodest contribution lies in the observations that: a. when applied to a real\nworld environment Reinforcement Learning does not require labels, and thus may\nbe legitimately considered as Unsupervised Learning, and b. in contrast, when\nReinforcement Learning is applied in a simulated environment it does inherently\nrequire labels and should thus be generally be considered as Supervised\nLearning. The corollary of these observations is that further search for\nUnsupervised Learning competitive paradigms which may be trained in simulated\nenvironments may be futile.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2001.07495v5.pdf","comment":"To be published at The 6th International Conference on Machine\n Learning, Optimization and Data Science - LOD 2020"},{"id":"http://arxiv.org/abs/2409.17754v1","updated":"2024-09-26T11:36:08Z","published":"2024-09-26T11:36:08Z","title":"Byzantine-Robust Aggregation for Securing Decentralized Federated\n Learning","summary":" Federated Learning (FL) emerges as a distributed machine learning approach\nthat addresses privacy concerns by training AI models locally on devices.\nDecentralized Federated Learning (DFL) extends the FL paradigm by eliminating\nthe central server, thereby enhancing scalability and robustness through the\navoidance of a single point of failure. However, DFL faces significant\nchallenges in optimizing security, as most Byzantine-robust algorithms proposed\nin the literature are designed for centralized scenarios. In this paper, we\npresent a novel Byzantine-robust aggregation algorithm to enhance the security\nof Decentralized Federated Learning environments, coined WFAgg. This proposal\nhandles the adverse conditions and strength robustness of dynamic decentralized\ntopologies at the same time by employing multiple filters to identify and\nmitigate Byzantine attacks. Experimental results demonstrate the effectiveness\nof the proposed algorithm in maintaining model accuracy and convergence in the\npresence of various Byzantine attack scenarios, outperforming state-of-the-art\ncentralized Byzantine-robust aggregation schemes (such as Multi-Krum or\nClustering). These algorithms are evaluated on an IID image classification\nproblem in both centralized and decentralized scenarios.\n","authors":["Diego Cajaraville-Aboy","Ana Fernández-Vilas","Rebeca P. Díaz-Redondo","Manuel Fernández-Veiga"],"pdf_url":"https://arxiv.org/pdf/2409.17754v1.pdf","comment":"18 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17730v1","updated":"2024-09-26T11:00:19Z","published":"2024-09-26T11:00:19Z","title":"Autoregressive Generation Strategies for Top-K Sequential\n Recommendations","summary":" The goal of modern sequential recommender systems is often formulated in\nterms of next-item prediction. In this paper, we explore the applicability of\ngenerative transformer-based models for the Top-K sequential recommendation\ntask, where the goal is to predict items a user is likely to interact with in\nthe \"near future\".\n We explore commonly used autoregressive generation strategies, including\ngreedy decoding, beam search, and temperature sampling, to evaluate their\nperformance for the Top-K sequential recommendation task. In addition, we\npropose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA)\ngeneration strategies based on multi-sequence generation with temperature\nsampling and subsequent aggregation.\n Experiments on diverse datasets give valuable insights regarding commonly\nused strategies' applicability and show that suggested approaches improve\nperformance on longer time horizons compared to widely-used Top-K prediction\napproach and single-sequence autoregressive generation strategies.\n","authors":["Anna Volodkevich","Danil Gusak","Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2409.17730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17726v1","updated":"2024-09-26T10:56:27Z","published":"2024-09-26T10:56:27Z","title":"Recent advances in interpretable machine learning using structure-based\n protein representations","summary":" Recent advancements in machine learning (ML) are transforming the field of\nstructural biology. For example, AlphaFold, a groundbreaking neural network for\nprotein structure prediction, has been widely adopted by researchers. The\navailability of easy-to-use interfaces and interpretable outcomes from the\nneural network architecture, such as the confidence scores used to color the\npredicted structures, have made AlphaFold accessible even to non-ML experts. In\nthis paper, we present various methods for representing protein 3D structures\nfrom low- to high-resolution, and show how interpretable ML methods can support\ntasks such as predicting protein structures, protein function, and\nprotein-protein interactions. This survey also emphasizes the significance of\ninterpreting and visualizing ML-based inference for structure-based protein\nrepresentations that enhance interpretability and knowledge discovery.\nDeveloping such interpretable approaches promises to further accelerate fields\nincluding drug development and protein design.\n","authors":["Luiz Felipe Vecchietti","Minji Lee","Begench Hangeldiyev","Hyunkyu Jung","Hahnbeom Park","Tae-Kyun Kim","Meeyoung Cha","Ho Min Kim"],"pdf_url":"https://arxiv.org/pdf/2409.17726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17716v1","updated":"2024-09-26T10:38:35Z","published":"2024-09-26T10:38:35Z","title":"QuForge: A Library for Qudits Simulation","summary":" Quantum computing with qudits, an extension of qubits to multiple levels, is\na research field less mature than qubit-based quantum computing. However,\nqudits can offer some advantages over qubits, by representing information with\nfewer separated components. In this article, we present QuForge, a Python-based\nlibrary designed to simulate quantum circuits with qudits. This library\nprovides the necessary quantum gates for implementing quantum algorithms,\ntailored to any chosen qudit dimension. Built on top of differentiable\nframeworks, QuForge supports execution on accelerating devices such as GPUs and\nTPUs, significantly speeding up simulations. It also supports sparse\noperations, leading to a reduction in memory consumption compared to other\nlibraries. Additionally, by constructing quantum circuits as differentiable\ngraphs, QuForge facilitates the implementation of quantum machine learning\nalgorithms, enhancing the capabilities and flexibility of quantum computing\nresearch.\n","authors":["Tiago de Souza Farias","Lucas Friedrich","Jonas Maziero"],"pdf_url":"https://arxiv.org/pdf/2409.17716v1.pdf","comment":"18 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17711v1","updated":"2024-09-26T10:27:19Z","published":"2024-09-26T10:27:19Z","title":"Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation","summary":" News recommendation is a challenging task that involves personalization based\non the interaction history and preferences of each user. Recent works have\nleveraged the power of pretrained language models (PLMs) to directly rank news\nitems by using inference approaches that predominately fall into three\ncategories: pointwise, pairwise, and listwise learning-to-rank. While pointwise\nmethods offer linear inference complexity, they fail to capture crucial\ncomparative information between items that is more effective for ranking tasks.\nConversely, pairwise and listwise approaches excel at incorporating these\ncomparisons but suffer from practical limitations: pairwise approaches are\neither computationally expensive or lack theoretical guarantees, and listwise\nmethods often perform poorly in practice. In this paper, we propose a novel\nframework for PLM-based news recommendation that integrates both pointwise\nrelevance prediction and pairwise comparisons in a scalable manner. We present\na rigorous theoretical analysis of our framework, establishing conditions under\nwhich our approach guarantees improved performance. Extensive experiments show\nthat our approach outperforms the state-of-the-art methods on the MIND and\nAdressa news recommendation datasets.\n","authors":["Nithish Kannen","Yao Ma","Gerrit J. J. van den Burg","Jean Baptiste Faddoul"],"pdf_url":"https://arxiv.org/pdf/2409.17711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15600v2","updated":"2024-09-26T10:26:18Z","published":"2024-08-28T07:48:39Z","title":"Exploring Selective Layer Fine-Tuning in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for fine-tuning\nfoundation models using distributed data in a privacy-preserving manner. Under\nlimited computational resources, clients often find it more practical to\nfine-tune a selected subset of layers, rather than the entire model, based on\ntheir task-specific data. In this study, we provide a thorough theoretical\nexploration of selective layer fine-tuning in FL, emphasizing a flexible\napproach that allows the clients to adjust their selected layers according to\ntheir local data and resources. We theoretically demonstrate that the layer\nselection strategy has a significant impact on model convergence in two\ncritical aspects: the importance of selected layers and the heterogeneous\nchoices across clients. Drawing from these insights, we further propose a\nstrategic layer selection method that utilizes local gradients and regulates\nlayer selections across clients. The extensive experiments on both image and\ntext datasets demonstrate the effectiveness of the proposed strategy compared\nwith several baselines, highlighting its advances in identifying critical\nlayers that adapt to the client heterogeneity and training dynamics in FL.\n","authors":["Yuchang Sun","Yuexiang Xie","Bolin Ding","Yaliang Li","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15953v2","updated":"2024-09-26T10:22:34Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing sequences of interactions between users and items, sequential\nrecommendation models can learn user intent and make predictions about the next\nitem. Next to item interactions, most systems also have interactions with what\nwe call non-item pages: these pages are not related to specific items but still\ncan provide insights of the user's interests, as, for example, navigation\npages.\n We therefore propose a general way to include these non-item pages in\nsequential recommendation models to enhance next-item prediction. First, we\ndemonstrate the influence of non-item pages on following interactions with the\nhypotheses testing framework HypTrails and propose methods for representing\nnon-item pages in sequential recommendation models. Subsequently, we adapt\npopular sequential recommender models to integrate non-item pages and\ninvestigate their performance with different item representation strategies as\nwell as their ability to handle noisy data. To show the general capabilities of\nthe models to integrate non-item pages, we create a synthetic dataset for a\ncontrolled setting and then evaluate the improvements from including non-item\npages on two real-world datasets.\n Our results show that non-item pages are a valuable source of information,\nand incorporating them in sequential recommendation models increases the\nperformance of next-item prediction across all analyzed model architectures.\n","authors":["Elisabeth Fischer","Albin Zehe","Andreas Hotho","Daniel Schlör"],"pdf_url":"https://arxiv.org/pdf/2408.15953v2.pdf","comment":"37 pages, 19 figures; Submitted to ACM TORS"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17704v1","updated":"2024-09-26T10:20:59Z","published":"2024-09-26T10:20:59Z","title":"Transfer Learning in $\\ell_1$ Regularized Regression: Hyperparameter\n Selection Strategy based on Sharp Asymptotic Analysis","summary":" Transfer learning techniques aim to leverage information from multiple\nrelated datasets to enhance prediction quality against a target dataset. Such\nmethods have been adopted in the context of high-dimensional sparse regression,\nand some Lasso-based algorithms have been invented: Trans-Lasso and Pretraining\nLasso are such examples. These algorithms require the statistician to select\nhyperparameters that control the extent and type of information transfer from\nrelated datasets. However, selection strategies for these hyperparameters, as\nwell as the impact of these choices on the algorithm's performance, have been\nlargely unexplored. To address this, we conduct a thorough, precise study of\nthe algorithm in a high-dimensional setting via an asymptotic analysis using\nthe replica method. Our approach reveals a surprisingly simple behavior of the\nalgorithm: Ignoring one of the two types of information transferred to the\nfine-tuning stage has little effect on generalization performance, implying\nthat efforts for hyperparameter selection can be significantly reduced. Our\ntheoretical findings are also empirically supported by real-world applications\non the IMDb dataset.\n","authors":["Koki Okajima","Tomoyuki Obuchi"],"pdf_url":"https://arxiv.org/pdf/2409.17704v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.17703v1","updated":"2024-09-26T10:20:25Z","published":"2024-09-26T10:20:25Z","title":"PGN: The RNN's New Successor is Effective for Long-Range Time Series\n Forecasting","summary":" Due to the recurrent structure of RNN, the long information propagation path\nposes limitations in capturing long-term dependencies, gradient\nexplosion/vanishing issues, and inefficient sequential execution. Based on\nthis, we propose a novel paradigm called Parallel Gated Network (PGN) as the\nnew successor to RNN. PGN directly captures information from previous time\nsteps through the designed Historical Information Extraction (HIE) layer and\nleverages gated mechanisms to select and fuse it with the current time step\ninformation. This reduces the information propagation path to $\\mathcal{O}(1)$,\neffectively addressing the limitations of RNN. To enhance PGN's performance in\nlong-range time series forecasting tasks, we propose a novel temporal modeling\nframework called Temporal PGN (TPGN). TPGN incorporates two branches to\ncomprehensively capture the semantic information of time series. One branch\nutilizes PGN to capture long-term periodic patterns while preserving their\nlocal characteristics. The other branch employs patches to capture short-term\ninformation and aggregate the global representation of the series. TPGN\nachieves a theoretical complexity of $\\mathcal{O}(\\sqrt{L})$, ensuring\nefficiency in its operations. Experimental results on five benchmark datasets\ndemonstrate the state-of-the-art (SOTA) performance and high efficiency of\nTPGN, further confirming the effectiveness of PGN as the new successor to RNN\nin long-range time series forecasting. The code is available in this\nrepository: \\url{https://github.com/Water2sea/TPGN}.\n","authors":["Yuxin Jia","Youfang Lin","Jing Yu","Shuo Wang","Tianhao Liu","Huaiyu Wan"],"pdf_url":"https://arxiv.org/pdf/2409.17703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17699v1","updated":"2024-09-26T10:12:19Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17163v2","updated":"2024-09-26T10:07:06Z","published":"2024-07-24T11:07:20Z","title":"dlordinal: a Python package for deep ordinal classification","summary":" dlordinal is a new Python library that unifies many recent deep ordinal\nclassification methodologies available in the literature. Developed using\nPyTorch as underlying framework, it implements the top performing\nstate-of-the-art deep learning techniques for ordinal classification problems.\nOrdinal approaches are designed to leverage the ordering information present in\nthe target variable. Specifically, it includes loss functions, various output\nlayers, dropout techniques, soft labelling methodologies, and other\nclassification strategies, all of which are appropriately designed to\nincorporate the ordinal information. Furthermore, as the performance metrics to\nassess novel proposals in ordinal classification depend on the distance between\ntarget and predicted classes in the ordinal scale, suitable ordinal evaluation\nmetrics are also included. dlordinal is distributed under the BSD-3-Clause\nlicense and is available at https://github.com/ayrna/dlordinal.\n","authors":["Francisco Bérchez-Moreno","Víctor M. Vargas","Rafael Ayllón-Gavilán","David Guijo-Rubio","César Hervás-Martínez","Juan C. Fernández","Pedro A. Gutiérrez"],"pdf_url":"https://arxiv.org/pdf/2407.17163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2409.17691v1","updated":"2024-09-26T09:56:13Z","published":"2024-09-26T09:56:13Z","title":"Efficient Bias Mitigation Without Privileged Information","summary":" Deep neural networks trained via empirical risk minimisation often exhibit\nsignificant performance disparities across groups, particularly when group and\ntask labels are spuriously correlated (e.g., \"grassy background\" and \"cows\").\nExisting bias mitigation methods that aim to address this issue often either\nrely on group labels for training or validation, or require an extensive\nhyperparameter search. Such data and computational requirements hinder the\npractical deployment of these methods, especially when datasets are too large\nto be group-annotated, computational resources are limited, and models are\ntrained through already complex pipelines. In this paper, we propose Targeted\nAugmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework\nthat leverages the entire training history of a helper model to identify\nspurious samples, and generate a group-balanced training set from which a\nrobust model can be trained. We show that TAB improves worst-group performance\nwithout any group information or model selection, outperforming existing\nmethods while maintaining overall accuracy.\n","authors":["Mateo Espinosa Zarlenga","Swami Sankaranarayanan","Jerone T. A. Andrews","Zohreh Shams","Mateja Jamnik","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.17691v1.pdf","comment":"Accepted at the 18th European Conference on Computer Vision (ECCV\n 2024) as an Oral presentation"},{"id":"http://arxiv.org/abs/2312.05181v3","updated":"2024-09-26T09:52:13Z","published":"2023-12-08T17:08:03Z","title":"Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable\n Tensor Collections","summary":" Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining\ndata, model, and pipeline parallelism, to use large GPU clusters efficiently.\nLong-running jobs may experience changes to their GPU allocation: (i) resource\nelasticity during training adds or removes GPUs; (ii) hardware maintenance may\nrequire redeployment on different GPUs; and (iii) GPU failures force jobs to\nrun with fewer devices. Current DL frameworks tie jobs to a set of GPUs and\nthus lack support for these scenarios. In particular, they cannot change the\nmulti-dimensional parallelism of an already-running job in an efficient and\nmodel-independent way.\n We describe Scalai, a state management library for DL systems that enables\njobs to change their parallelism dynamically after the GPU allocation is\nupdated at runtime. Scalai achieves this through a new abstraction, a\nparallelizable tensor collection (PTC), that externalizes the job state during\ntraining. After a GPU change, Scalai uses the PTC to transform the job state:\nthe PTC repartitions the dataset state under data parallelism and exposes it to\nDL workers through a virtual file system; and the PTC obtains the model state\nas partitioned checkpoints and transforms them to reflect the new\nparallelization configuration. For efficiency, Scalai executes PTC\ntransformations in parallel with minimum data movement between workers. Our\nexperiments show that Scalai enables DL jobs to support dynamic parallelization\nwith low overhead.\n","authors":["Marcel Wagenländer","Guo Li","Bo Zhao","Luo Mai","Peter Pietzuch"],"pdf_url":"https://arxiv.org/pdf/2312.05181v3.pdf","comment":"The 30th Symposium on Operating Systems Principles (SOSP24)"},{"id":"http://arxiv.org/abs/2409.17687v1","updated":"2024-09-26T09:51:29Z","published":"2024-09-26T09:51:29Z","title":"Graph Edit Distance with General Costs Using Neural Set Divergence","summary":" Graph Edit Distance (GED) measures the (dis-)similarity between two given\ngraphs, in terms of the minimum-cost edit sequence that transforms one graph to\nthe other. However, the exact computation of GED is NP-Hard, which has recently\nmotivated the design of neural methods for GED estimation. However, they do not\nexplicitly account for edit operations with different costs. In response, we\npropose GRAPHEDX, a neural GED estimator that can work with general costs\nspecified for the four edit operations, viz., edge deletion, edge addition,\nnode deletion and node addition. We first present GED as a quadratic assignment\nproblem (QAP) that incorporates these four costs. Then, we represent each graph\nas a set of node and edge embeddings and use them to design a family of neural\nset divergence surrogates. We replace the QAP terms corresponding to each\noperation with their surrogates. Computing such neural set divergence require\naligning nodes and edges of the two graphs. We learn these alignments using a\nGumbel-Sinkhorn permutation generator, additionally ensuring that the node and\nedge alignments are consistent with each other. Moreover, these alignments are\ncognizant of both the presence and absence of edges between node-pairs.\nExperiments on several datasets, under a variety of edit cost settings, show\nthat GRAPHEDX consistently outperforms state-of-the-art methods and heuristics\nin terms of prediction error.\n","authors":["Eeshaan Jain","Indradyumna Roy","Saswat Meher","Soumen Chakrabarti","Abir De"],"pdf_url":"https://arxiv.org/pdf/2409.17687v1.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17685v1","updated":"2024-09-26T09:51:08Z","published":"2024-09-26T09:51:08Z","title":"Artificial Data Point Generation in Clustered Latent Space for Small\n Medical Datasets","summary":" One of the growing trends in machine learning is the use of data generation\ntechniques, since the performance of machine learning models is dependent on\nthe quantity of the training dataset. However, in many medical applications,\ncollecting large datasets is challenging due to resource constraints, which\nleads to overfitting and poor generalization. This paper introduces a novel\nmethod, Artificial Data Point Generation in Clustered Latent Space (AGCL),\ndesigned to enhance classification performance on small medical datasets\nthrough synthetic data generation. The AGCL framework involves feature\nextraction, K-means clustering, cluster evaluation based on a class separation\nmetric, and the generation of synthetic data points from clusters with distinct\nclass representations. This method was applied to Parkinson's disease\nscreening, utilizing facial expression data, and evaluated across multiple\nmachine learning classifiers. Experimental results demonstrate that AGCL\nsignificantly improves classification accuracy compared to baseline, GN and\nkNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and\ncross-validation accuracy of 90.90% in majority voting over different emotions,\nconfirming its effectiveness in augmenting small datasets.\n","authors":["Yasaman Haghbin","Hadi Moradi","Reshad Hosseini"],"pdf_url":"https://arxiv.org/pdf/2409.17685v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17684v1","updated":"2024-09-26T09:51:07Z","published":"2024-09-26T09:51:07Z","title":"Preserving logical and functional dependencies in synthetic tabular data","summary":" Dependencies among attributes are a common aspect of tabular data. However,\nwhether existing tabular data generation algorithms preserve these dependencies\nwhile generating synthetic data is yet to be explored. In addition to the\nexisting notion of functional dependencies, we introduce the notion of logical\ndependencies among the attributes in this article. Moreover, we provide a\nmeasure to quantify logical dependencies among attributes in tabular data.\nUtilizing this measure, we compare several state-of-the-art synthetic data\ngeneration algorithms and test their capability to preserve logical and\nfunctional dependencies on several publicly available datasets. We demonstrate\nthat currently available synthetic tabular data generation algorithms do not\nfully preserve functional dependencies when they generate synthetic datasets.\nIn addition, we also showed that some tabular synthetic data generation models\ncan preserve inter-attribute logical dependencies. Our review and comparison of\nthe state-of-the-art reveal research needs and opportunities to develop\ntask-specific synthetic tabular data generation models.\n","authors":["Chaithra Umesh","Kristian Schultz","Manjunath Mahendra","Saparshi Bej","Olaf Wolkenhauer"],"pdf_url":"https://arxiv.org/pdf/2409.17684v1.pdf","comment":"Submitted to Pattern Recognition Journal"},{"id":"http://arxiv.org/abs/2208.13197v2","updated":"2024-09-26T09:40:31Z","published":"2022-08-28T10:47:32Z","title":"IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided\n Feature Extraction","summary":" Disruption prediction has made rapid progress in recent years, especially in\nmachine learning (ML)-based methods. Understanding why a predictor makes a\ncertain prediction can be as crucial as the prediction's accuracy for future\ntokamak disruption predictors. The purpose of most disruption predictors is\naccuracy or cross-machine capability. However, if a disruption prediction model\ncan be interpreted, it can tell why certain samples are classified as\ndisruption precursors. This allows us to tell the types of incoming disruption\nand gives us insight into the mechanism of disruption. This paper designs a\ndisruption predictor called Interpretable Disruption Predictor based On\nPhysics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction\nperformance of the model is effectively improved by extracting physics-guided\nfeatures. A high-performance model is required to ensure the validity of the\ninterpretation results. The interpretability study of IDP-PGFE provides an\nunderstanding of J-TEXT disruption and is generally consistent with existing\ncomprehension of disruption. IDP-PGFE has been applied to the disruption due to\ncontinuously increasing density towards density limit experiments on J-TEXT.\nThe time evolution of the PGFE features contribution demonstrates that the\napplication of ECRH triggers radiation-caused disruption, which lowers the\ndensity at disruption. While the application of RMP indeed raises the density\nlimit in J-TEXT. The interpretability study guides intuition on the physical\nmechanisms of density limit disruption that RMPs affect not only the MHD\ninstabilities but also the radiation profile, which delays density limit\ndisruption.\n","authors":["Chengshuo Shen","Wei Zheng","Yonghua Ding","Xinkun Ai","Fengming Xue","Yu Zhong","Nengchao Wang","Li Gao","Zhipeng Chen","Zhoujun Yang","Zhongyong Chen","Yuan Pan","J-TEXT team"],"pdf_url":"https://arxiv.org/pdf/2208.13197v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2409.17677v1","updated":"2024-09-26T09:36:47Z","published":"2024-09-26T09:36:47Z","title":"Optimal Memorization Capacity of Transformers","summary":" Recent research in the field of machine learning has increasingly focused on\nthe memorization capacity of Transformers, but how efficient they are is not\nyet well understood. We demonstrate that Transformers can memorize labels with\n$\\tilde{O}(\\sqrt{N})$ parameters in a next-token prediction setting for $N$\ninput sequences of length $n$, which is proved to be optimal up to logarithmic\nfactors. This indicates that Transformers can efficiently perform memorization\nwith little influence from the input length $n$ owing to the benefit of\nparameter sharing. We also analyze the memorization capacity in the\nsequence-to-sequence setting, and find that $\\tilde{O}(\\sqrt{nN})$ parameters\nare not only sufficient, but also necessary at least for Transformers with\nhardmax. These results suggest that while self-attention mechanisms can\nefficiently identify input sequences, the feed-forward network becomes a\nbottleneck when associating a label to each token.\n","authors":["Tokio Kajitsuka","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13503v2","updated":"2024-09-26T09:26:05Z","published":"2024-09-20T13:44:00Z","title":"SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous\n Federated Learning Framework","summary":" Traditional federated learning (FL) frameworks rely heavily on terrestrial\nnetworks, where coverage limitations and increasing bandwidth congestion\nsignificantly hinder model convergence. Fortunately, the advancement of\nlow-Earth orbit (LEO) satellite networks offers promising new communication\navenues to augment traditional terrestrial FL. Despite this potential, the\nlimited satellite-ground communication bandwidth and the heterogeneous\noperating environments of ground devices-including variations in data,\nbandwidth, and computing power-pose substantial challenges for effective and\nrobust satellite-assisted FL. To address these challenges, we propose SatFed, a\nresource-efficient satellite-assisted heterogeneous FL framework. SatFed\nimplements freshness-based model prioritization queues to optimize the use of\nhighly constrained satellite-ground bandwidth, ensuring the transmission of the\nmost critical models. Additionally, a multigraph is constructed to capture\nreal-time heterogeneous relationships between devices, including data\ndistribution, terrestrial bandwidth, and computing capability. This multigraph\nenables SatFed to aggregate satellite-transmitted models into peer guidance,\nenhancing local training in heterogeneous environments. Extensive experiments\nwith real-world LEO satellite networks demonstrate that SatFed achieves\nsuperior performance and robustness compared to state-of-the-art benchmarks.\n","authors":["Yuxin Zhang","Zheng Lin","Zhe Chen","Zihan Fang","Wenjun Zhu","Xianhao Chen","Jin Zhao","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2409.13503v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.14816v2","updated":"2024-09-26T09:11:28Z","published":"2024-09-23T08:46:15Z","title":"VARADE: a Variational-based AutoRegressive model for Anomaly Detection\n on the Edge","summary":" Detecting complex anomalies on massive amounts of data is a crucial task in\nIndustry 4.0, best addressed by deep learning. However, available solutions are\ncomputationally demanding, requiring cloud architectures prone to latency and\nbandwidth issues. This work presents VARADE, a novel solution implementing a\nlight autoregressive framework based on variational inference, which is best\nsuited for real-time execution on the edge. The proposed approach was validated\non a robotic arm, part of a pilot production line, and compared with several\nstate-of-the-art algorithms, obtaining the best trade-off between anomaly\ndetection accuracy, power consumption and inference frequency on two different\nedge platforms.\n","authors":["Alessio Mascolini","Sebastiano Gaiardelli","Francesco Ponzio","Nicola Dall'Ora","Enrico Macii","Sara Vinco","Santa Di Cataldo","Franco Fummi"],"pdf_url":"https://arxiv.org/pdf/2409.14816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2409.15246v2","updated":"2024-09-26T08:48:03Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems play a crucial role in achieving Sustainable\nDevelopment Goals by collecting and analyzing vital global data through\nsatellite networks. These systems are essential for tasks like mapping,\ndisaster monitoring, and resource management, but they face challenges in\nprocessing and transmitting large volumes of EO data, especially in specialized\nfields such as agriculture and real-time disaster response. Domain-adapted\nLarge Language Models (LLMs) provide a promising solution by facilitating data\nfusion between extensive EO data and semantic EO data. By improving integration\nand interpretation of diverse datasets, LLMs address the challenges of\nprocessing specialized information in agriculture and disaster response\napplications. This fusion enhances the accuracy and relevance of transmitted\ndata. This paper presents a framework for semantic communication in EO\nsatellite networks, aimed at improving data transmission efficiency and overall\nsystem performance through cognitive processing techniques. The proposed system\nemploys Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic\nData Augmentation (SA) to focus on relevant information while minimizing\ncommunication overhead. By integrating cognitive semantic processing and\ninter-satellite links, the framework enhances the analysis and transmission of\nmultispectral satellite imagery, improving object detection, pattern\nrecognition, and real-time decision-making. The introduction of Cognitive\nSemantic Augmentation (CSA) allows satellites to process and transmit semantic\ninformation, boosting adaptability to changing environments and application\nneeds. This end-to-end architecture is tailored for next-generation satellite\nnetworks, such as those supporting 6G, and demonstrates significant\nimprovements in efficiency and accuracy.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v2.pdf","comment":"18 pages, 10 figures, magazine"},{"id":"http://arxiv.org/abs/2409.17643v1","updated":"2024-09-26T08:46:48Z","published":"2024-09-26T08:46:48Z","title":"Efficient Fairness-Performance Pareto Front Computation","summary":" There is a well known intrinsic trade-off between the fairness of a\nrepresentation and the performance of classifiers derived from the\nrepresentation. Due to the complexity of optimisation algorithms in most modern\nrepresentation learning approaches, for a given method it may be non-trivial to\ndecide whether the obtained fairness-performance curve of the method is\noptimal, i.e., whether it is close to the true Pareto front for these\nquantities for the underlying data distribution.\n In this paper we propose a new method to compute the optimal Pareto front,\nwhich does not require the training of complex representation models. We show\nthat optimal fair representations possess several useful structural properties,\nand that these properties enable a reduction of the computation of the Pareto\nFront to a compact discrete problem. We then also show that these compact\napproximating problems can be efficiently solved via off-the shelf\nconcave-convex programming methods.\n Since our approach is independent of the specific model of representations,\nit may be used as the benchmark to which representation learning algorithms may\nbe compared. We experimentally evaluate the approach on a number of real world\nbenchmark datasets.\n","authors":["Mark Kozdoba","Binyamin Perets","Shie Mannor"],"pdf_url":"https://arxiv.org/pdf/2409.17643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02733v3","updated":"2024-09-26T08:45:22Z","published":"2023-06-05T09:29:46Z","title":"Realising Synthetic Active Inference Agents, Part II: Variational\n Message Updates","summary":" The Free Energy Principle (FEP) describes (biological) agents as minimising a\nvariational Free Energy (FE) with respect to a generative model of their\nenvironment. Active Inference (AIF) is a corollary of the FEP that describes\nhow agents explore and exploit their environment by minimising an expected FE\nobjective. In two related papers, we describe a scalable, epistemic approach to\nsynthetic AIF, by message passing on free-form Forney-style Factor Graphs\n(FFGs). A companion paper (part I) introduces a Constrained FFG (CFFG) notation\nthat visually represents (generalised) FE objectives for AIF. The current paper\n(part II) derives message passing algorithms that minimise (generalised) FE\nobjectives on a CFFG by variational calculus. A comparison between simulated\nBethe and generalised FE agents illustrates how the message passing approach to\nsynthetic AIF induces epistemic behaviour on a T-maze navigation task.\nExtension of the T-maze simulation to 1) learning goal statistics, and 2) a\nmulti-agent bargaining setting, illustrate how this approach encourages reuse\nof nodes and updates in alternative settings. With a full message passing\naccount of synthetic AIF agents, it becomes possible to derive and reuse\nmessage updates across models and move closer to industrial applications of\nsynthetic AIF.\n","authors":["Thijs van de Laar","Magnus Koudahl","Bert de Vries"],"pdf_url":"https://arxiv.org/pdf/2306.02733v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17635v1","updated":"2024-09-26T08:32:31Z","published":"2024-09-26T08:32:31Z","title":"FlowMAC: Conditional Flow Matching for Audio Coding at Low Bit Rates","summary":" This paper introduces FlowMAC, a novel neural audio codec for high-quality\ngeneral audio compression at low bit rates based on conditional flow matching\n(CFM). FlowMAC jointly learns a mel spectrogram encoder, quantizer and decoder.\nAt inference time the decoder integrates a continuous normalizing flow via an\nODE solver to generate a high-quality mel spectrogram. This is the first time\nthat a CFM-based approach is applied to general audio coding, enabling a\nscalable, simple and memory efficient training. Our subjective evaluations show\nthat FlowMAC at 3 kbps achieves similar quality as state-of-the-art GAN-based\nand DDPM-based neural audio codecs at double the bit rate. Moreover, FlowMAC\noffers a tunable inference pipeline, which permits to trade off complexity and\nquality. This enables real-time coding on CPU, while maintaining high\nperceptual quality.\n","authors":["Nicola Pia","Martin Strauss","Markus Multrus","Bernd Edler"],"pdf_url":"https://arxiv.org/pdf/2409.17635v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.17632v1","updated":"2024-09-26T08:28:14Z","published":"2024-09-26T08:28:14Z","title":"Model-Free Stochastic Process Modeling and Optimization using\n Normalizing Flows","summary":" Real-world chemical processes often exhibit stochastic dynamics with\nnon-trivial correlations and state-dependent fluctuations. However, most\nprocess models simply add stationary noise terms to a deterministic prediction,\nwhich can lead to inaccurate predictions. This work proposes using conditional\nnormalizing flows as discrete-time models (DTMs) to learn the stochastic\ndynamics of chemical processes. Normalizing flows learn an explicit expression\nof the system states' probability density function (PDF) given prior states and\ncontrol inputs. The resulting model naturally allows for formulating stochastic\nand probabilistic setpoint-tracking objectives and chance constraints. In\napplications to a continuous reactor and a reactor cascade, the normalizing\nflow yields stable simulations over long time horizons and high-quality results\nin stochastic and probabilistic MPC formulation for open-loop control.\nFurthermore, a chance-constrained optimization finds reliable startup controls\nfor the reactor cascade with stochastic reactions. In conclusion, the\nconditional normalizing flow presents an excellent choice for modeling\nnonlinear stochastic dynamics.\n","authors":["Eike Cramer"],"pdf_url":"https://arxiv.org/pdf/2409.17632v1.pdf","comment":"13 pages, 7 Figures, 5 Tables"},{"id":"http://arxiv.org/abs/2409.17628v1","updated":"2024-09-26T08:22:09Z","published":"2024-09-26T08:22:09Z","title":"Convolutional Signal Propagation: A Simple Scalable Algorithm for\n Hypergraphs","summary":" Last decade has seen the emergence of numerous methods for learning on\ngraphs, particularly Graph Neural Networks (GNNs). These methods, however, are\noften not directly applicable to more complex structures like bipartite graphs\n(equivalent to hypergraphs), which represent interactions among two entity\ntypes (e.g. a user liking a movie). This paper proposes Convolutional Signal\nPropagation (CSP), a non-parametric simple and scalable method that natively\noperates on bipartite graphs (hypergraphs) and can be implemented with just a\nfew lines of code. After defining CSP, we demonstrate its relationship with\nwell-established methods like label propagation, Naive Bayes, and Hypergraph\nConvolutional Networks. We evaluate CSP against several reference methods on\nreal-world datasets from multiple domains, focusing on retrieval and\nclassification tasks. Our results show that CSP offers competitive performance\nwhile maintaining low computational complexity, making it an ideal first choice\nas a baseline for hypergraph node classification and retrieval. Moreover,\ndespite operating on hypergraphs, CSP achieves good results in tasks typically\nnot associated with hypergraphs, such as natural language processing.\n","authors":["Pavel Procházka","Marek Dědič","Lukáš Bajer"],"pdf_url":"https://arxiv.org/pdf/2409.17628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11531v2","updated":"2024-09-26T08:20:59Z","published":"2023-09-20T10:50:28Z","title":"EPTQ: Enhanced Post-Training Quantization via Hessian-guided\n Network-wise Optimization","summary":" Quantization is a key method for deploying deep neural networks on edge\ndevices with limited memory and computation resources. Recent improvements in\nPost-Training Quantization (PTQ) methods were achieved by an additional local\noptimization process for learning the weight quantization rounding policy.\nHowever, a gap exists when employing network-wise optimization with small\nrepresentative datasets. In this paper, we propose a new method for enhanced\nPTQ (EPTQ) that employs a network-wise quantization optimization process, which\nbenefits from considering cross-layer dependencies during optimization. EPTQ\nenables network-wise optimization with a small representative dataset using a\nnovel sample-layer attention score based on a label-free Hessian matrix upper\nbound. The label-free approach makes our method suitable for the PTQ scheme. We\ngive a theoretical analysis for the said bound and use it to construct a\nknowledge distillation loss that guides the optimization to focus on the more\nsensitive layers and samples. In addition, we leverage the Hessian upper bound\nto improve the weight quantization parameters selection by focusing on the more\nsensitive elements in the weight tensors. Empirically, by employing EPTQ we\nachieve state-of-the-art results on various models, tasks, and datasets,\nincluding ImageNet classification, COCO object detection, and Pascal-VOC for\nsemantic segmentation.\n","authors":["Ofir Gordon","Elad Cohen","Hai Victor Habi","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2309.11531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17625v1","updated":"2024-09-26T08:20:05Z","published":"2024-09-26T08:20:05Z","title":"Benign or Not-Benign Overfitting in Token Selection of Attention\n Mechanism","summary":" Modern over-parameterized neural networks can be trained to fit the training\ndata perfectly while still maintaining a high generalization performance. This\n\"benign overfitting\" phenomenon has been studied in a surge of recent\ntheoretical work; however, most of these studies have been limited to linear\nmodels or two-layer neural networks. In this work, we analyze benign\noverfitting in the token selection mechanism of the attention architecture,\nwhich characterizes the success of transformer models. We first show the\nexistence of a benign overfitting solution and explain its mechanism in the\nattention architecture. Next, we discuss whether the model converges to such a\nsolution, raising the difficulties specific to the attention architecture. We\nthen present benign overfitting cases and not-benign overfitting cases by\nconditioning different scenarios based on the behavior of attention\nprobabilities during training. To the best of our knowledge, this is the first\nstudy to characterize benign overfitting for the attention mechanism.\n","authors":["Keitaro Sakamoto","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17622v1","updated":"2024-09-26T08:16:59Z","published":"2024-09-26T08:16:59Z","title":"Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric\n GNNs","summary":" Geometric graph neural networks (GNNs) have emerged as powerful tools for\nmodeling molecular geometry. However, they encounter limitations in effectively\ncapturing long-range interactions in large molecular systems. To address this\nchallenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs\nto expand the scope of their capabilities by incorporating mesh points\nalongside atoms and reimaging traditional mathematical operations in a\ntrainable manner. Neural P$^3$M exhibits flexibility across a wide range of\nmolecular systems and demonstrates remarkable accuracy in predicting energies\nand forces, outperforming on benchmarks such as the MD22 dataset. It also\nachieves an average improvement of 22% on the OE62 dataset while integrating\nwith various architectures.\n","authors":["Yusong Wang","Chaoran Cheng","Shaoning Li","Yuxuan Ren","Bin Shao","Ge Liu","Pheng-Ann Heng","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.17622v1.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.16959v2","updated":"2024-09-26T08:12:59Z","published":"2024-06-21T03:21:22Z","title":"Recurrent Stochastic Configuration Networks for Temporal Data Analytics","summary":" Temporal data modelling techniques with neural networks are useful in many\ndomain applications, including time-series forecasting and control engineering.\nThis paper aims at developing a recurrent version of stochastic configuration\nnetworks (RSCNs) for problem solving, where we have no underlying assumption on\nthe dynamic orders of the input variables. Given a collection of historical\ndata, we first build an initial RSCN model in the light of a supervisory\nmechanism, followed by an online update of the output weights by using a\nprojection algorithm. Some theoretical results are established, including the\necho state property, the universal approximation property of RSCNs for both the\noffline and online learnings, and the convergence of the output weights. The\nproposed RSCN model is remarkably distinguished from the well-known echo state\nnetworks (ESNs) in terms of the way of assigning the input random weight matrix\nand a special structure of the random feedback matrix. A comprehensive\ncomparison study among the long short-term memory (LSTM) network, the original\nESN, and several state-of-the-art ESN methods such as the simple cycle\nreservoir (SCR), the polynomial ESN (PESN), the leaky-integrator ESN (LIESN)\nand RSCN is carried out. Numerical results clearly indicate that the proposed\nRSCN performs favourably over all of the datasets.\n","authors":["Dianhui Wang","Gang Dang"],"pdf_url":"https://arxiv.org/pdf/2406.16959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v1","updated":"2024-09-26T08:03:19Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic dataset that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16341v2","updated":"2024-09-26T07:54:10Z","published":"2024-09-24T17:20:02Z","title":"Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs","summary":" Training large language models (LLMs) for external tool usage is a rapidly\nexpanding field, with recent research focusing on generating synthetic data to\naddress the shortage of available data. However, the absence of systematic data\nquality checks poses complications for properly training and testing models. To\nthat end, we propose two approaches for assessing the reliability of data for\ntraining LLMs to use external tools. The first approach uses intuitive,\nhuman-defined correctness criteria. The second approach uses a model-driven\nassessment with in-context evaluation. We conduct a thorough evaluation of data\nquality on two popular benchmarks, followed by an extrinsic evaluation that\nshowcases the impact of data quality on model performance. Our results\ndemonstrate that models trained on high-quality data outperform those trained\non unvalidated data, even when trained with a smaller quantity of data. These\nfindings empirically support the significance of assessing and ensuring the\nreliability of training data for tool-using LLMs.\n","authors":["Shadi Iskander","Nachshon Cohen","Zohar Karnin","Ori Shapira","Sofia Tolmach"],"pdf_url":"https://arxiv.org/pdf/2409.16341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04428v2","updated":"2024-09-26T07:53:04Z","published":"2024-09-06T17:48:44Z","title":"Hybrid Spiking Neural Networks for Low-Power Intra-Cortical\n Brain-Machine Interfaces","summary":" Intra-cortical brain-machine interfaces (iBMIs) have the potential to\ndramatically improve the lives of people with paraplegia by restoring their\nability to perform daily activities. However, current iBMIs suffer from\nscalability and mobility limitations due to bulky hardware and wiring. Wireless\niBMIs offer a solution but are constrained by a limited data rate. To overcome\nthis challenge, we are investigating hybrid spiking neural networks for\nembedded neural decoding in wireless iBMIs. The networks consist of a temporal\nconvolution-based compression followed by recurrent processing and a final\ninterpolation back to the original sequence length. As recurrent units, we\nexplore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons,\nand a combination of both - spiking GRUs (sGRUs) and analyze their differences\nin terms of accuracy, footprint, and activation sparsity. To that end, we train\ndecoders on the \"Nonhuman Primate Reaching with Multichannel Sensorimotor\nCortex Electrophysiology\" dataset and evaluate it using the NeuroBench\nframework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural\nDecoding. Our approach achieves high accuracy in predicting velocities of\nprimate reaching movements from multichannel primary motor cortex recordings\nwhile maintaining a low number of synaptic operations, surpassing the current\nbaseline models in the NeuroBench framework. This work highlights the potential\nof hybrid neural networks to facilitate wireless iBMIs with high decoding\nprecision and a substantial increase in the number of monitored neurons, paving\nthe way toward more advanced neuroprosthetic technologies.\n","authors":["Alexandru Vasilache","Jann Krausse","Klaus Knobloch","Juergen Becker"],"pdf_url":"https://arxiv.org/pdf/2409.04428v2.pdf","comment":"This work has been accepted at the 2024 IEEE Biomedical Circuits and\n Systems Conference"},{"id":"http://arxiv.org/abs/2408.03944v2","updated":"2024-09-26T07:47:50Z","published":"2024-07-22T03:56:27Z","title":"Improving Fast Adversarial Training Paradigm: An Example Taxonomy\n Perspective","summary":" While adversarial training is an effective defense method against adversarial\nattacks, it notably increases the training cost. To this end, fast adversarial\ntraining (FAT) is presented for efficient training and has become a hot\nresearch topic. However, FAT suffers from catastrophic overfitting, which leads\nto a performance drop compared with multi-step adversarial training. However,\nthe cause of catastrophic overfitting remains unclear and lacks exploration. In\nthis paper, we present an example taxonomy in FAT, which identifies that\ncatastrophic overfitting is caused by the imbalance between the inner and outer\noptimization in FAT. Furthermore, we investigated the impact of varying degrees\nof training loss, revealing a correlation between training loss and\ncatastrophic overfitting. Based on these observations, we redesign the loss\nfunction in FAT with the proposed dynamic label relaxation to concentrate the\nloss range and reduce the impact of misclassified examples. Meanwhile, we\nintroduce batch momentum initialization to enhance the diversity to prevent\ncatastrophic overfitting in an efficient manner. Furthermore, we also propose\nCatastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate\ntraining strategy for examples based on their loss degree. Our proposed method,\nnamed example taxonomy aware FAT (ETA), establishes an improved paradigm for\nFAT. Experiment results demonstrate our ETA achieves state-of-the-art\nperformance. Comprehensive experiments on four standard datasets demonstrate\nthe competitiveness of our proposed method.\n","authors":["Jie Gui","Chengze Jiang","Minjing Dong","Kun Tong","Xinli Shi","Yuan Yan Tang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03944v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.17605v1","updated":"2024-09-26T07:43:12Z","published":"2024-09-26T07:43:12Z","title":"Good Data Is All Imitation Learning Needs","summary":" In this paper, we address the limitations of traditional teacher-student\nmodels, imitation learning, and behaviour cloning in the context of\nAutonomous/Automated Driving Systems (ADS), where these methods often struggle\nwith incomplete coverage of real-world scenarios. To enhance the robustness of\nsuch models, we introduce the use of Counterfactual Explanations (CFEs) as a\nnovel data augmentation technique for end-to-end ADS. CFEs, by generating\ntraining samples near decision boundaries through minimal input modifications,\nlead to a more comprehensive representation of expert driver strategies,\nparticularly in safety-critical scenarios. This approach can therefore help\nimprove the model's ability to handle rare and challenging driving events, such\nas anticipating darting out pedestrians, ultimately leading to safer and more\ntrustworthy decision-making for ADS. Our experiments in the CARLA simulator\ndemonstrate that CF-Driver outperforms the current state-of-the-art method,\nachieving a higher driving score and lower infraction rates. Specifically,\nCF-Driver attains a driving score of 84.2, surpassing the previous best model\nby 15.02 percentage points. These results highlight the effectiveness of\nincorporating CFEs in training end-to-end ADS. To foster further research, the\nCF-Driver code is made publicly available.\n","authors":["Amir Samadi","Konstantinos Koufos","Kurt Debattista","Mehrdad Dianati"],"pdf_url":"https://arxiv.org/pdf/2409.17605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17604v1","updated":"2024-09-26T07:40:47Z","published":"2024-09-26T07:40:47Z","title":"RmGPT: Rotating Machinery Generative Pretrained Model","summary":" In industry, the reliability of rotating machinery is critical for production\nefficiency and safety. Current methods of Prognostics and Health Management\n(PHM) often rely on task-specific models, which face significant challenges in\nhandling diverse datasets with varying signal characteristics, fault modes and\noperating conditions. Inspired by advancements in generative pretrained models,\nwe propose RmGPT, a unified model for diagnosis and prognosis tasks. RmGPT\nintroduces a novel token-based framework, incorporating Signal Tokens, Prompt\nTokens, Time-Frequency Task Tokens and Fault Tokens to handle heterogeneous\ndata within a unified model architecture. We leverage self-supervised learning\nfor robust feature extraction and introduce a next signal token prediction\npretraining strategy, alongside efficient prompt learning for task-specific\nadaptation. Extensive experiments demonstrate that RmGPT significantly\noutperforms state-of-the-art algorithms, achieving near-perfect accuracy in\ndiagnosis tasks and exceptionally low errors in prognosis tasks. Notably, RmGPT\nexcels in few-shot learning scenarios, achieving 92% accuracy in 16-class\none-shot experiments, highlighting its adaptability and robustness. This work\nestablishes RmGPT as a powerful PHM foundation model for rotating machinery,\nadvancing the scalability and generalizability of PHM solutions.\n","authors":["Yilin Wang","Yifei Yu","Kong Sun","Peixuan Lei","Yuxuan Zhang","Enrico Zio","Aiguo Xia","Yuanxiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16206v2","updated":"2024-09-26T07:32:09Z","published":"2024-05-25T12:35:31Z","title":"GlycanML: A Multi-Task and Multi-Structure Benchmark for Glycan Machine\n Learning","summary":" Glycans are basic biomolecules and perform essential functions within living\norganisms. The rapid increase of functional glycan data provides a good\nopportunity for machine learning solutions to glycan understanding. However,\nthere still lacks a standard machine learning benchmark for glycan function\nprediction. In this work, we fill this blank by building a comprehensive\nbenchmark for Glycan Machine Learning (GlycanML). The GlycanML benchmark\nconsists of diverse types of tasks including glycan taxonomy prediction, glycan\nimmunogenicity prediction, glycosylation type prediction, and protein-glycan\ninteraction prediction. Glycans can be represented by both sequences and graphs\nin GlycanML, which enables us to extensively evaluate sequence-based models and\ngraph neural networks (GNNs) on benchmark tasks. Furthermore, by concurrently\nperforming eight glycan taxonomy prediction tasks, we introduce the\nGlycanML-MTL testbed for multi-task learning (MTL) algorithms. Experimental\nresults show the superiority of modeling glycans with multi-relational GNNs,\nand suitable MTL methods can further boost model performance. We provide all\ndatasets and source codes at https://github.com/GlycanML/GlycanML and maintain\na leaderboard at https://GlycanML.github.io/project\n","authors":["Minghao Xu","Yunteng Geng","Yihang Zhang","Ling Yang","Jian Tang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.16206v2.pdf","comment":"Research project paper. All code and data are released"},{"id":"http://arxiv.org/abs/2409.17592v1","updated":"2024-09-26T07:19:12Z","published":"2024-09-26T07:19:12Z","title":"Deep Manifold Part 1: Anatomy of Neural Network Manifold","summary":" Based on the numerical manifold method principle, we developed a mathematical\nframework of a neural network manifold: Deep Manifold and discovered that\nneural networks: 1) is numerical computation combining forward and inverse; 2)\nhave near infinite degrees of freedom; 3) exponential learning capacity with\ndepth; 4) have self-progressing boundary conditions; 5) has training hidden\nbottleneck. We also define two concepts: neural network learning space and deep\nmanifold space and introduce two concepts: neural network intrinsic pathway and\nfixed point. We raise three fundamental questions: 1). What is the training\ncompletion definition; 2). where is the deep learning convergence point (neural\nnetwork fixed point); 3). How important is token timestamp in training data\ngiven negative time is critical in inverse problem.\n","authors":["Max Y. Ma","Gen-Hua Shi"],"pdf_url":"https://arxiv.org/pdf/2409.17592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17591v1","updated":"2024-09-26T07:16:38Z","published":"2024-09-26T07:16:38Z","title":"Conjugate Bayesian Two-step Change Point Detection for Hawkes Process","summary":" The Bayesian two-step change point detection method is popular for the Hawkes\nprocess due to its simplicity and intuitiveness. However, the non-conjugacy\nbetween the point process likelihood and the prior requires most existing\nBayesian two-step change point detection methods to rely on non-conjugate\ninference methods. These methods lack analytical expressions, leading to low\ncomputational efficiency and impeding timely change point detection. To address\nthis issue, this work employs data augmentation to propose a conjugate Bayesian\ntwo-step change point detection method for the Hawkes process, which proves to\nbe more accurate and efficient. Extensive experiments on both synthetic and\nreal data demonstrate the superior effectiveness and efficiency of our method\ncompared to baseline methods. Additionally, we conduct ablation studies to\nexplore the robustness of our method concerning various hyperparameters. Our\ncode is publicly available at https://github.com/Aurora2050/CoBay-CPD.\n","authors":["Zeyue Zhang","Xiaoling Lu","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17591v1.pdf","comment":"10 pages, accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17587v1","updated":"2024-09-26T07:07:08Z","published":"2024-09-26T07:07:08Z","title":"Multimodal Banking Dataset: Understanding Client Needs through Event\n Sequences","summary":" Financial organizations collect a huge amount of data about clients that\ntypically has a temporal (sequential) structure and is collected from various\nsources (modalities). Due to privacy issues, there are no large-scale\nopen-source multimodal datasets of event sequences, which significantly limits\nthe research in this area. In this paper, we present the industrial-scale\npublicly available multimodal banking dataset, MBD, that contains more than\n1.5M corporate clients with several modalities: 950M bank transactions, 1B geo\nposition events, 5M embeddings of dialogues with technical support and monthly\naggregated purchases of four bank's products. All entries are properly\nanonymized from real proprietary bank data. Using this dataset, we introduce a\nnovel benchmark with two business tasks: campaigning (purchase prediction in\nthe next month) and matching of clients. We provide numerical results that\ndemonstrate the superiority of our multi-modal baselines over single-modal\ntechniques for each task. As a result, the proposed dataset can open new\nperspectives and facilitate the future development of practically important\nlarge-scale multimodal algorithms for event sequences.\n HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD\n Github Link: https://github.com/Dzhambo/MBD\n","authors":["Mollaev Dzhambulat","Alexander Kostin","Postnova Maria","Ivan Karpukhin","Ivan A Kireev","Gleb Gusev","Andrey Savchenko"],"pdf_url":"https://arxiv.org/pdf/2409.17587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06379v3","updated":"2024-09-26T07:05:47Z","published":"2023-10-10T07:43:41Z","title":"Understanding the Expressivity and Trainability of Fourier Neural\n Operator: A Mean-Field Perspective","summary":" In this paper, we explores the expressivity and trainability of the Fourier\nNeural Operator (FNO). We establish a mean-field theory for the FNO, analyzing\nthe behavior of the random FNO from an edge of chaos perspective. Our\ninvestigation into the expressivity of a random FNO involves examining the\nordered-chaos phase transition of the network based on the weight distribution.\nThis phase transition demonstrates characteristics unique to the FNO, induced\nby mode truncation, while also showcasing similarities to those of densely\nconnected networks. Furthermore, we identify a connection between expressivity\nand trainability: the ordered and chaotic phases correspond to regions of\nvanishing and exploding gradients, respectively. This finding provides a\npractical prerequisite for the stable training of the FNO. Our experimental\nresults corroborate our theoretical findings.\n","authors":["Takeshi Koshizuka","Masahiro Fujisawa","Yusuke Tanaka","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2310.06379v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2409.17582v1","updated":"2024-09-26T07:01:06Z","published":"2024-09-26T07:01:06Z","title":"Multiplicative Logit Adjustment Approximates Neural-Collapse-Aware\n Decision Boundary Adjustment","summary":" Real-world data distributions are often highly skewed. This has spurred a\ngrowing body of research on long-tailed recognition to address this imbalance\nin training classification models. Among the methods studied, multiplicative\nlogit adjustment (MLA) stands out as a simple and effective method. However, it\nlacks theoretical guarantees, which raises concerns about the optimality of its\nadjustment method. We provide a theoretical justification for the effectiveness\nof MLA with the following two-step theory. First, we develop a theory that\nadjusts optimal decision boundaries by estimating feature spread on the basis\nof neural collapse. Then, we demonstrate that MLA approximates this optimal\nmethod. Additionally, through experiments on long-tailed datasets, we\nillustrate the practical usefulness of MLA under more realistic conditions. We\nalso offer experimental insights to guide the tuning of MLA's hyperparameters.\n","authors":["Naoya Hasegawa","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17567v1","updated":"2024-09-26T06:28:56Z","published":"2024-09-26T06:28:56Z","title":"Derandomizing Multi-Distribution Learning","summary":" Multi-distribution or collaborative learning involves learning a single\npredictor that works well across multiple data distributions, using samples\nfrom each during training. Recent research on multi-distribution learning,\nfocusing on binary loss and finite VC dimension classes, has shown near-optimal\nsample complexity that is achieved with oracle efficient algorithms. That is,\nthese algorithms are computationally efficient given an efficient ERM for the\nclass. Unlike in classical PAC learning, where the optimal sample complexity is\nachieved with deterministic predictors, current multi-distribution learning\nalgorithms output randomized predictors. This raises the question: can these\nalgorithms be derandomized to produce a deterministic predictor for multiple\ndistributions? Through a reduction to discrepancy minimization, we show that\nderandomizing multi-distribution learning is computationally hard, even when\nERM is computationally efficient. On the positive side, we identify a\nstructural condition enabling an efficient black-box reduction, converting\nexisting randomized multi-distribution predictors into deterministic ones.\n","authors":["Kasper Green Larsen","Omar Montasser","Nikita Zhivotovskiy"],"pdf_url":"https://arxiv.org/pdf/2409.17567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16997v2","updated":"2024-09-26T06:13:04Z","published":"2024-09-25T15:02:25Z","title":"INT-FlashAttention: Enabling Flash Attention for INT8 Quantization","summary":" As the foundation of large language models (LLMs), self-attention module\nfaces the challenge of quadratic time and memory complexity with respect to\nsequence length. FlashAttention accelerates attention computation and reduces\nits memory usage by leveraging the GPU memory hierarchy. A promising research\ndirection is to integrate FlashAttention with quantization methods. This paper\nintroduces INT-FlashAttention, the first INT8 quantization architecture\ncompatible with the forward workflow of FlashAttention, which significantly\nimproves the inference speed of FlashAttention on Ampere GPUs. We implement our\nINT-FlashAttention prototype with fully INT8 activations and general\nmatrix-multiplication (GEMM) kernels, making it the first attention operator\nwith fully INT8 input. As a general token-level post-training quantization\nframework, INT-FlashAttention is also compatible with other data formats like\nINT4, etc. Experimental results show INT-FlashAttention achieves 72% faster\ninference speed and 82% smaller quantization error compared to standard\nFlashAttention with FP16 and FP8 data format.\n","authors":["Shimao Chen","Zirui Liu","Zhiying Wu","Ce Zheng","Peizhuang Cong","Zihan Jiang","Yuhan Wu","Lei Su","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2409.16997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17557v1","updated":"2024-09-26T06:10:29Z","published":"2024-09-26T06:10:29Z","title":"Joint Source-Channel Coding: Fundamentals and Recent Progress in\n Practical Designs","summary":" Semantic- and task-oriented communication has emerged as a promising approach\nto reducing the latency and bandwidth requirements of next-generation mobile\nnetworks by transmitting only the most relevant information needed to complete\na specific task at the receiver. This is particularly advantageous for\nmachine-oriented communication of high data rate content, such as images and\nvideos, where the goal is rapid and accurate inference, rather than perfect\nsignal reconstruction. While semantic- and task-oriented compression can be\nimplemented in conventional communication systems, joint source-channel coding\n(JSCC) offers an alternative end-to-end approach by optimizing compression and\nchannel coding together, or even directly mapping the source signal to the\nmodulated waveform. Although all digital communication systems today rely on\nseparation, thanks to its modularity, JSCC is known to achieve higher\nperformance in finite blocklength scenarios, and to avoid cliff and the\nlevelling-off effects in time-varying channel scenarios. This article provides\nan overview of the information theoretic foundations of JSCC, surveys practical\nJSCC designs over the decades, and discusses the reasons for their limited\nadoption in practical systems. We then examine the recent resurgence of JSCC,\ndriven by the integration of deep learning techniques, particularly through\nDeepJSCC, highlighting its many surprising advantages in various scenarios.\nFinally, we discuss why it may be time to reconsider today's strictly separate\narchitectures, and reintroduce JSCC to enable high-fidelity, low-latency\ncommunications in critical applications such as autonomous driving, drone\nsurveillance, or wearable systems.\n","authors":["Deniz Gündüz","Michèle A. Wigger","Tze-Yang Tung","Ping Zhang","Yong Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.17557v1.pdf","comment":"Under review for possible publication"},{"id":"http://arxiv.org/abs/2307.08038v2","updated":"2024-09-26T06:02:24Z","published":"2023-07-16T13:34:44Z","title":"Bivariate DeepKriging for Large-scale Spatial Interpolation of Wind\n Fields","summary":" High spatial resolution wind data are essential for a wide range of\napplications in climate, oceanographic and meteorological studies. Large-scale\nspatial interpolation or downscaling of bivariate wind fields having velocity\nin two dimensions is a challenging task because wind data tend to be\nnon-Gaussian with high spatial variability and heterogeneity. In spatial\nstatistics, cokriging is commonly used for predicting bivariate spatial fields.\nHowever, the cokriging predictor is not optimal except for Gaussian processes.\nAdditionally, cokriging is computationally prohibitive for large datasets. In\nthis paper, we propose a method, called bivariate DeepKriging, which is a\nspatially dependent deep neural network (DNN) with an embedding layer\nconstructed by spatial radial basis functions for bivariate spatial data\nprediction. We then develop a distribution-free uncertainty quantification\nmethod based on bootstrap and ensemble DNN. Our proposed approach outperforms\nthe traditional cokriging predictor with commonly used covariance functions,\nsuch as the linear model of co-regionalization and flexible bivariate Mat\\'ern\ncovariance. We demonstrate the computational efficiency and scalability of the\nproposed DNN model, with computations that are, on average, 20 times faster\nthan those of conventional techniques. We apply the bivariate DeepKriging\nmethod to the wind data over the Middle East region at 506,771 locations. The\nprediction performance of the proposed method is superior over the cokriging\npredictors and dramatically reduces computation time.\n","authors":["Pratik Nag","Ying Sun","Brian J Reich"],"pdf_url":"https://arxiv.org/pdf/2307.08038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17555v1","updated":"2024-09-26T05:57:35Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code will be\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v1.pdf","comment":"Accepted to NeurIPS 2024. The source code will be available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2406.14990v2","updated":"2024-09-26T05:51:20Z","published":"2024-06-21T09:03:37Z","title":"Learning Variable Compliance Control From a Few Demonstrations for\n Bimanual Robot with Haptic Feedback Teleoperation System","summary":" Automating dexterous, contact-rich manipulation tasks using rigid robots is a\nsignificant challenge in robotics. Rigid robots, defined by their actuation\nthrough position commands, face issues of excessive contact forces due to their\ninability to adapt to contact with the environment, potentially causing damage.\nWhile compliance control schemes have been introduced to mitigate these issues\nby controlling forces via external sensors, they are hampered by the need for\nfine-tuning task-specific controller parameters. Learning from Demonstrations\n(LfD) offers an intuitive alternative, allowing robots to learn manipulations\nthrough observed actions. In this work, we introduce a novel system to enhance\nthe teaching of dexterous, contact-rich manipulations to rigid robots. Our\nsystem is twofold: firstly, it incorporates a teleoperation interface utilizing\nVirtual Reality (VR) controllers, designed to provide an intuitive and\ncost-effective method for task demonstration with haptic feedback. Secondly, we\npresent Comp-ACT (Compliance Control via Action Chunking with Transformers), a\nmethod that leverages the demonstrations to learn variable compliance control\nfrom a few demonstrations. Our methods have been validated across various\ncomplex contact-rich manipulation tasks using single-arm and bimanual robot\nsetups in simulated and real-world environments, demonstrating the\neffectiveness of our system in teaching robots dexterous manipulations with\nenhanced adaptability and safety. Code available at:\nhttps://github.com/omron-sinicx/CompACT\n","authors":["Tatsuya Kamijo","Cristian C. Beltran-Hernandez","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2406.14990v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2409.17550v1","updated":"2024-09-26T05:39:52Z","published":"2024-09-26T05:39:52Z","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective\n Adaptation of Audio and Video Diffusion Models for Joint Generation","summary":" In this work, we build a simple but strong baseline for sounding video\ngeneration. Given base diffusion models for audio and video, we integrate them\nwith additional modules into a single model and train it to make the model\njointly generate audio and video. To enhance alignment between audio-video\npairs, we introduce two novel mechanisms in our model. The first one is\ntimestep adjustment, which provides different timestep information to each base\nmodel. It is designed to align how samples are generated along with timesteps\nacross modalities. The second one is a new design of the additional modules,\ntermed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE,\ncross-modal information is embedded as if it represents temporal position\ninformation, and the embeddings are fed into the model like positional\nencoding. Compared with the popular cross-attention mechanism, CMC-PE provides\na better inductive bias for temporal alignment in the generated data.\nExperimental results validate the effectiveness of the two newly introduced\nmechanisms and also demonstrate that our method outperforms existing methods.\n","authors":["Masato Ishii","Akio Hayakawa","Takashi Shibuya","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.17550v1.pdf","comment":"The source code will be released soon"},{"id":"http://arxiv.org/abs/2409.17546v1","updated":"2024-09-26T05:25:25Z","published":"2024-09-26T05:25:25Z","title":"MASSFormer: Mobility-Aware Spectrum Sensing using Transformer-Driven\n Tiered Structure","summary":" In this paper, we develop a novel mobility-aware transformer-driven tiered\nstructure (MASSFormer) based cooperative spectrum sensing method that\neffectively models the spatio-temporal dynamics of user movements. Unlike\nexisting methods, our method considers a dynamic scenario involving mobile\nprimary users (PUs) and secondary users (SUs)and addresses the complexities\nintroduced by user mobility. The transformer architecture utilizes an attention\nmechanism, enabling the proposed method to adeptly model the temporal dynamics\nof user mobility by effectively capturing long-range dependencies within the\ninput data. The proposed method first computes tokens from the sequence of\ncovariance matrices (CMs) for each SU and processes them in parallel using the\nSUtransformer network to learn the spatio-temporal features at SUlevel.\nSubsequently, the collaborative transformer network learns the group-level PU\nstate from all SU-level feature representations. The attention-based sequence\npooling method followed by the transformer encoder adjusts the contributions of\nall tokens. The main goal of predicting the PU states at each SU-level and\ngroup-level is to improve detection performance even more. We conducted a\nsufficient amount of simulations and compared the detection performance of\ndifferent SS methods. The proposed method is tested under imperfect reporting\nchannel scenarios to show robustness. The efficacy of our method is validated\nwith the simulation results demonstrating its higher performance compared with\nexisting methods in terms of detection probability, sensing error, and\nclassification accuracy.\n","authors":["Dimpal Janu","Sandeep Mandia","Kuldeep Singh","Sandeep Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.17546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17544v1","updated":"2024-09-26T05:22:16Z","published":"2024-09-26T05:22:16Z","title":"Optimizing the Induced Correlation in Omnibus Joint Graph Embeddings","summary":" Theoretical and empirical evidence suggests that joint graph embedding\nalgorithms induce correlation across the networks in the embedding space. In\nthe Omnibus joint graph embedding framework, previous results explicitly\ndelineated the dual effects of the algorithm-induced and model-inherent\ncorrelations on the correlation across the embedded networks. Accounting for\nand mitigating the algorithm-induced correlation is key to subsequent\ninference, as sub-optimal Omnibus matrix constructions have been demonstrated\nto lead to loss in inference fidelity. This work presents the first efforts to\nautomate the Omnibus construction in order to address two key questions in this\njoint embedding framework: the correlation-to-OMNI problem and the flat\ncorrelation problem. In the flat correlation problem, we seek to understand the\nminimum algorithm-induced flat correlation (i.e., the same across all graph\npairs) produced by a generalized Omnibus embedding. Working in a subspace of\nthe fully general Omnibus matrices, we prove both a lower bound for this flat\ncorrelation and that the classical Omnibus construction induces the maximal\nflat correlation. In the correlation-to-OMNI problem, we present an algorithm\n-- named corr2Omni -- that, from a given matrix of estimated pairwise graph\ncorrelations, estimates the matrix of generalized Omnibus weights that induces\noptimal correlation in the embedding space. Moreover, in both simulated and\nreal data settings, we demonstrate the increased effectiveness of our corr2Omni\nalgorithm versus the classical Omnibus construction.\n","authors":["Konstantinos Pantazis","Michael Trosset","William N. Frost","Carey E. Priebe","Vince Lyzinski"],"pdf_url":"https://arxiv.org/pdf/2409.17544v1.pdf","comment":"34 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13689v3","updated":"2024-09-26T04:54:43Z","published":"2024-08-24T23:20:38Z","title":"Decentralised Variational Inference Frameworks for Multi-object Tracking\n on Sensor Network","summary":" This paper tackles the challenge of multi-sensor multi-object tracking by\nproposing various decentralised Variational Inference (VI) schemes that match\nthe tracking performance of centralised sensor fusion with only local message\nexchanges among neighboring sensors. We first establish a centralised VI sensor\nfusion scheme as a benchmark and analyse the limitations of its decentralised\ncounterpart, which requires sensors to await consensus at each VI iteration.\nTherefore, we propose a decentralised gradient-based VI framework that\noptimises the Locally Maximised Evidence Lower Bound (LM-ELBO) instead of the\nstandard ELBO, which reduces the parameter search space and enables faster\nconvergence, making it particularly beneficial for decentralised tracking.This\nproposed framework is inherently self-evolving, improving with advancements in\ndecentralised optimisation techniques for convergence guarantees and\nefficiency. Further, we enhance the convergence speed of proposed decentralised\nschemes using natural gradients and gradient tracking strategies. Results\nverify that our decentralised VI schemes are empirically equivalent to\ncentralised fusion in tracking performance. Notably, the decentralised natural\ngradient VI method is the most communication-efficient, with communication\ncosts comparable to suboptimal decentralised strategies while delivering\nnotably higher tracking accuracy.\n","authors":["Qing Li","Runze Gan","Simon Godsill"],"pdf_url":"https://arxiv.org/pdf/2408.13689v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15734v2","updated":"2024-09-26T04:37:48Z","published":"2024-09-24T04:39:47Z","title":"Trust-Region Sequential Quadratic Programming for Stochastic\n Optimization with Random Models","summary":" In this work, we consider solving optimization problems with a stochastic\nobjective and deterministic equality constraints. We propose a Trust-Region\nSequential Quadratic Programming method to find both first- and second-order\nstationary points. Our method utilizes a random model to represent the\nobjective function, which is constructed from stochastic observations of the\nobjective and is designed to satisfy proper adaptive accuracy conditions with a\nhigh but fixed probability. To converge to first-order stationary points, our\nmethod computes a gradient step in each iteration defined by minimizing a\nquadratic approximation of the objective subject to a (relaxed) linear\napproximation of the problem constraints and a trust-region constraint. To\nconverge to second-order stationary points, our method additionally computes an\neigen step to explore the negative curvature of the reduced Hessian matrix, as\nwell as a second-order correction step to address the potential Maratos effect,\nwhich arises due to the nonlinearity of the problem constraints. Such an effect\nmay impede the method from moving away from saddle points. Both gradient and\neigen step computations leverage a novel parameter-free decomposition of the\nstep and the trust-region radius, accounting for the proportions among the\nfeasibility residual, optimality residual, and negative curvature. We establish\nglobal almost sure first- and second-order convergence guarantees for our\nmethod, and present computational results on CUTEst problems, regression\nproblems, and saddle-point problems to demonstrate its superiority over\nexisting line-search-based stochastic methods.\n","authors":["Yuchen Fang","Sen Na","Michael W. Mahoney","Mladen Kolar"],"pdf_url":"https://arxiv.org/pdf/2409.15734v2.pdf","comment":"41 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.05316v3","updated":"2024-09-26T03:54:15Z","published":"2024-06-08T01:32:44Z","title":"CMamba: Channel Correlation Enhanced State Space Models for Multivariate\n Time Series Forecasting","summary":" Recent advancements in multivariate time series forecasting have been\npropelled by Linear-based, Transformer-based, and Convolution-based models,\nwith Transformer-based architectures gaining prominence for their efficacy in\ntemporal and cross-channel mixing. More recently, Mamba, a state space model,\nhas emerged with robust sequence and feature mixing capabilities. However, the\nsuitability of the vanilla Mamba design for time series forecasting remains an\nopen question, particularly due to its inadequate handling of cross-channel\ndependencies. Capturing cross-channel dependencies is critical in enhancing the\nperformance of multivariate time series prediction. Recent findings show that\nself-attention excels in capturing cross-channel dependencies, whereas other\nsimpler mechanisms, such as MLP, may degrade model performance. This is\ncounterintuitive, as MLP, being a learnable architecture, should theoretically\ncapture both correlations and irrelevances, potentially leading to neutral or\nimproved performance. Diving into the self-attention mechanism, we attribute\nthe observed degradation in MLP performance to its lack of data dependence and\nglobal receptive field, which result in MLP's lack of generalization ability.\nBased on the above insights, we introduce a refined Mamba variant tailored for\ntime series forecasting. Our proposed model, \\textbf{CMamba}, incorporates a\nmodified Mamba (M-Mamba) module for temporal dependencies modeling, a global\ndata-dependent MLP (GDD-MLP) to effectively capture cross-channel dependencies,\nand a Channel Mixup mechanism to mitigate overfitting. Comprehensive\nexperiments conducted on seven real-world datasets demonstrate the efficacy of\nour model in improving forecasting performance.\n","authors":["Chaolv Zeng","Zhanyu Liu","Guanjie Zheng","Linghe Kong"],"pdf_url":"https://arxiv.org/pdf/2406.05316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17517v1","updated":"2024-09-26T03:52:41Z","published":"2024-09-26T03:52:41Z","title":"Dataset Distillation-based Hybrid Federated Learning on Non-IID Data","summary":" In federated learning, the heterogeneity of client data has a great impact on\nthe performance of model training. Many heterogeneity issues in this process\nare raised by non-independently and identically distributed (Non-IID) data.\nThis study focuses on the issue of label distribution skew. To address it, we\npropose a hybrid federated learning framework called HFLDD, which integrates\ndataset distillation to generate approximately independent and equally\ndistributed (IID) data, thereby improving the performance of model training.\nParticularly, we partition the clients into heterogeneous clusters, where the\ndata labels among different clients within a cluster are unbalanced while the\ndata labels among different clusters are balanced. The cluster headers collect\ndistilled data from the corresponding cluster members, and conduct model\ntraining in collaboration with the server. This training process is like\ntraditional federated learning on IID data, and hence effectively alleviates\nthe impact of Non-IID data on model training. Furthermore, we compare our\nproposed method with typical baseline methods on public datasets. Experimental\nresults demonstrate that when the data labels are severely imbalanced, the\nproposed HFLDD outperforms the baseline methods in terms of both test accuracy\nand communication cost.\n","authors":["Xiufang Shi","Wei Zhang","Mincheng Wu","Guangyi Liu","Zhenyu Wen","Shibo He","Tejal Shah","Rajiv Ranjan"],"pdf_url":"https://arxiv.org/pdf/2409.17517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11337v3","updated":"2024-09-26T03:51:19Z","published":"2023-02-18T07:40:03Z","title":"Bayesian Matrix Decomposition and Applications","summary":" The sole aim of this book is to give a self-contained introduction to\nconcepts and mathematical tools in Bayesian matrix decomposition in order to\nseamlessly introduce matrix decomposition techniques and their applications in\nsubsequent sections. However, we clearly realize our inability to cover all the\nuseful and interesting results concerning Bayesian matrix decomposition and\ngiven the paucity of scope to present this discussion, e.g., the separated\nanalysis of variational inference for conducting the optimization. We refer the\nreader to literature in the field of Bayesian analysis for a more detailed\nintroduction to the related fields.\n This book is primarily a summary of purpose, significance of important\nBayesian matrix decomposition methods, e.g., real-valued decomposition,\nnonnegative matrix factorization, Bayesian interpolative decomposition, and the\norigin and complexity of the methods which shed light on their applications.\nThe mathematical prerequisite is a first course in statistics and linear\nalgebra. Other than this modest background, the development is self-contained,\nwith rigorous proof provided throughout.\n","authors":["Jun Lu"],"pdf_url":"https://arxiv.org/pdf/2302.11337v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17516v1","updated":"2024-09-26T03:50:55Z","published":"2024-09-26T03:50:55Z","title":"Functional Classification of Spiking Signal Data Using Artificial\n Intelligence Techniques: A Review","summary":" Human brain neuron activities are incredibly significant nowadays. Neuronal\nbehavior is assessed by analyzing signal data such as electroencephalography\n(EEG), which can offer scientists valuable information about diseases and\nhuman-computer interaction. One of the difficulties researchers confront while\nevaluating these signals is the existence of large volumes of spike data.\nSpikes are some considerable parts of signal data that can happen as a\nconsequence of vital biomarkers or physical issues such as electrode movements.\nHence, distinguishing types of spikes is important. From this spot, the spike\nclassification concept commences. Previously, researchers classified spikes\nmanually. The manual classification was not precise enough as it involves\nextensive analysis. Consequently, Artificial Intelligence (AI) was introduced\ninto neuroscience to assist clinicians in classifying spikes correctly. This\nreview discusses the importance and use of AI in spike classification, focusing\non the recognition of neural activity noises. The task is divided into three\nmain components: preprocessing, classification, and evaluation. Existing\nmethods are introduced and their importance is determined. The review also\nhighlights the need for more efficient algorithms. The primary goal is to\nprovide a perspective on spike classification for future research and provide a\ncomprehensive understanding of the methodologies and issues involved. The\nreview organizes materials in the spike classification field for future\nstudies. In this work, numerous studies were extracted from different\ndatabases. The PRISMA-related research guidelines were then used to choose\npapers. Then, research studies based on spike classification using machine\nlearning and deep learning approaches with effective preprocessing were\nselected.\n","authors":["Danial Sharifrazi","Nouman Javed","Javad Hassannataj Joloudari","Roohallah Alizadehsani","Prasad N. Paradkar","Ru-San Tan","U. Rajendra Acharya","Asim Bhatti"],"pdf_url":"https://arxiv.org/pdf/2409.17516v1.pdf","comment":"8 figures, 32 pages"},{"id":"http://arxiv.org/abs/2409.17513v1","updated":"2024-09-26T03:48:47Z","published":"2024-09-26T03:48:47Z","title":"Comparing Unidirectional, Bidirectional, and Word2vec Models for\n Discovering Vulnerabilities in Compiled Lifted Code","summary":" Ransomware and other forms of malware cause significant financial and\noperational damage to organizations by exploiting long-standing and often\ndifficult-to-detect software vulnerabilities. To detect vulnerabilities such as\nbuffer overflows in compiled code, this research investigates the application\nof unidirectional transformer-based embeddings, specifically GPT-2. Using a\ndataset of LLVM functions, we trained a GPT-2 model to generate embeddings,\nwhich were subsequently used to build LSTM neural networks to differentiate\nbetween vulnerable and non-vulnerable code. Our study reveals that embeddings\nfrom the GPT-2 model significantly outperform those from bidirectional models\nof BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%.\nLSTM neural networks were developed with both frozen and unfrozen embedding\nmodel layers. The model with the highest performance was achieved when the\nembedding layers were unfrozen. Further, the research finds that, in exploring\nthe impact of different optimizers within this domain, the SGD optimizer\ndemonstrates superior performance over Adam. Overall, these findings reveal\nimportant insights into the potential of unidirectional transformer-based\napproaches in enhancing cybersecurity defenses.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2409.17513v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17510v1","updated":"2024-09-26T03:40:12Z","published":"2024-09-26T03:40:12Z","title":"NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human\n Connectomes","summary":" Although modern imaging technologies allow us to study connectivity between\ntwo distinct brain regions in-vivo, an in-depth understanding of how anatomical\nstructure supports brain function and how spontaneous functional fluctuations\nemerge remarkable cognition is still elusive. Meanwhile, tremendous efforts\nhave been made in the realm of machine learning to establish the nonlinear\nmapping between neuroimaging data and phenotypic traits. However, the absence\nof neuroscience insight in the current approaches poses significant challenges\nin understanding cognitive behavior from transient neural activities. To\naddress this challenge, we put the spotlight on the coupling mechanism of\nstructural connectivity (SC) and functional connectivity (FC) by formulating\nsuch network neuroscience question into an expressive graph representation\nlearning problem for high-order topology. Specifically, we introduce the\nconcept of topological detour to characterize how a ubiquitous instance of FC\n(direct link) is supported by neural pathways (detour) physically wired by SC,\nwhich forms a cyclic loop interacted by brain structure and function. In the\nclich\\'e of machine learning, the multi-hop detour pathway underlying SC-FC\ncoupling allows us to devise a novel multi-head self-attention mechanism within\nTransformer to capture multi-modal feature representation from paired graphs of\nSC and FC. Taken together, we propose a biological-inspired deep model, coined\nas NeuroPath, to find putative connectomic feature representations from the\nunprecedented amount of neuroimages, which can be plugged into various\ndownstream applications such as task recognition and disease diagnosis. We have\nevaluated NeuroPath on large-scale public datasets including HCP and UK Biobank\nunder supervised and zero-shot learning, where the state-of-the-art performance\nby our NeuroPath indicates great potential in network neuroscience.\n","authors":["Ziquan Wei","Tingting Dan","Jiaqi Ding","Paul J Laurienti","Guorong Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17510v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17508v1","updated":"2024-09-26T03:33:26Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization, recent advances primarily focus on improving the LLM\ncomponents, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector. Extensive ablation experiments\nvalidate the effectiveness of introducing CMoE under any configuration, with up\nto an average 8% performance gains. We further provide interpretation analysis\nof the tug-of-war problem from the perspective of gradient optimization and\nparameter statistics. Compared to previous state-of-the-art medical MLLMs,\nUni-Med achieves competitive or superior evaluation metrics on diverse tasks.\nCode, data and model will be soon available at GitHub.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17505v1","updated":"2024-09-26T03:24:59Z","published":"2024-09-26T03:24:59Z","title":"Sequential Kernelized Stein Discrepancy","summary":" We present a sequential version of the kernelized Stein discrepancy, which\nallows for conducting goodness-of-fit tests for unnormalized densities that are\ncontinuously monitored and adaptively stopped. That is, the sample size need\nnot be fixed prior to data collection; the practitioner can choose whether to\nstop the test or continue to gather evidence at any time while controlling the\nfalse discovery rate. In stark contrast to related literature, we do not impose\nuniform boundedness on the Stein kernel. Instead, we exploit the potential\nboundedness of the Stein kernel at arbitrary point evaluations to define test\nmartingales, that give way to the subsequent novel sequential tests. We prove\nthe validity of the test, as well as an asymptotic lower bound for the\nlogarithmic growth of the wealth process under the alternative. We further\nillustrate the empirical performance of the test with a variety of\ndistributions, including restricted Boltzmann machines.\n","authors":["Diego Martinez-Taboada","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2409.17505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17504v1","updated":"2024-09-26T03:22:09Z","published":"2024-09-26T03:22:09Z","title":"HaloScope: Harnessing Unlabeled LLM Generations for Hallucination\n Detection","summary":" The surge in applications of large language models (LLMs) has prompted\nconcerns about the generation of misleading or fabricated information, known as\nhallucinations. Therefore, detecting hallucinations has become critical to\nmaintaining trust in LLM-generated content. A primary challenge in learning a\ntruthfulness classifier is the lack of a large amount of labeled truthful and\nhallucinated data. To address the challenge, we introduce HaloScope, a novel\nlearning framework that leverages the unlabeled LLM generations in the wild for\nhallucination detection. Such unlabeled data arises freely upon deploying LLMs\nin the open world, and consists of both truthful and hallucinated information.\nTo harness the unlabeled data, we present an automated membership estimation\nscore for distinguishing between truthful and untruthful generations within\nunlabeled mixture data, thereby enabling the training of a binary truthfulness\nclassifier on top. Importantly, our framework does not require extra data\ncollection and human annotations, offering strong flexibility and practicality\nfor real-world applications. Extensive experiments show that HaloScope can\nachieve superior hallucination detection performance, outperforming the\ncompetitive rivals by a significant margin. Code is available at\nhttps://github.com/deeplearningwisc/haloscope.\n","authors":["Xuefeng Du","Chaowei Xiao","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.17504v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17502v1","updated":"2024-09-26T03:20:09Z","published":"2024-09-26T03:20:09Z","title":"Broadcast Product: Shape-aligned Element-wise Multiplication and Beyond","summary":" We propose a new operator defined between two tensors, the broadcast product.\nThe broadcast product calculates the Hadamard product after duplicating\nelements to align the shapes of the two tensors. Complex tensor operations in\nlibraries like \\texttt{numpy} can be succinctly represented as mathematical\nexpressions using the broadcast product. Finally, we propose a novel tensor\ndecomposition using the broadcast product, highlighting its potential\napplications in dimensionality reduction.\n","authors":["Yusuke Matsui","Tatsuya Yokota"],"pdf_url":"https://arxiv.org/pdf/2409.17502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17499v1","updated":"2024-09-26T03:12:20Z","published":"2024-09-26T03:12:20Z","title":"Does Worst-Performing Agent Lead the Pack? Analyzing Agent Dynamics in\n Unified Distributed SGD","summary":" Distributed learning is essential to train machine learning algorithms across\nheterogeneous agents while maintaining data privacy. We conduct an asymptotic\nanalysis of Unified Distributed SGD (UD-SGD), exploring a variety of\ncommunication patterns, including decentralized SGD and local SGD within\nFederated Learning (FL), as well as the increasing communication interval in\nthe FL setting. In this study, we assess how different sampling strategies,\nsuch as i.i.d. sampling, shuffling, and Markovian sampling, affect the\nconvergence speed of UD-SGD by considering the impact of agent dynamics on the\nlimiting covariance matrix as described in the Central Limit Theorem (CLT). Our\nfindings not only support existing theories on linear speedup and asymptotic\nnetwork independence, but also theoretically and empirically show how efficient\nsampling strategies employed by individual agents contribute to overall\nconvergence in UD-SGD. Simulations reveal that a few agents using highly\nefficient sampling can achieve or surpass the performance of the majority\nemploying moderately improved strategies, providing new insights beyond\ntraditional analyses focusing on the worst-performing agent.\n","authors":["Jie Hu","Yi-Ting Ma","Do Young Eun"],"pdf_url":"https://arxiv.org/pdf/2409.17499v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.14578v4","updated":"2024-09-26T02:59:44Z","published":"2024-05-23T13:52:36Z","title":"Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling","summary":" In current deep learning tasks, Adam style optimizers such as Adam, Adagrad,\nRMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style\noptimizers. These optimizers typically update model parameters using the sign\nof gradients, resulting in more stable convergence curves. The learning rate\nand the batch size are the most critical hyperparameters for optimizers, which\nrequire careful tuning to enable effective convergence. Previous research has\nshown that the optimal learning rate increases linearly or follows similar\nrules with batch size for SGD style optimizers. However, this conclusion is not\napplicable to Adam style optimizers. In this paper, we elucidate the connection\nbetween optimal learning rates and batch sizes for Adam style optimizers\nthrough both theoretical analysis and extensive experiments. First, we raise\nthe scaling law between batch sizes and optimal learning rates in the sign of\ngradient case, in which we prove that the optimal learning rate first rises and\nthen falls as the batch size increases. Moreover, the peak value of the surge\nwill gradually move toward the larger batch size as training progresses.\nSecond, we conducted experiments on various CV and NLP tasks and verified the\ncorrectness of the scaling law.\n","authors":["Shuaipeng Li","Penghao Zhao","Hailin Zhang","Xingwu Sun","Hao Wu","Dian Jiao","Weiyan Wang","Chengjun Liu","Zheng Fang","Jinbao Xue","Yangyu Tao","Bin Cui","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14578v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2409.17490v1","updated":"2024-09-26T02:54:19Z","published":"2024-09-26T02:54:19Z","title":"MathDSL: A Domain-Specific Language for Concise Mathematical Solutions\n Via Program Synthesis","summary":" We present MathDSL, a Domain-Specific Language (DSL) for mathematical\nequation solving, which, when deployed in program synthesis models, outperforms\nstate-of-the-art reinforcement-learning-based methods. We also introduce a\nquantitative metric for measuring the conciseness of a mathematical solution\nand demonstrate the improvement in the quality of generated solutions compared\nto other methods. Our system demonstrates that a program synthesis system\n(DreamCoder) using MathDSL can generate programs that solve linear equations\nwith greater accuracy and conciseness than using reinforcement learning\nsystems. Additionally, we demonstrate that if we use the action spaces of\nprevious reinforcement learning systems as DSLs, MathDSL outperforms the\naction-space-DSLs. We use DreamCoder to store equation-solving strategies as\nlearned abstractions in its program library and demonstrate that by using\nMathDSL, these can be converted into human-interpretable solution strategies\nthat could have applications in mathematical education.\n","authors":["Sagnik Anupam","Maddy Bowers","Omar Costilla-Reyes","Armando Solar-Lezama"],"pdf_url":"https://arxiv.org/pdf/2409.17490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04585v2","updated":"2024-09-26T02:53:15Z","published":"2024-01-09T14:42:49Z","title":"EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization\n of Diffusion Models","summary":" Diffusion models have achieved great success in image generation tasks\nthrough iterative noise estimation. However, the heavy denoising process and\ncomplex neural networks hinder their low-latency applications in real-world\nscenarios. Quantization can effectively reduce model complexity, and\npost-training quantization (PTQ), which does not require fine-tuning, is highly\npromising for compressing and accelerating diffusion models. Unfortunately, we\nfind that due to the highly dynamic distribution of activations in different\ndenoising steps, existing PTQ methods for diffusion models suffer from\ndistribution mismatch issues at both calibration sample level and\nreconstruction output level, which makes the performance far from satisfactory,\nespecially in low-bit cases. In this paper, we propose Enhanced Distribution\nAlignment for Post-Training Quantization of Diffusion Models (EDA-DM) to\naddress the above issues. Specifically, at the calibration sample level, we\nselect calibration samples based on the density and variety in the latent\nspace, thus facilitating the alignment of their distribution with the overall\nsamples; and at the reconstruction output level, we modify the loss of block\nreconstruction with the losses of layers, aligning the outputs of quantized\nmodel and full-precision model at different network granularity. Extensive\nexperiments demonstrate that EDA-DM significantly outperforms the existing PTQ\nmethods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and\ndifferent datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO).\n","authors":["Xuewen Liu","Zhikai Li","Junrui Xiao","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2401.04585v2.pdf","comment":"Code: http://github.com/BienLuky/EDA-DM"}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17678v1","updated":"2024-09-26T09:37:04Z","published":"2024-09-26T09:37:04Z","title":"Modeling the Popularity of Events on Web by Sparsity and\n Mutual-Excitation Guided Graph Neural Network","summary":" The content of a webpage described or posted an event in the cyberspace\ninevitably reflects viewpoints, values and trends of the physical society.\nMapping an event on web to the popularity score plays a pivot role to sense the\nsocial trends from the cyberspace. However, the complex semantic correspondence\nbetween texts and images, as well as the implicit text-image-popularity mapping\nmechanics pose a significant challenge to this non-trivial task. In this paper,\nwe address this problem from a viewpoint of understanding the interpretable\nmapping mechanics. Concretely, we organize the keywords from different events\ninto an unified graph. The unified graph facilitates to model the popularity of\nevents via two-level mappings, i.e., the self excitation and the mutual\nexcitation. The self-excitation assumes that each keyword forms the popularity\nwhile the mutual-excitation models that two keywords would excite each other to\ndetermine the popularity of an event. Specifically, we use Graph Neural Network\n(GNN) as the backbone to model the self-excitation, the mutual excitation and\nthe context of images into a sparse and deep factor model. Besides, to our best\nknowledge, we release a challenge web event dataset for the popularity\nprediction task. The experimental results on three public datasets demonstrate\nthat our method achieves significant improvements and outperforms the\nstate-of-the-art methods. Dataset is publicly available at:\nhttps://github.com/pangjunbiao/Hot-events-dataset.\n","authors":["Jiaxin Deng","Linlin Jia","Junbiao Pang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17596v1","updated":"2024-09-26T07:22:38Z","published":"2024-09-26T07:22:38Z","title":"Subjective and Objective Quality-of-Experience Evaluation Study for Live\n Video Streaming","summary":" In recent years, live video streaming has gained widespread popularity across\nvarious social media platforms. Quality of experience (QoE), which reflects\nend-users' satisfaction and overall experience, plays a critical role for media\nservice providers to optimize large-scale live compression and transmission\nstrategies to achieve perceptually optimal rate-distortion trade-off. Although\nmany QoE metrics for video-on-demand (VoD) have been proposed, there remain\nsignificant challenges in developing QoE metrics for live video streaming. To\nbridge this gap, we conduct a comprehensive study of subjective and objective\nQoE evaluations for live video streaming. For the subjective QoE study, we\nintroduce the first live video streaming QoE dataset, TaoLive QoE, which\nconsists of $42$ source videos collected from real live broadcasts and $1,155$\ncorresponding distorted ones degraded due to a variety of streaming\ndistortions, including conventional streaming distortions such as compression,\nstalling, as well as live streaming-specific distortions like frame skipping,\nvariable frame rate, etc. Subsequently, a human study was conducted to derive\nsubjective QoE scores of videos in the TaoLive QoE dataset. For the objective\nQoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well\nas publicly available QoE datasets for VoD scenarios, highlighting that current\nmodels struggle to accurately assess video QoE, particularly for live content.\nHence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates\nmulti-scale semantic features and optical flow-based motion features to\npredicting a retrospective QoE score, eliminating reliance on statistical\nquality of service (QoS) features.\n","authors":["Zehao Zhu","Wei Sun","Jun Jia","Wei Wu","Sibin Deng","Kai Li","Ying Chen","Xiongkuo Min","Jia Wang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.17596v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17550v1","updated":"2024-09-26T05:39:52Z","published":"2024-09-26T05:39:52Z","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective\n Adaptation of Audio and Video Diffusion Models for Joint Generation","summary":" In this work, we build a simple but strong baseline for sounding video\ngeneration. Given base diffusion models for audio and video, we integrate them\nwith additional modules into a single model and train it to make the model\njointly generate audio and video. To enhance alignment between audio-video\npairs, we introduce two novel mechanisms in our model. The first one is\ntimestep adjustment, which provides different timestep information to each base\nmodel. It is designed to align how samples are generated along with timesteps\nacross modalities. The second one is a new design of the additional modules,\ntermed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE,\ncross-modal information is embedded as if it represents temporal position\ninformation, and the embeddings are fed into the model like positional\nencoding. Compared with the popular cross-attention mechanism, CMC-PE provides\na better inductive bias for temporal alignment in the generated data.\nExperimental results validate the effectiveness of the two newly introduced\nmechanisms and also demonstrate that our method outperforms existing methods.\n","authors":["Masato Ishii","Akio Hayakawa","Takashi Shibuya","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.17550v1.pdf","comment":"The source code will be released soon"},{"id":"http://arxiv.org/abs/2408.00970v2","updated":"2024-09-26T03:56:00Z","published":"2024-08-02T01:30:18Z","title":"Multimodal Fusion via Hypergraph Autoencoder and Contrastive Learning\n for Emotion Recognition in Conversation","summary":" Multimodal emotion recognition in conversation (MERC) seeks to identify the\nspeakers' emotions expressed in each utterance, offering significant potential\nacross diverse fields. The challenge of MERC lies in balancing speaker modeling\nand context modeling, encompassing both long-distance and short-distance\ncontexts, as well as addressing the complexity of multimodal information\nfusion. Recent research adopts graph-based methods to model intricate\nconversational relationships effectively. Nevertheless, the majority of these\nmethods utilize a fixed fully connected structure to link all utterances,\nrelying on convolution to interpret complex context. This approach can\ninherently heighten the redundancy in contextual messages and excessive graph\nnetwork smoothing, particularly in the context of long-distance conversations.\nTo address this issue, we propose a framework that dynamically adjusts\nhypergraph connections by variational hypergraph autoencoder (VHGAE), and\nemploys contrastive learning to mitigate uncertainty factors during the\nreconstruction process. Experimental results demonstrate the effectiveness of\nour proposal against the state-of-the-art methods on IEMOCAP and MELD datasets.\nWe release the code to support the reproducibility of this work at\nhttps://github.com/yzjred/-HAUCL.\n","authors":["Zijian Yi","Ziming Zhao","Zhishu Shen","Tiehua Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.00970v2.pdf","comment":"Accepted by ACM MULTIMEDIA 2024"},{"id":"http://arxiv.org/abs/2404.09245v2","updated":"2024-09-26T01:25:22Z","published":"2024-04-14T13:14:13Z","title":"Arena: A Patch-of-Interest ViT Inference Acceleration System for\n Edge-Assisted Video Analytics","summary":" The advent of edge computing has made real-time intelligent video analytics\nfeasible. Previous works, based on traditional model architecture (e.g., CNN,\nRNN, etc.), employ various strategies to filter out non-region-of-interest\ncontent to minimize bandwidth and computation consumption but show inferior\nperformance in adverse environments. Recently, visual foundation models based\non transformers have shown great performance in adverse environments due to\ntheir amazing generalization capability. However, they require a large amount\nof computation power, which limits their applications in real-time intelligent\nvideo analytics. In this paper, we find visual foundation models like Vision\nTransformer (ViT) also have a dedicated acceleration mechanism for video\nanalytics. To this end, we introduce Arena, an end-to-end edge-assisted video\ninference acceleration system based on ViT. We leverage the capability of ViT\nthat can be accelerated through token pruning by only offloading and feeding\nPatches-of-Interest to the downstream models. Additionally, we design an\nadaptive keyframe inference switching algorithm tailored to different videos,\ncapable of adapting to the current video content to jointly optimize accuracy\nand bandwidth. Through extensive experiments, our findings reveal that Arena\ncan boost inference speeds by up to 1.58\\(\\times\\) and 1.82\\(\\times\\) on\naverage while consuming only 47\\% and 31\\% of the bandwidth, respectively, all\nwith high inference accuracy.\n","authors":["Haosong Peng","Wei Feng","Hao Li","Yufeng Zhan","Ren Jin","Yuanqing Xia"],"pdf_url":"https://arxiv.org/pdf/2404.09245v2.pdf","comment":null}],"Robotics":[{"id":"http://arxiv.org/abs/2409.18122v1","updated":"2024-09-26T17:58:05Z","published":"2024-09-26T17:58:05Z","title":"RT-GuIDE: Real-Time Gaussian splatting for Information-Driven\n Exploration","summary":" We propose a framework for active mapping and exploration that leverages\nGaussian splatting for constructing information-rich maps. Further, we develop\na parallelized motion planning algorithm that can exploit the Gaussian map for\nreal-time navigation. The Gaussian map constructed onboard the robot is\noptimized for both photometric and geometric quality while enabling real-time\nsituational awareness for autonomy. We show through simulation experiments that\nour method is competitive with approaches that use alternate information gain\nmetrics, while being orders of magnitude faster to compute. In real-world\nexperiments, our algorithm achieves better map quality (10% higher Peak\nSignal-to-Noise Ratio (PSNR) and 30% higher geometric reconstruction accuracy)\nthan Gaussian maps constructed by traditional exploration baselines. Experiment\nvideos and more details can be found on our project page:\nhttps://tyuezhan.github.io/RT_GuIDE/\n","authors":["Yuezhan Tao","Dexter Ong","Varun Murali","Igor Spasojevic","Pratik Chaudhari","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.18122v1.pdf","comment":"Submitted to ICRA2025"},{"id":"http://arxiv.org/abs/2409.18121v1","updated":"2024-09-26T17:57:16Z","published":"2024-09-26T17:57:16Z","title":"Robot See Robot Do: Imitating Articulated Object Manipulation with\n Monocular 4D Reconstruction","summary":" Humans can learn to manipulate new objects by simply watching others;\nproviding robots with the ability to learn from such demonstrations would\nenable a natural interface specifying new behaviors. This work develops Robot\nSee Robot Do (RSRD), a method for imitating articulated object manipulation\nfrom a single monocular RGB human demonstration given a single static\nmulti-view object scan. We first propose 4D Differentiable Part Models\n(4D-DPM), a method for recovering 3D part motion from a monocular video with\ndifferentiable rendering. This analysis-by-synthesis approach uses part-centric\nfeature fields in an iterative optimization which enables the use of geometric\nregularizers to recover 3D motions from only a single video. Given this 4D\nreconstruction, the robot replicates object trajectories by planning bimanual\narm motions that induce the demonstrated object part motion. By representing\ndemonstrations as part-centric trajectories, RSRD focuses on replicating the\ndemonstration's intended behavior while considering the robot's own\nmorphological limits, rather than attempting to reproduce the hand's motion. We\nevaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part\ntrajectories and RSRD's physical execution performance on 9 objects across 10\ntrials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of\n87% success rate, for a total end-to-end success rate of 60% across 90 trials.\nNotably, this is accomplished using only feature fields distilled from large\npretrained vision models -- without any task-specific training, fine-tuning,\ndataset collection, or annotation. Project page:\nhttps://robot-see-robot-do.github.io\n","authors":["Justin Kerr","Chung Min Kim","Mingxuan Wu","Brent Yi","Qianqian Wang","Ken Goldberg","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2409.18121v1.pdf","comment":"CoRL 2024, Project page: https://robot-see-robot-do.github.io"},{"id":"http://arxiv.org/abs/2409.18120v1","updated":"2024-09-26T17:57:15Z","published":"2024-09-26T17:57:15Z","title":"EvMAPPER: High Altitude Orthomapping with Event Cameras","summary":" Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to\ncollect images about the world below. One of the most successful applications\nof UAVs is to generate orthomosaics or orthomaps, in which a series of images\nare integrated together to develop a larger map. However, the use of CMOS-based\ncameras with global or rolling shutters mean that orthomaps are vulnerable to\nchallenging light conditions, motion blur, and high-speed motion of\nindependently moving objects under the camera. Event cameras are less sensitive\nto these issues, as their pixels are able to trigger asynchronously on\nbrightness changes. This work introduces the first orthomosaic approach using\nevent cameras. In contrast to existing methods relying only on CMOS cameras,\nour approach enables map generation even in challenging light conditions,\nincluding direct sunlight and after sunset.\n","authors":["Fernando Cladera","Kenneth Chaney","M. Ani Hsieh","Camillo J. Taylor","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.18120v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18108v1","updated":"2024-09-26T17:51:31Z","published":"2024-09-26T17:51:31Z","title":"Language-Embedded Gaussian Splats (LEGS): Incrementally Building\n Room-Scale Representations with a Mobile Robot","summary":" Building semantic 3D maps is valuable for searching for objects of interest\nin offices, warehouses, stores, and homes. We present a mapping system that\nincrementally builds a Language-Embedded Gaussian Splat (LEGS): a detailed 3D\nscene representation that encodes both appearance and semantics in a unified\nrepresentation. LEGS is trained online as a robot traverses its environment to\nenable localization of open-vocabulary object queries. We evaluate LEGS on 4\nroom-scale scenes where we query for objects in the scene to assess how LEGS\ncan capture semantic meaning. We compare LEGS to LERF and find that while both\nsystems have comparable object query success rates, LEGS trains over 3.5x\nfaster than LERF. Results suggest that a multi-camera setup and incremental\nbundle adjustment can boost visual reconstruction quality in constrained robot\ntrajectories, and suggest LEGS can localize open-vocabulary and long-tail\nobject queries with up to 66% accuracy.\n","authors":["Justin Yu","Kush Hari","Kishore Srinivas","Karim El-Refai","Adam Rashid","Chung Min Kim","Justin Kerr","Richard Cheng","Muhammad Zubair Irshad","Ashwin Balakrishna","Thomas Kollar","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2409.18108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18098v1","updated":"2024-09-26T17:41:35Z","published":"2024-09-26T17:41:35Z","title":"StackGen: Generating Stable Structures from Silhouettes via Diffusion","summary":" Humans naturally obtain intuition about the interactions between and the\nstability of rigid objects by observing and interacting with the world. It is\nthis intuition that governs the way in which we regularly configure objects in\nour environment, allowing us to build complex structures from simple, everyday\nobjects. Robotic agents, on the other hand, traditionally require an explicit\nmodel of the world that includes the detailed geometry of each object and an\nanalytical model of the environment dynamics, which are difficult to scale and\npreclude generalization. Instead, robots would benefit from an awareness of\nintuitive physics that enables them to similarly reason over the stable\ninteraction of objects in their environment. Towards that goal, we propose\nStackGen, a diffusion model that generates diverse stable configurations of\nbuilding blocks matching a target silhouette. To demonstrate the capability of\nthe method, we evaluate it in a simulated environment and deploy it in the real\nsetting using a robotic arm to assemble structures generated by the model.\n","authors":["Luzhe Sun","Takuma Yoneda","Samuel W. Wheeler","Tianchong Jiang","Matthew R. Walter"],"pdf_url":"https://arxiv.org/pdf/2409.18098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18097v1","updated":"2024-09-26T17:41:04Z","published":"2024-09-26T17:41:04Z","title":"A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale\n Autonomous Vehicle","summary":" In recent years, several competitions have highlighted the need to\ninvestigate vision-based solutions to address scenarios with functional\ninsufficiencies in perception, world modeling and localization. This article\npresents the Vision-based Lane Keeping System (VbLKS) developed by the\nDEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022.\nThe main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied\nVbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a\ntailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading\nError (LHE), is estimated at a constant lookahead distance employing a\nConvolutional Neural Network (CNN). A training strategy for a compact CNN is\nproposed, emphasizing data generation and augmentation on simulated camera\nimages from a 3D Gazebo simulator, and enabling real-time operation on\nlow-level hardware. A tailored PP-based lateral controller equipped with a\nderivative action and a PP-based velocity reference generation are implemented.\nTuning ranges are established through a systematic time-delay stability\nanalysis. Validation in a representative controlled laboratory setting is\nprovided.\n","authors":["Antonio Gallina","Matteo Grandin","Angelo Cenedese","Mattia Bruschetta"],"pdf_url":"https://arxiv.org/pdf/2409.18097v1.pdf","comment":"16 pages, 23 figures"},{"id":"http://arxiv.org/abs/2409.18092v1","updated":"2024-09-26T17:39:05Z","published":"2024-09-26T17:39:05Z","title":"DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion\n Probabilistic Models","summary":" Perception systems play a crucial role in autonomous driving, incorporating\nmultiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors\nare widely used to capture sparse point clouds of the vehicle's surroundings.\nHowever, such systems struggle to perceive occluded areas and gaps in the scene\ndue to the sparsity of these point clouds and their lack of semantics. To\naddress these challenges, Semantic Scene Completion (SSC) jointly predicts\nunobserved geometry and semantics in the scene given raw LiDAR measurements,\naiming for a more complete scene representation. Building on promising results\nof diffusion models in image generation and super-resolution tasks, we propose\ntheir extension to SSC by implementing the noising and denoising diffusion\nprocesses in the point and semantic spaces individually. To control the\ngeneration, we employ semantic LiDAR point clouds as conditional input and\ndesign local and global regularization losses to stabilize the denoising\nprocess. We evaluate our approach on autonomous driving datasets and our\napproach outperforms the state-of-the-art for SSC.\n","authors":["Helin Cao","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2409.18092v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.18084v1","updated":"2024-09-26T17:27:15Z","published":"2024-09-26T17:27:15Z","title":"GSON: A Group-based Social Navigation Framework with Large Multimodal\n Model","summary":" As the number of service robots and autonomous vehicles in human-centered\nenvironments grows, their requirements go beyond simply navigating to a\ndestination. They must also take into account dynamic social contexts and\nensure respect and comfort for others in shared spaces, which poses significant\nchallenges for perception and planning. In this paper, we present a group-based\nsocial navigation framework GSON to enable mobile robots to perceive and\nexploit the social group of their surroundings by leveling the visual reasoning\ncapability of the Large Multimodal Model (LMM). For perception, we apply visual\nprompting techniques to zero-shot extract the social relationship among\npedestrians and combine the result with a robust pedestrian detection and\ntracking pipeline to alleviate the problem of low inference speed of the LMM.\nGiven the perception result, the planning system is designed to avoid\ndisrupting the current social structure. We adopt a social structure-based\nmid-level planner as a bridge between global path planning and local motion\nplanning to preserve the global context and reactive response. The proposed\nmethod is validated on real-world mobile robot navigation tasks involving\ncomplex social structure understanding and reasoning. Experimental results\ndemonstrate the effectiveness of the system in these scenarios compared with\nseveral baselines.\n","authors":["Shangyi Luo","Ji Zhu","Peng Sun","Yuhong Deng","Cunjun Yu","Anxing Xiao","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18082v1","updated":"2024-09-26T17:26:16Z","published":"2024-09-26T17:26:16Z","title":"SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language\n Models for Robotic Garment Manipulation","summary":" Automating garment manipulation poses a significant challenge for assistive\nrobotics due to the diverse and deformable nature of garments. Traditional\napproaches typically require separate models for each garment type, which\nlimits scalability and adaptability. In contrast, this paper presents a unified\napproach using vision-language models (VLMs) to improve keypoint prediction\nacross various garment categories. By interpreting both visual and semantic\ninformation, our model enables robots to manage different garment states with a\nsingle model. We created a large-scale synthetic dataset using advanced\nsimulation techniques, allowing scalable training without extensive real-world\ndata. Experimental results indicate that the VLM-based method significantly\nenhances keypoint detection accuracy and task success rates, providing a more\nflexible and general solution for robotic garment manipulation. In addition,\nthis research also underscores the potential of VLMs to unify various garment\nmanipulation tasks within a single framework, paving the way for broader\napplications in home automation and assistive robotics for future.\n","authors":["Xin Li","Siyuan Huang","Qiaojun Yu","Zhengkai Jiang","Ce Hao","Yimeng Zhu","Hongsheng Li","Peng Gao","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18053v1","updated":"2024-09-26T16:58:04Z","published":"2024-09-26T16:58:04Z","title":"DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving","summary":" We present a novel autonomous driving framework, DualAD, designed to imitate\nhuman reasoning during driving. DualAD comprises two layers: a rule-based\nmotion planner at the bottom layer that handles routine driving tasks requiring\nminimal reasoning, and an upper layer featuring a rule-based text encoder that\nconverts driving scenarios from absolute states into text description. This\ntext is then processed by a large language model (LLM) to make driving\ndecisions. The upper layer intervenes in the bottom layer's decisions when\npotential danger is detected, mimicking human reasoning in critical situations.\nClosed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained\nmodel, significantly outperforms rule-based motion planners that lack reasoning\nabilities. Our experiments also highlight the effectiveness of the text\nencoder, which considerably enhances the model's scenario understanding.\nAdditionally, the integrated DualAD model improves with stronger LLMs,\nindicating the framework's potential for further enhancement. We make code and\nbenchmarks publicly available.\n","authors":["Dingrui Wang","Marc Kaufeld","Johannes Betz"],"pdf_url":"https://arxiv.org/pdf/2409.18053v1.pdf","comment":"Autonomous Driving, Large Language Models (LLMs), Human Reasoning,\n Critical Scenario"},{"id":"http://arxiv.org/abs/2409.18052v1","updated":"2024-09-26T16:55:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems - which account for almost all current\nAI - can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborates on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18047v1","updated":"2024-09-26T16:48:21Z","published":"2024-09-26T16:48:21Z","title":"HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams","summary":" This paper presents a novel approach to multi-robot planning and\ncollaboration. We demonstrate a cognitive strategy for robots in human-robot\nteams that incorporates metacognition, natural language communication, and\nexplainability. The system is embodied using the HARMONIC architecture that\nflexibly integrates cognitive and control capabilities across the team. We\nevaluate our approach through simulation experiments involving a joint search\ntask by a team of heterogeneous robots (a UGV and a drone) and a human. We\ndetail the system's handling of complex, real-world scenarios, effective action\ncoordination between robots with different capabilities, and natural\nhuman-robot communication. This work demonstrates that the robots' ability to\nreason about plans, goals, and attitudes, and to provide explanations for\nactions and decisions are essential prerequisites for realistic human-robot\nteaming.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18047v1.pdf","comment":"Submitted to ICRA 2025 Conference, Atlanta, GA, USA"},{"id":"http://arxiv.org/abs/2409.18038v1","updated":"2024-09-26T16:42:53Z","published":"2024-09-26T16:42:53Z","title":"MMDVS-LF: A Multi-Modal Dynamic-Vision-Sensor Line Following Dataset","summary":" Dynamic Vision Sensors (DVS), offer a unique advantage in control\napplications, due to their high temporal resolution, and asynchronous\nevent-based data. Still, their adoption in machine learning algorithms remains\nlimited. To address this gap, and promote the development of models that\nleverage the specific characteristics of DVS data, we introduce the Multi-Modal\nDynamic-Vision-Sensor Line Following dataset (MMDVS-LF). This comprehensive\ndataset, is the first to integrate multiple sensor modalities, including DVS\nrecordings, RGB video, odometry, and Inertial Measurement Unit (IMU) data, from\na small-scale standardized vehicle. Additionally, the dataset includes\neye-tracking and demographic data of drivers performing a Line Following task\non a track. With its diverse range of data, MMDVS-LF opens new opportunities\nfor developing deep learning algorithms, and conducting data science projects\nacross various domains, supporting innovation in autonomous systems and control\napplications.\n","authors":["Felix Resch","Mónika Farsang","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2409.18038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18037v1","updated":"2024-09-26T16:42:13Z","published":"2024-09-26T16:42:13Z","title":"HARMONIC: A Framework for Explanatory Cognitive Robots","summary":" We present HARMONIC, a framework for implementing cognitive robots that\ntransforms general-purpose robots into trusted teammates capable of complex\ndecision-making, natural communication and human-level explanation. The\nframework supports interoperability between a strategic (cognitive) layer for\nhigh-level decision-making and a tactical (robot) layer for low-level control\nand execution. We describe the core features of the framework and our initial\nimplementation, in which HARMONIC was deployed on a simulated UGV and drone\ninvolved in a multi-robot search and retrieval task.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18037v1.pdf","comment":"Accepted for presentation at ICRA@40. 23-26 September 2024,\n Rotterdam, Netherlands"},{"id":"http://arxiv.org/abs/2409.18031v1","updated":"2024-09-26T16:38:44Z","published":"2024-09-26T16:38:44Z","title":"Reasoning Multi-Agent Behavioral Topology for Interactive Autonomous\n Driving","summary":" Autonomous driving system aims for safe and social-consistent driving through\nthe behavioral integration among interactive agents. However, challenges remain\ndue to multi-agent scene uncertainty and heterogeneous interaction. Current\ndense and sparse behavioral representations struggle with inefficiency and\ninconsistency in multi-agent modeling, leading to instability of collective\nbehavioral patterns when integrating prediction and planning (IPP). To address\nthis, we initiate a topological formation that serves as a compliant behavioral\nforeground to guide downstream trajectory generations. Specifically, we\nintroduce Behavioral Topology (BeTop), a pivotal topological formulation that\nexplicitly represents the consensual behavioral pattern among multi-agent\nfuture. BeTop is derived from braid theory to distill compliant interactive\ntopology from multi-agent future trajectories. A synergistic learning framework\n(BeTopNet) supervised by BeTop facilitates the consistency of behavior\nprediction and planning within the predicted topology priors. Through imitative\ncontingency learning, BeTop also effectively manages behavioral uncertainty for\nprediction and planning. Extensive verification on large-scale real-world\ndatasets, including nuPlan and WOMD, demonstrates that BeTop achieves\nstate-of-the-art performance in both prediction and planning tasks. Further\nvalidations on the proposed interactive scenario benchmark showcase planning\ncompliance in interactive cases.\n","authors":["Haochen Liu","Li Chen","Yu Qiao","Chen Lv","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2409.18031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.18026v1","updated":"2024-09-26T16:33:16Z","published":"2024-09-26T16:33:16Z","title":"ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty\n Learning","summary":" Vision-centric semantic occupancy prediction plays a crucial role in\nautonomous driving, which requires accurate and reliable predictions from\nlow-cost sensors. Although having notably narrowed the accuracy gap with LiDAR,\nthere is still few research effort to explore the reliability in predicting\nsemantic occupancy from camera. In this paper, we conduct a comprehensive\nevaluation of existing semantic occupancy prediction models from a reliability\nperspective for the first time. Despite the gradual alignment of camera-based\nmodels with LiDAR in term of accuracy, a significant reliability gap persists.\nTo addresses this concern, we propose ReliOcc, a method designed to enhance the\nreliability of camera-based occupancy networks. ReliOcc provides a\nplug-and-play scheme for existing models, which integrates hybrid uncertainty\nfrom individual voxels with sampling-based noise and relative voxels through\nmix-up learning. Besides, an uncertainty-aware calibration strategy is devised\nto further enhance model reliability in offline mode. Extensive experiments\nunder various settings demonstrate that ReliOcc significantly enhances model\nreliability while maintaining the accuracy of both geometric and semantic\npredictions. Importantly, our proposed approach exhibits robustness to sensor\nfailures and out of domain noises during inference.\n","authors":["Song Wang","Zhongdao Wang","Jiawei Yu","Wentong Li","Bailan Feng","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.18026v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2409.18009v1","updated":"2024-09-26T16:19:37Z","published":"2024-09-26T16:19:37Z","title":"Control Industrial Automation System with Large Language Models","summary":" Traditional industrial automation systems require specialized expertise to\noperate and complex reprogramming to adapt to new processes. Large language\nmodels offer the intelligence to make them more flexible and easier to use.\nHowever, LLMs' application in industrial settings is underexplored. This paper\nintroduces a framework for integrating LLMs to achieve end-to-end control of\nindustrial automation systems. At the core of the framework are an agent system\ndesigned for industrial tasks, a structured prompting method, and an\nevent-driven information modeling mechanism that provides real-time data for\nLLM inference. The framework supplies LLMs with real-time events on different\ncontext semantic levels, allowing them to interpret the information, generate\nproduction plans, and control operations on the automation system. It also\nsupports structured dataset creation for fine-tuning on this downstream\napplication of LLMs. Our contribution includes a formal system design,\nproof-of-concept implementation, and a method for generating task-specific\ndatasets for LLM fine-tuning and testing. This approach enables a more adaptive\nautomation system that can respond to spontaneous events, while allowing easier\noperation and configuration through natural language for more intuitive\nhuman-machine interaction. We provide demo videos and detailed data on GitHub:\nhttps://github.com/YuchenXia/LLM4IAS\n","authors":["Yuchen Xia","Nasser Jazdi","Jize Zhang","Chaitanya Shah","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2409.18009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08113v3","updated":"2024-09-26T16:14:54Z","published":"2024-06-12T11:50:51Z","title":"Valeo4Cast: A Modular Approach to End-to-End Forecasting","summary":" Motion forecasting is crucial in autonomous driving systems to anticipate the\nfuture trajectories of surrounding agents such as pedestrians, vehicles, and\ntraffic signals. In end-to-end forecasting, the model must jointly detect and\ntrack from sensor data (cameras or LiDARs) the past trajectories of the\ndifferent elements of the scene and predict their future locations. We depart\nfrom the current trend of tackling this task via end-to-end training from\nperception to forecasting, and instead use a modular approach. We individually\nbuild and train detection, tracking and forecasting modules. We then only use\nconsecutive finetuning steps to integrate the modules better and alleviate\ncompounding errors. We conduct an in-depth study on the finetuning strategies\nand it reveals that our simple yet effective approach significantly improves\nperformance on the end-to-end forecasting benchmark. Consequently, our solution\nranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82\nmAPf. We surpass forecasting results by +17.1 points over last year's winner\nand by +13.3 points over this year's runner-up. This remarkable performance in\nforecasting can be explained by our modular paradigm, which integrates\nfinetuning strategies and significantly outperforms the end-to-end-trained\ncounterparts. The code, model weights and results are made available\nhttps://github.com/valeoai/valeo4cast.\n","authors":["Yihong Xu","Éloi Zablocki","Alexandre Boulch","Gilles Puy","Mickael Chen","Florent Bartoccioni","Nermin Samet","Oriane Siméoni","Spyros Gidaris","Tuan-Hung Vu","Andrei Bursuc","Eduardo Valle","Renaud Marlet","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2406.08113v3.pdf","comment":"Winning solution of the Argoverse 2 \"Unified Detection, Tracking, and\n Forecasting\" challenge; work accepted at Road++ ECCVW 2024"},{"id":"http://arxiv.org/abs/2409.17995v1","updated":"2024-09-26T16:07:20Z","published":"2024-09-26T16:07:20Z","title":"Joint Localization and Planning using Diffusion","summary":" Diffusion models have been successfully applied to robotics problems such as\nmanipulation and vehicle path planning. In this work, we explore their\napplication to end-to-end navigation -- including both perception and planning\n-- by considering the problem of jointly performing global localization and\npath planning in known but arbitrary 2D environments. In particular, we\nintroduce a diffusion model which produces collision-free paths in a global\nreference frame given an egocentric LIDAR scan, an arbitrary map, and a desired\ngoal position. To this end, we implement diffusion in the space of paths in\nSE(2), and describe how to condition the denoising process on both obstacles\nand sensor observations. In our evaluation, we show that the proposed\nconditioning techniques enable generalization to realistic maps of considerably\ndifferent appearance than the training environment, demonstrate our model's\nability to accurately describe ambiguous solutions, and run extensive\nsimulation experiments showcasing our model's use as a real-time, end-to-end\nlocalization and planning stack.\n","authors":["L. Lao Beyer","S. Karaman"],"pdf_url":"https://arxiv.org/pdf/2409.17995v1.pdf","comment":"7 pages, 9 figures. Submitted to ICRA 2025, under review"},{"id":"http://arxiv.org/abs/2409.17992v1","updated":"2024-09-26T16:02:25Z","published":"2024-09-26T16:02:25Z","title":"LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged\n Robots","summary":" Reinforcement Learning (RL) has shown its remarkable and generalizable\ncapability in legged locomotion through sim-to-real transfer. However, while\nadaptive methods like domain randomization are expected to make policy more\nrobust to diverse environments, such comprehensiveness potentially detracts\nfrom the policy's performance in any specific environment according to the No\nFree Lunch theorem, leading to a suboptimal solution once deployed in the real\nworld. To address this issue, we propose a lifelong policy adaptation framework\nnamed LoopSR, which utilizes a transformer-based encoder to project real-world\ntrajectories into a latent space, and accordingly reconstruct the real-world\nenvironments back in simulation for further improvement. Autoencoder\narchitecture and contrastive learning methods are adopted to better extract the\ncharacteristics of real-world dynamics. The simulation parameters for continual\ntraining are derived by combining predicted parameters from the decoder with\nretrieved parameters from the simulation trajectory dataset. By leveraging the\ncontinual training, LoopSR achieves superior data efficiency compared with\nstrong baselines, with only a limited amount of data to yield eminent\nperformance in both sim-to-sim and sim-to-real experiments.\n","authors":["Peilin Wu","Weiji Xie","Jiahang Cao","Hang Lai","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17992v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.17988v1","updated":"2024-09-26T15:57:20Z","published":"2024-09-26T15:57:20Z","title":"Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or\n Low-light Conditions","summary":" The stark contrast in the design philosophy of an event camera makes it\nparticularly ideal for operating under high-speed, high dynamic range and\nlow-light conditions, where standard cameras underperform. Nonetheless, event\ncameras still suffer from some amount of motion blur, especially under these\nchallenging conditions, in contrary to what most think. This is attributed to\nthe limited bandwidth of the event sensor pixel, which is mostly proportional\nto the light intensity. Thus, to ensure that event cameras can truly excel in\nsuch conditions where it has an edge over standard cameras, it is crucial to\naccount for event motion blur in downstream applications, especially\nreconstruction. However, none of the recent works on reconstructing Neural\nRadiance Fields (NeRFs) from events, nor event simulators, have considered the\nfull effects of event motion blur. To this end, we propose, Deblur e-NeRF, a\nnovel method to directly and effectively reconstruct blur-minimal NeRFs from\nmotion-blurred events generated under high-speed motion or low-light\nconditions. The core component of this work is a physically-accurate pixel\nbandwidth model proposed to account for event motion blur under arbitrary speed\nand lighting conditions. We also introduce a novel threshold-normalized total\nvariation loss to improve the regularization of large textureless patches.\nExperiments on real and novel realistically simulated sequences verify our\neffectiveness. Our code, event simulator and synthetic event dataset will be\nopen-sourced.\n","authors":["Weng Fei Low","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17988v1.pdf","comment":"Accepted to ECCV 2024. Project website is accessible at\n https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with\n arXiv:2006.07722 by other authors"},{"id":"http://arxiv.org/abs/2312.14950v2","updated":"2024-09-26T15:45:13Z","published":"2023-12-08T15:57:18Z","title":"TypeFly: Flying Drones with Large Language Model","summary":" Recent advancements in robot control using large language models (LLMs) have\ndemonstrated significant potential, primarily due to LLMs' capabilities to\nunderstand natural language commands and generate executable plans in various\nlanguages. However, in real-time and interactive applications involving mobile\nrobots, particularly drones, the sequential token generation process inherent\nto LLMs introduces substantial latency, i.e. response time, in control plan\ngeneration.\n In this paper, we present a system called ChatFly that tackles this problem\nusing a combination of a novel programming language called MiniSpec and its\nruntime to reduce the plan generation time and drone response time. That is,\ninstead of asking an LLM to write a program (robotic plan) in the popular but\nverbose Python, ChatFly gets it to do it in MiniSpec specially designed for\ntoken efficiency and stream interpretation. Using a set of challenging drone\ntasks, we show that design choices made by ChatFly can reduce up to 62%\nresponse time and provide a more consistent user experience, enabling\nresponsive and intelligent LLM-based drone control with efficient completion.\n","authors":["Guojun Chen","Xiaojing Yu","Neiwen Ling","Lin Zhong"],"pdf_url":"https://arxiv.org/pdf/2312.14950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14115v4","updated":"2024-09-26T15:30:00Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Visual Question Answering for Autonomous Driving","summary":" We introduce LingoQA, a novel dataset and benchmark for visual question\nanswering in autonomous driving. The dataset contains 28K unique short video\nscenarios, and 419K annotations. Evaluating state-of-the-art vision-language\nmodels on our benchmark shows that their performance is below human\ncapabilities, with GPT-4V responding truthfully to 59.6% of the questions\ncompared to 96.6% for humans. For evaluation, we propose a truthfulness\nclassifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation\ncoefficient to human evaluations, surpassing existing techniques like METEOR,\nBLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run\nextensive ablation studies to understand its performance. We release our\ndataset and benchmark as an evaluation platform for vision-language models in\nautonomous driving.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v4.pdf","comment":"Accepted to ECCV 2024. Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2409.17896v1","updated":"2024-09-26T14:47:14Z","published":"2024-09-26T14:47:14Z","title":"Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV\n Attitude Control Under Varying Wind Conditions","summary":" This paper evaluates and compares the performance of model-free and\nmodel-based reinforcement learning for the attitude control of fixed-wing\nunmanned aerial vehicles using PID as a reference point. The comparison focuses\non their ability to handle varying flight dynamics and wind disturbances in a\nsimulated environment. Our results show that the Temporal Difference Model\nPredictive Control agent outperforms both the PID controller and other\nmodel-free reinforcement learning methods in terms of tracking accuracy and\nrobustness over different reference difficulties, particularly in nonlinear\nflight regimes. Furthermore, we introduce actuation fluctuation as a key metric\nto assess energy efficiency and actuator wear, and we test two different\napproaches from the literature: action variation penalty and conditioning for\naction policy smoothness. We also evaluate all control methods when subject to\nstochastic turbulence and gusts separately, so as to measure their effects on\ntracking performance, observe their limitations and outline their implications\non the Markov decision process formalism.\n","authors":["David Olivares","Pierre Fournier","Pavan Vasishta","Julien Marzat"],"pdf_url":"https://arxiv.org/pdf/2409.17896v1.pdf","comment":"Published at ICINCO 2024"},{"id":"http://arxiv.org/abs/2404.00769v2","updated":"2024-09-26T14:18:03Z","published":"2024-03-31T18:51:52Z","title":"An Active Perception Game for Robust Information Gathering","summary":" Active perception approaches select future viewpoints by using some estimate\nof the information gain. An inaccurate estimate can be detrimental in critical\nsituations, e.g., locating a person in distress. However the true information\ngained can only be calculated post hoc, i.e., after the observation is\nrealized. We present an approach for estimating the discrepancy between the\ninformation gain (which is the average over putative future observations) and\nthe true information gain. The key idea is to analyze the mathematical\nrelationship between active perception and the estimation error of the\ninformation gain in a game-theoretic setting. Using this, we develop an online\nestimation approach that achieves sub-linear regret (in the number of\ntime-steps) for the estimation of the true information gain and reduces the\nsub-optimality of active perception systems.\n We demonstrate our approach for active perception using a comprehensive set\nof experiments on: (a) different types of environments, including a quadrotor\nin a photorealistic simulation, real-world robotic data, and real-world\nexperiments with ground robots exploring indoor and outdoor scenes; (b)\ndifferent types of robotic perception data; and (c) different map\nrepresentations. On average, our approach reduces information gain estimation\nerrors by 42%, increases the information gain by 7%, PSNR by 5%, and semantic\naccuracy (measured as the number of objects that are localized correctly) by\n6%. In real-world experiments with a Jackal ground robot, our approach\ndemonstrated complex trajectories to explore occluded regions.\n","authors":["Siming He","Yuezhan Tao","Igor Spasojevic","Vijay Kumar","Pratik Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2404.00769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04693v2","updated":"2024-09-26T13:53:33Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A Colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, fusing data from these two types\nof sensors is poorly performed in many existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v2.pdf","comment":"2024 IEEE International Conference on Robotics and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17798v1","updated":"2024-09-26T12:47:36Z","published":"2024-09-26T12:47:36Z","title":"Swarm-LIO2: Decentralized, Efficient LiDAR-inertial Odometry for UAV\n Swarms","summary":" Aerial swarm systems possess immense potential in various aspects, such as\ncooperative exploration, target tracking, search and rescue. Efficient,\naccurate self and mutual state estimation are the critical preconditions for\ncompleting these swarm tasks, which remain challenging research topics. This\npaper proposes Swarm-LIO2: a fully decentralized, plug-and-play,\ncomputationally efficient, and bandwidth-efficient LiDAR-inertial odometry for\naerial swarm systems. Swarm-LIO2 uses a decentralized, plug-and-play network as\nthe communication infrastructure. Only bandwidth-efficient and low-dimensional\ninformation is exchanged, including identity, ego-state, mutual observation\nmeasurements, and global extrinsic transformations. To support the\nplug-and-play of new teammate participants, Swarm-LIO2 detects potential\nteammate UAVs and initializes the temporal offset and global extrinsic\ntransformation all automatically. To enhance the initialization efficiency,\nnovel reflectivity-based UAV detection, trajectory matching, and factor graph\noptimization methods are proposed. For state estimation, Swarm-LIO2 fuses\nLiDAR, IMU, and mutual observation measurements within an efficient ESIKF\nframework, with careful compensation of temporal delay and modeling of\nmeasurements to enhance the accuracy and consistency.\n","authors":["Fangcheng Zhu","Yunfan Ren","Longji Yin","Fanze Kong","Qingbo Liu","Ruize Xue","Wenyi Liu","Yixi Cai","Guozheng Lu","Haotian Li","Fu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17798v1.pdf","comment":"23 Pages"},{"id":"http://arxiv.org/abs/2404.06926v2","updated":"2024-09-26T12:31:23Z","published":"2024-04-10T11:24:34Z","title":"Gaussian-LIC: Real-Time Photo-Realistic SLAM with Gaussian Splatting and\n LiDAR-Inertial-Camera Fusion","summary":" In this paper, we present a real-time photo-realistic SLAM method based on\nmarrying Gaussian Splatting with LiDAR-Inertial-Camera SLAM. Most existing\nradiance-field-based SLAM systems mainly focus on bounded indoor environments,\nequipped with RGB-D or RGB sensors. However, they are prone to decline when\nexpanding to unbounded scenes or encountering adverse conditions, such as\nviolent motions and changing illumination. In contrast, oriented to general\nscenarios, our approach additionally tightly fuses LiDAR, IMU, and camera for\nrobust pose estimation and photo-realistic online mapping. To compensate for\nregions unobserved by the LiDAR, we propose to integrate both the triangulated\nvisual points from images and LiDAR points for initializing 3D Gaussians. In\naddition, the modeling of the sky and varying camera exposure have been\nrealized for high-quality rendering. Notably, we implement our system purely\nwith C++ and CUDA, and meticulously design a series of strategies to accelerate\nthe online optimization of the Gaussian-based scene representation. Extensive\nexperiments demonstrate that our method outperforms its counterparts while\nmaintaining real-time capability. Impressively, regarding photo-realistic\nmapping, our method with our estimated poses even surpasses all the compared\napproaches that utilize privileged ground-truth poses for mapping. Our code\nwill be released on project page https://xingxingzuo.github.io/gaussian_lic.\n","authors":["Xiaolei Lang","Laijian Li","Chenming Wu","Chen Zhao","Lina Liu","Yong Liu","Jiajun Lv","Xingxing Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.06926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07865v4","updated":"2024-09-26T12:18:49Z","published":"2024-05-13T15:53:18Z","title":"AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous\n Driving","summary":" The scale-up of autonomous vehicles depends heavily on their ability to deal\nwith anomalies, such as rare objects on the road. In order to handle such\nsituations, it is necessary to detect anomalies in the first place. Anomaly\ndetection for autonomous driving has made great progress in the past years but\nsuffers from poorly designed benchmarks with a strong focus on camera data. In\nthis work, we propose AnoVox, the largest benchmark for ANOmaly detection in\nautonomous driving to date. AnoVox incorporates large-scale multimodal sensor\ndata and spatial VOXel ground truth, allowing for the comparison of methods\nindependent of their used sensor. We propose a formal definition of normality\nand provide a compliant training dataset. AnoVox is the first benchmark to\ncontain both content and temporal anomalies.\n","authors":["Daniel Bogdoll","Iramm Hamdard","Lukas Namgyu Rößler","Felix Geisler","Muhammed Bayram","Felix Wang","Jan Imhof","Miguel de Campos","Anushervon Tabarov","Yitian Yang","Hanno Gottschalk","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2405.07865v4.pdf","comment":"Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\\\"o{\\ss}ler\n contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop"},{"id":"http://arxiv.org/abs/2409.17755v1","updated":"2024-09-26T11:40:07Z","published":"2024-09-26T11:40:07Z","title":"SECURE: Semantics-aware Embodied Conversation under Unawareness for\n Lifelong Robot Learning","summary":" This paper addresses a challenging interactive task learning scenario we call\nrearrangement under unawareness: to manipulate a rigid-body environment in a\ncontext where the robot is unaware of a concept that's key to solving the\ninstructed task. We propose SECURE, an interactive task learning framework\ndesigned to solve such problems by fixing a deficient domain model using\nembodied conversation. Through dialogue, the robot discovers and then learns to\nexploit unforeseen possibilities. Using SECURE, the robot not only learns from\nthe user's corrective feedback when it makes a mistake, but it also learns to\nmake strategic dialogue decisions for revealing useful evidence about novel\nconcepts for solving the instructed task. Together, these abilities allow the\nrobot to generalise to subsequent tasks using newly acquired knowledge. We\ndemonstrate that a robot that is semantics-aware -- that is, it exploits the\nlogical consequences of both sentence and discourse semantics in the learning\nand inference process -- learns to solve rearrangement under unawareness more\neffectively than a robot that lacks such capabilities.\n","authors":["Rimvydas Rubavicius","Peter David Fagan","Alex Lascarides","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.17755v1.pdf","comment":"10 pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.10759v2","updated":"2024-09-26T11:13:10Z","published":"2024-06-15T23:21:10Z","title":"Humanoid Parkour Learning","summary":" Parkour is a grand challenge for legged locomotion, even for quadruped\nrobots, requiring active perception and various maneuvers to overcome multiple\nchallenging obstacles. Existing methods for humanoid locomotion either optimize\na trajectory for a single parkour track or train a reinforcement learning\npolicy only to walk with a significant amount of motion references. In this\nwork, we propose a framework for learning an end-to-end vision-based\nwhole-body-control parkour policy for humanoid robots that overcomes multiple\nparkour skills without any motion prior. Using the parkour policy, the humanoid\nrobot can jump on a 0.42m platform, leap over hurdles, 0.8m gaps, and much\nmore. It can also run at 1.8m/s in the wild and walk robustly on different\nterrains. We test our policy in indoor and outdoor environments to demonstrate\nthat it can autonomously select parkour skills while following the rotation\ncommand of the joystick. We override the arm actions and show that this\nframework can easily transfer to humanoid mobile manipulation tasks. Videos can\nbe found at https://humanoid4parkour.github.io\n","authors":["Ziwen Zhuang","Shenzhe Yao","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.10759v2.pdf","comment":"Published on CoRL 2024"},{"id":"http://arxiv.org/abs/2409.17731v1","updated":"2024-09-26T11:01:00Z","published":"2024-09-26T11:01:00Z","title":"Robust Ladder Climbing with a Quadrupedal Robot","summary":" Quadruped robots are proliferating in industrial environments where they\ncarry sensor suites and serve as autonomous inspection platforms. Despite the\nadvantages of legged robots over their wheeled counterparts on rough and uneven\nterrain, they are still yet to be able to reliably negotiate ubiquitous\nfeatures of industrial infrastructure: ladders. Inability to traverse ladders\nprevents quadrupeds from inspecting dangerous locations, puts humans in harm's\nway, and reduces industrial site productivity. In this paper, we learn\nquadrupedal ladder climbing via a reinforcement learning-based control policy\nand a complementary hooked end-effector. We evaluate the robustness in\nsimulation across different ladder inclinations, rung geometries, and\ninter-rung spacings. On hardware, we demonstrate zero-shot transfer with an\noverall 90% success rate at ladder angles ranging from 70{\\deg} to 90{\\deg},\nconsistent climbing performance during unmodeled perturbations, and climbing\nspeeds 232x faster than the state of the art. This work expands the scope of\nindustrial quadruped robot applications beyond inspection on nominal terrains\nto challenging infrastructural features in the environment, highlighting\nsynergies between robot morphology and control policy when performing complex\nskills. More information can be found at the project website:\nhttps://sites.google.com/leggedrobotics.com/climbingladders.\n","authors":["Dylan Vogel","Robert Baines","Joseph Church","Julian Lotzer","Karl Werner","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2409.17731v1.pdf","comment":"Project website:\n https://sites.google.com/leggedrobotics.com/climbingladders"},{"id":"http://arxiv.org/abs/2409.17727v1","updated":"2024-09-26T10:56:35Z","published":"2024-09-26T10:56:35Z","title":"Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications","summary":" Vision language models have played a key role in extracting meaningful\nfeatures for various robotic applications. Among these, Contrastive\nLanguage-Image Pretraining (CLIP) is widely used in robotic tasks that require\nboth vision and natural language understanding. However, CLIP was trained\nsolely on static images paired with text prompts and has not yet been fully\nadapted for robotic tasks involving dynamic actions. In this paper, we\nintroduce Robotic-CLIP to enhance robotic perception capabilities. We first\ngather and label large-scale action data, and then build our Robotic-CLIP by\nfine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using\ncontrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's\nstrong image performance while gaining the ability to understand actions in\nrobotic contexts. Intensive experiments show that our Robotic-CLIP outperforms\nother CLIP-based models across various language-driven robotic tasks.\nAdditionally, we demonstrate the practical effectiveness of Robotic-CLIP in\nreal-world grasping applications.\n","authors":["Nghia Nguyen","Minh Nhat Vu","Tung D. Ta","Baoru Huang","Thieu Vo","Ngan Le","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.17727v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.17725v1","updated":"2024-09-26T10:55:07Z","published":"2024-09-26T10:55:07Z","title":"Stable Object Placement Under Geometric Uncertainty via Differentiable\n Contact Dynamics","summary":" From serving a cup of coffee to carefully rearranging delicate items, stable\nobject placement is a crucial skill for future robots. This skill is\nchallenging due to the required accuracy, which is difficult to achieve under\ngeometric uncertainty. We leverage differentiable contact dynamics to develop a\nprincipled method for stable object placement under geometric uncertainty. We\nestimate the geometric uncertainty by minimizing the discrepancy between the\nforce-torque sensor readings and the model predictions through gradient\ndescent. We further keep track of a belief over multiple possible geometric\nparameters to mitigate the gradient-based method's sensitivity to the\ninitialization. We verify our approach in the real world on various geometric\nuncertainties, including the in-hand pose uncertainty of the grasped object,\nthe object's shape uncertainty, and the environment's shape uncertainty.\n","authors":["Linfeng Li","Gang Yang","Lin Shao","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2409.17725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08160v2","updated":"2024-09-26T10:54:32Z","published":"2024-08-15T13:49:14Z","title":"General-purpose Clothes Manipulation with Semantic Keypoints","summary":" Clothes manipulation is a critical skill for household robots. Recent\nadvancements have been made in task-specific clothes manipulation, such as\nfolding, flattening, and hanging. However, due to clothes' complex geometries\nand deformability, creating a general-purpose robot system that can manipulate\na diverse range of clothes in many ways remains challenging. Since clothes are\ntypically designed with specific structures, we propose identifying these\nspecific features like ``left sleeve'' as semantic keypoints. Semantic\nkeypoints can provide semantic cues for task planning and geometric cues for\nlow-level action generation. With this insight, we develop a hierarchical\nlearning framework using the large language model (LLM) for general-purpose\nCLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation\nexperiments show that CLASP outperforms baseline methods on both seen and\nunseen tasks across various clothes manipulation tasks. Real-world experiments\nshow that CLASP can be directly deployed in the real world and applied to a\nwide variety of clothes.\n","authors":["Yuhong Deng","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.08160v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17720v1","updated":"2024-09-26T10:43:09Z","published":"2024-09-26T10:43:09Z","title":"Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations\n Between Initial and Final Scenes","summary":" With robots increasingly collaborating with humans in everyday tasks, it is\nimportant to take steps toward robotic systems capable of understanding the\nenvironment. This work focuses on scene understanding to detect pick and place\ntasks given initial and final images from the scene. To this end, a dataset is\ncollected for object detection and pick and place task detection. A YOLOv5\nnetwork is subsequently trained to detect the objects in the initial and final\nscenes. Given the detected objects and their bounding boxes, two methods are\nproposed to detect the pick and place tasks which transform the initial scene\ninto the final scene. A geometric method is proposed which tracks objects'\nmovements in the two scenes and works based on the intersection of the bounding\nboxes which moved within scenes. Contrarily, the CNN-based method utilizes a\nConvolutional Neural Network to classify objects with intersected bounding\nboxes into 5 classes, showing the spatial relationship between the involved\nobjects. The performed pick and place tasks are then derived from analyzing the\nexperiments with both scenes. Results show that the CNN-based method, using a\nVGG16 backbone, outscores the geometric method by roughly 12 percentage points\nin certain scenarios, with an overall success rate of 84.3%.\n","authors":["Seraj Ghasemi","Hamed Hosseini","MohammadHossein Koosheshi","Mehdi Tale Masouleh","Ahmad Kalhor"],"pdf_url":"https://arxiv.org/pdf/2409.17720v1.pdf","comment":"Conference Paper, ICEE 2024, 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17702v1","updated":"2024-09-26T10:16:08Z","published":"2024-09-26T10:16:08Z","title":"Episodic Memory Verbalization using Hierarchical Representations of\n Life-Long Robot Experience","summary":" Verbalization of robot experience, i.e., summarization of and question\nanswering about a robot's past, is a crucial ability for improving human-robot\ninteraction. Previous works applied rule-based systems or fine-tuned deep\nmodels to verbalize short (several-minute-long) streams of episodic data,\nlimiting generalization and transferability. In our work, we apply large\npretrained models to tackle this task with zero or few examples, and\nspecifically focus on verbalizing life-long experiences. For this, we derive a\ntree-like data structure from episodic memory (EM), with lower levels\nrepresenting raw perception and proprioception data, and higher levels\nabstracting events to natural language concepts. Given such a hierarchical\nrepresentation built from the experience stream, we apply a large language\nmodel as an agent to interactively search the EM given a user's query,\ndynamically expanding (initially collapsed) tree nodes to find the relevant\ninformation. The approach keeps computational costs low even when scaling to\nmonths of robot experience data. We evaluate our method on simulated household\nrobot data, human egocentric videos, and real-world robot recordings,\ndemonstrating its flexibility and scalability.\n","authors":["Leonard Bärmann","Chad DeChant","Joana Plewnia","Fabian Peller-Konrad","Daniel Bauer","Tamim Asfour","Alex Waibel"],"pdf_url":"https://arxiv.org/pdf/2409.17702v1.pdf","comment":"Code, data and demo videos at https://hierarchical-emv.github.io"},{"id":"http://arxiv.org/abs/2409.17680v1","updated":"2024-09-26T09:43:50Z","published":"2024-09-26T09:43:50Z","title":"Event-based Stereo Depth Estimation: A Survey","summary":" Stereopsis has widespread appeal in robotics as it is the predominant way by\nwhich living beings perceive depth to navigate our 3D world. Event cameras are\nnovel bio-inspired sensors that detect per-pixel brightness changes\nasynchronously, with very high temporal resolution and high dynamic range,\nenabling machine perception in high-speed motion and broad illumination\nconditions. The high temporal precision also benefits stereo matching, making\ndisparity (depth) estimation a popular research area for event cameras ever\nsince its inception. Over the last 30 years, the field has evolved rapidly,\nfrom low-latency, low-power circuit design to current deep learning (DL)\napproaches driven by the computer vision community. The bibliography is vast\nand difficult to navigate for non-experts due its highly interdisciplinary\nnature. Past surveys have addressed distinct aspects of this topic, in the\ncontext of applications, or focusing only on a specific class of techniques,\nbut have overlooked stereo datasets. This survey provides a comprehensive\noverview, covering both instantaneous stereo and long-term methods suitable for\nsimultaneous localization and mapping (SLAM), along with theoretical and\nempirical comparisons. It is the first to extensively review DL methods as well\nas stereo datasets, even providing practical suggestions for creating new\nbenchmarks to advance the field. The main advantages and challenges faced by\nevent-based stereo depth estimation are also discussed. Despite significant\nprogress, challenges remain in achieving optimal performance in not only\naccuracy but also efficiency, a cornerstone of event-based computing. We\nidentify several gaps and propose future research directions. We hope this\nsurvey inspires future research in this area, by serving as an accessible entry\npoint for newcomers, as well as a practical guide for seasoned researchers in\nthe community.\n","authors":["Suman Ghosh","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2409.17680v1.pdf","comment":"28 pages, 20 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.17655v1","updated":"2024-09-26T09:06:56Z","published":"2024-09-26T09:06:56Z","title":"AssistantX: An LLM-Powered Proactive Assistant in Collaborative\n Human-Populated Environment","summary":" The increasing demand for intelligent assistants in human-populated\nenvironments has motivated significant research in autonomous robotic systems.\nTraditional service robots and virtual assistants, however, struggle with\nreal-world task execution due to their limited capacity for dynamic reasoning\nand interaction, particularly when human collaboration is required. Recent\ndevelopments in Large Language Models have opened new avenues for improving\nthese systems, enabling more sophisticated reasoning and natural interaction\ncapabilities. In this paper, we introduce AssistantX, an LLM-powered proactive\nassistant designed to operate autonomously in a physical office environment.\nUnlike conventional service robots, AssistantX leverages a novel multi-agent\narchitecture, PPDR4X, which provides advanced inference capabilities and\ncomprehensive collaboration awareness. By effectively bridging the gap between\nvirtual operations and physical interactions, AssistantX demonstrates robust\nperformance in managing complex real-world scenarios. Our evaluation highlights\nthe architecture's effectiveness, showing that AssistantX can respond to clear\ninstructions, actively retrieve supplementary information from memory, and\nproactively seek collaboration from team members to ensure successful task\ncompletion. More details and videos can be found at\nhttps://assistantx-agent.github.io/AssistantX/.\n","authors":["Nan Sun","Bo Mao","Yongchang Li","Lumeng Ma","Di Guo","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17655v1.pdf","comment":"6 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2409.17652v1","updated":"2024-09-26T09:00:30Z","published":"2024-09-26T09:00:30Z","title":"FactorSim: Generative Simulation via Factorized Representation","summary":" Generating simulations to train intelligent agents in game-playing and\nrobotics from natural language input, from user input or task documentation,\nremains an open-ended challenge. Existing approaches focus on parts of this\nchallenge, such as generating reward functions or task hyperparameters. Unlike\nprevious work, we introduce FACTORSIM that generates full simulations in code\nfrom language input that can be used to train agents. Exploiting the structural\nmodularity specific to coded simulations, we propose to use a factored\npartially observable Markov decision process representation that allows us to\nreduce context dependence during each step of the generation. For evaluation,\nwe introduce a generative simulation benchmark that assesses the generated\nsimulation code's accuracy and effectiveness in facilitating zero-shot\ntransfers in reinforcement learning settings. We show that FACTORSIM\noutperforms existing methods in generating simulations regarding prompt\nalignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation.\nWe also demonstrate its effectiveness in generating robotic tasks.\n","authors":["Fan-Yun Sun","S. I. Harini","Angela Yi","Yihan Zhou","Alex Zook","Jonathan Tremblay","Logan Cross","Jiajun Wu","Nick Haber"],"pdf_url":"https://arxiv.org/pdf/2409.17652v1.pdf","comment":"neurips 2024, project website:\n https://cs.stanford.edu/~sunfanyun/factorsim/"},{"id":"http://arxiv.org/abs/2409.17641v1","updated":"2024-09-26T08:44:49Z","published":"2024-09-26T08:44:49Z","title":"AP-VLM: Active Perception Enabled by Vision-Language Models","summary":" Active perception enables robots to dynamically gather information by\nadjusting their viewpoints, a crucial capability for interacting with complex,\npartially observable environments. In this paper, we present AP-VLM, a novel\nframework that combines active perception with a Vision-Language Model (VLM) to\nguide robotic exploration and answer semantic queries. Using a 3D virtual grid\noverlaid on the scene and orientation adjustments, AP-VLM allows a robotic\nmanipulator to intelligently select optimal viewpoints and orientations to\nresolve challenging tasks, such as identifying objects in occluded or inclined\npositions. We evaluate our system on two robotic platforms: a 7-DOF Franka\nPanda and a 6-DOF UR5, across various scenes with differing object\nconfigurations. Our results demonstrate that AP-VLM significantly outperforms\npassive perception methods and baseline models, including Toward Grounded\nCommon Sense Reasoning (TGCSR), particularly in scenarios where fixed camera\nviews are inadequate. The adaptability of AP-VLM in real-world settings shows\npromise for enhancing robotic systems' understanding of complex environments,\nbridging the gap between high-level semantic reasoning and low-level control.\n","authors":["Venkatesh Sripada","Samuel Carter","Frank Guerin","Amir Ghalamzan"],"pdf_url":"https://arxiv.org/pdf/2409.17641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17630v1","updated":"2024-09-26T08:25:05Z","published":"2024-09-26T08:25:05Z","title":"System-Level Safety Monitoring and Recovery for Perception Failures in\n Autonomous Vehicles","summary":" The safety-critical nature of autonomous vehicle (AV) operation necessitates\ndevelopment of task-relevant algorithms that can reason about safety at the\nsystem level and not just at the component level. To reason about the impact of\na perception failure on the entire system performance, such task-relevant\nalgorithms must contend with various challenges: complexity of AV stacks, high\nuncertainty in the operating environments, and the need for real-time\nperformance. To overcome these challenges, in this work, we introduce a\nQ-network called SPARQ (abbreviation for Safety evaluation for Perception And\nRecovery Q-network) that evaluates the safety of a plan generated by a planning\nalgorithm, accounting for perception failures that the planning process may\nhave overlooked. This Q-network can be queried during system runtime to assess\nwhether a proposed plan is safe for execution or poses potential safety risks.\nIf a violation is detected, the network can then recommend a corrective plan\nwhile accounting for the perceptual failure. We validate our algorithm using\nthe NuPlan-Vegas dataset, demonstrating its ability to handle cases where a\nperception failure compromises a proposed plan while the corrective plan\nremains safe. We observe an overall accuracy and recall of 90% while sustaining\na frequency of 42Hz on the unseen testing dataset. We compare our performance\nto a popular reachability-based baseline and analyze some interesting\nproperties of our approach in improving the safety properties of an AV\npipeline.\n","authors":["Kaustav Chakraborty","Zeyuan Feng","Sushant Veer","Apoorva Sharma","Boris Ivanovic","Marco Pavone","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2409.17630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17624v1","updated":"2024-09-26T08:19:21Z","published":"2024-09-26T08:19:21Z","title":"HGS-Planner: Hierarchical Planning Framework for Active Scene\n Reconstruction Using 3D Gaussian Splatting","summary":" In complex missions such as search and rescue,robots must make intelligent\ndecisions in unknown environments, relying on their ability to perceive and\nunderstand their surroundings. High-quality and real-time reconstruction\nenhances situational awareness and is crucial for intelligent robotics.\nTraditional methods often struggle with poor scene representation or are too\nslow for real-time use. Inspired by the efficacy of 3D Gaussian Splatting\n(3DGS), we propose a hierarchical planning framework for fast and high-fidelity\nactive reconstruction. Our method evaluates completion and quality gain to\nadaptively guide reconstruction, integrating global and local planning for\nefficiency. Experiments in simulated and real-world environments show our\napproach outperforms existing real-time methods.\n","authors":["Zijun Xu","Rui Jin","Ke Wu","Yi Zhao","Zhiwei Zhang","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.17624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17621v1","updated":"2024-09-26T08:16:53Z","published":"2024-09-26T08:16:53Z","title":"Leveraging Semantic and Geometric Information for Zero-Shot\n Robot-to-Human Handover","summary":" Human-robot interaction (HRI) encompasses a wide range of collaborative\ntasks, with handover being one of the most fundamental. As robots become more\nintegrated into human environments, the potential for service robots to assist\nin handing objects to humans is increasingly promising. In robot-to-human (R2H)\nhandover, selecting the optimal grasp is crucial for success, as it requires\navoiding interference with the humans preferred grasp region and minimizing\nintrusion into their workspace. Existing methods either inadequately consider\ngeometric information or rely on data-driven approaches, which often struggle\nto generalize across diverse objects. To address these limitations, we propose\na novel zero-shot system that combines semantic and geometric information to\ngenerate optimal handover grasps. Our method first identifies grasp regions\nusing semantic knowledge from vision-language models (VLMs) and, by\nincorporating customized visual prompts, achieves finer granularity in region\ngrounding. A grasp is then selected based on grasp distance and approach angle\nto maximize human ease and avoid interference. We validate our approach through\nablation studies and real-world comparison experiments. Results demonstrate\nthat our system improves handover success rates and provides a more\nuser-preferred interaction experience. Videos, appendixes and more are\navailable at https://sites.google.com/view/vlm-handover/.\n","authors":["Jiangshan Liu","Wenlong Dong","Jiankun Wang","Max Q. -H. Meng"],"pdf_url":"https://arxiv.org/pdf/2409.17621v1.pdf","comment":"6 pages, 5 figures, conference"},{"id":"http://arxiv.org/abs/2409.17618v1","updated":"2024-09-26T08:10:28Z","published":"2024-09-26T08:10:28Z","title":"Learning Occlusion-aware Decision-making from Agent Interaction via\n Active Perception","summary":" Occlusion-aware decision-making is essential in autonomous driving due to the\nhigh uncertainty of various occlusions. Recent occlusion-aware decision-making\nmethods encounter issues such as high computational complexity, scenario\nscalability challenges, or reliance on limited expert data. Benefiting from\nautomatically generating data by exploration randomization, we uncover that\nreinforcement learning (RL) may show promise in occlusion-aware\ndecision-making. However, previous occlusion-aware RL faces challenges in\nexpanding to various dynamic and static occlusion scenarios, low learning\nefficiency, and lack of predictive ability. To address these issues, we\nintroduce Pad-AI, a self-reinforcing framework to learn occlusion-aware\ndecision-making through active perception. Pad-AI utilizes vectorized\nrepresentation to represent occluded environments efficiently and learns over\nthe semantic motion primitives to focus on high-level active perception\nexploration. Furthermore, Pad-AI integrates prediction and RL within a unified\nframework to provide risk-aware learning and security guarantees. Our framework\nwas tested in challenging scenarios under both dynamic and static occlusions\nand demonstrated efficient and general perception-aware exploration performance\nto other strong baselines in closed-loop evaluations.\n","authors":["Jie Jia","Yiming Shu","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.17618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17562v1","updated":"2024-09-26T06:21:52Z","published":"2024-09-26T06:21:52Z","title":"Software for the SpaceDREAM Robotic Arm","summary":" Impedance-controlled robots are widely used on Earth to perform\ninteraction-rich tasks and will be a key enabler for In-Space Servicing,\nAssembly and Manufacturing (ISAM) activities. This paper introduces the\nsoftware architecture used on the On-Board Computer (OBC) for the planned\nSpaceDREAM mission aiming to validate such robotic arm in Lower Earth Orbit\n(LEO) conducted by the German Aerospace Center (DLR) in cooperation with\nKINETIK Space GmbH and the Technical University of Munich (TUM). During the\nmission several free motion as well as contact tasks are to be performed in\norder to verify proper functionality of the robot in position and impedance\ncontrol on joint level as well as in cartesian control. The tasks are selected\nto be representative for subsequent servicing missions e.g. requiring interface\ndocking or precise manipulation.\n The software on the OBC commands the robot's joints via SpaceWire to perform\nthose mission tasks, reads camera images and data from additional sensors and\nsends telemetry data through an Ethernet link via the spacecraft down to Earth.\nIt is set up to execute a predefined mission after receiving a start signal\nfrom the spacecraft while it should be extendable to receive commands from\nEarth for later missions. Core design principle was to reuse as much existing\nsoftware and to stay as close as possible to existing robot software stacks at\nDLR. This allowed for a quick full operational start of the robot arm compared\nto a custom development of all robot software, a lower entry barrier for\nsoftware developers as well as a reuse of existing libraries. While not every\nline of code can be tested with this design, most of the software has already\nproven its functionality through daily execution on multiple robot systems.\n","authors":["Maximilian Mühlbauer","Maxime Chalon","Maximilian Ulmer","Alin Albu-Schäffer"],"pdf_url":"https://arxiv.org/pdf/2409.17562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02569v2","updated":"2024-09-26T05:57:37Z","published":"2024-04-03T08:42:36Z","title":"SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing","summary":" Cooking robots can enhance the home experience by reducing the burden of\ndaily chores. However, these robots must perform their tasks dexterously and\nsafely in shared human environments, especially when handling dangerous tools\nsuch as kitchen knives. This study focuses on enabling a robot to autonomously\nand safely learn food-cutting tasks. More specifically, our goal is to enable a\ncollaborative robot or industrial robot arm to perform food-slicing tasks by\nadapting to varying material properties using compliance control. Our approach\ninvolves using Reinforcement Learning (RL) to train a robot to compliantly\nmanipulate a knife, by reducing the contact forces exerted by the food items\nand by the cutting board. However, training the robot in the real world can be\ninefficient, and dangerous, and result in a lot of food waste. Therefore, we\nproposed SliceIt!, a framework for safely and efficiently learning robot\nfood-slicing tasks in simulation. Following a real2sim2real approach, our\nframework consists of collecting a few real food slicing data, calibrating our\ndual simulation environment (a high-fidelity cutting simulator and a robotic\nsimulator), learning compliant control policies on the calibrated simulation\nenvironment, and finally, deploying the policies on the real robot.\n","authors":["Cristian C. Beltran-Hernandez","Nicolas Erbetti","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2404.02569v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2406.14990v2","updated":"2024-09-26T05:51:20Z","published":"2024-06-21T09:03:37Z","title":"Learning Variable Compliance Control From a Few Demonstrations for\n Bimanual Robot with Haptic Feedback Teleoperation System","summary":" Automating dexterous, contact-rich manipulation tasks using rigid robots is a\nsignificant challenge in robotics. Rigid robots, defined by their actuation\nthrough position commands, face issues of excessive contact forces due to their\ninability to adapt to contact with the environment, potentially causing damage.\nWhile compliance control schemes have been introduced to mitigate these issues\nby controlling forces via external sensors, they are hampered by the need for\nfine-tuning task-specific controller parameters. Learning from Demonstrations\n(LfD) offers an intuitive alternative, allowing robots to learn manipulations\nthrough observed actions. In this work, we introduce a novel system to enhance\nthe teaching of dexterous, contact-rich manipulations to rigid robots. Our\nsystem is twofold: firstly, it incorporates a teleoperation interface utilizing\nVirtual Reality (VR) controllers, designed to provide an intuitive and\ncost-effective method for task demonstration with haptic feedback. Secondly, we\npresent Comp-ACT (Compliance Control via Action Chunking with Transformers), a\nmethod that leverages the demonstrations to learn variable compliance control\nfrom a few demonstrations. Our methods have been validated across various\ncomplex contact-rich manipulation tasks using single-arm and bimanual robot\nsetups in simulated and real-world environments, demonstrating the\neffectiveness of our system in teaching robots dexterous manipulations with\nenhanced adaptability and safety. Code available at:\nhttps://github.com/omron-sinicx/CompACT\n","authors":["Tatsuya Kamijo","Cristian C. Beltran-Hernandez","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2406.14990v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2409.17549v1","updated":"2024-09-26T05:37:52Z","published":"2024-09-26T05:37:52Z","title":"Canonical Representation and Force-Based Pretraining of 3D Tactile for\n Dexterous Visuo-Tactile Policy Learning","summary":" Tactile sensing plays a vital role in enabling robots to perform\nfine-grained, contact-rich tasks. However, the high dimensionality of tactile\ndata, due to the large coverage on dexterous hands, poses significant\nchallenges for effective tactile feature learning, especially for 3D tactile\ndata, as there are no large standardized datasets and no strong pretrained\nbackbones. To address these challenges, we propose a novel canonical\nrepresentation that reduces the difficulty of 3D tactile feature learning and\nfurther introduces a force-based self-supervised pretraining task to capture\nboth local and net force features, which are crucial for dexterous\nmanipulation. Our method achieves an average success rate of 78% across four\nfine-grained, contact-rich dexterous manipulation tasks in real-world\nexperiments, demonstrating effectiveness and robustness compared to other\nmethods. Further analysis shows that our method fully utilizes both spatial and\nforce information from 3D tactile data to accomplish the tasks. The videos can\nbe viewed at https://3dtacdex.github.io.\n","authors":["Tianhao Wu","Jinzhou Li","Jiyao Zhang","Mingdong Wu","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2409.17549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15780v3","updated":"2024-09-26T04:38:13Z","published":"2024-09-24T06:22:28Z","title":"A Learning Framework for Diverse Legged Robot Locomotion Using\n Barrier-Based Style Rewards","summary":" This work introduces a model-free reinforcement learning framework that\nenables various modes of motion (quadruped, tripod, or biped) and diverse tasks\nfor legged robot locomotion. We employ a motion-style reward based on a relaxed\nlogarithmic barrier function as a soft constraint, to bias the learning process\ntoward the desired motion style, such as gait, foot clearance, joint position,\nor body height. The predefined gait cycle is encoded in a flexible manner,\nfacilitating gait adjustments throughout the learning process. Extensive\nexperiments demonstrate that KAIST HOUND, a 45 kg robotic system, can achieve\nbiped, tripod, and quadruped locomotion using the proposed framework;\nquadrupedal capabilities include traversing uneven terrain, galloping at 4.67\nm/s, and overcoming obstacles up to 58 cm (67 cm for HOUND2); bipedal\ncapabilities include running at 3.6 m/s, carrying a 7.5 kg object, and\nascending stairs-all performed without exteroceptive input.\n","authors":["Gijeong Kim","Yong-Hoon Lee","Hae-Won Park"],"pdf_url":"https://arxiv.org/pdf/2409.15780v3.pdf","comment":"7 pages, 5 figures, Videos at https://youtu.be/JV2_HfTlOKI"},{"id":"http://arxiv.org/abs/2409.17519v1","updated":"2024-09-26T04:02:20Z","published":"2024-09-26T04:02:20Z","title":"Robotic Environmental State Recognition with Pre-Trained Vision-Language\n Models and Black-Box Optimization","summary":" In order for robots to autonomously navigate and operate in diverse\nenvironments, it is essential for them to recognize the state of their\nenvironment. On the other hand, the environmental state recognition has\ntraditionally involved distinct methods tailored to each state to be\nrecognized. In this study, we perform a unified environmental state recognition\nfor robots through the spoken language with pre-trained large-scale\nvision-language models. We apply Visual Question Answering and Image-to-Text\nRetrieval, which are tasks of Vision-Language Models. We show that with our\nmethod, it is possible to recognize not only whether a room door is\nopen/closed, but also whether a transparent door is open/closed and whether\nwater is running in a sink, without training neural networks or manual\nprogramming. In addition, the recognition accuracy can be improved by selecting\nappropriate texts from the set of prepared texts based on black-box\noptimization. For each state recognition, only the text set and its weighting\nneed to be changed, eliminating the need to prepare multiple different models\nand programs, and facilitating the management of source code and computer\nresource. We experimentally demonstrate the effectiveness of our method and\napply it to the recognition behavior on a mobile robot, Fetch.\n","authors":["Kento Kawaharazuka","Yoshiki Obinata","Naoaki Kanazawa","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2409.17519v1.pdf","comment":"Accepted at Advanced Robotics, website -\n https://haraduka.github.io/vlm-bbo/"},{"id":"http://arxiv.org/abs/2409.17497v1","updated":"2024-09-26T03:11:41Z","published":"2024-09-26T03:11:41Z","title":"Precise Interception Flight Targets by Image-based Visual Servoing of\n Multicopter","summary":" Interception of low-altitude intruding targets with low-cost drones equipped\nstrapdown camera presents a competitive option. However, the malicious\nmaneuvers by the non-cooperative target and the coupling of the camera make the\ntask challenging. To solve this problem, an Image-Based Visual Servoing (IBVS)\ncontrol algorithm based on proportional navigation guidance with field-of-view\nholding capability is designed. The proposed controller reduces the miss\ndistance while improving the stability of the visual servo system during\ninterception. Software-in-the-loop (SITL) simulation experiments show a 72.8%\nreduction in the circular error probability (CEP) compared to the most recent\nstudy. This improvement enhances interception accuracy from the decimeter to\nthe centimeter level. Real-world experiments further validate the effectiveness\nof the proposed algorithm.\n","authors":["Hailong Yan","Kun Yang","Yixiao Cheng","Zihao Wang","Dawei Li"],"pdf_url":"https://arxiv.org/pdf/2409.17497v1.pdf","comment":"9 pages, 15 figures, In the process of being submitted to the Journal\n of IEEE Transactions on Industrial Electronics"},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2409.17479v1","updated":"2024-09-26T02:30:17Z","published":"2024-09-26T02:30:17Z","title":"Traverse the Non-Traversable: Estimating Traversability for Wheeled\n Mobility on Vertically Challenging Terrain","summary":" Most traversability estimation techniques divide off-road terrain into\ntraversable (e.g., pavement, gravel, and grass) and non-traversable (e.g.,\nboulders, vegetation, and ditches) regions and then inform subsequent planners\nto produce trajectories on the traversable part. However, recent research\ndemonstrated that wheeled robots can traverse vertically challenging terrain\n(e.g., extremely rugged boulders comparable in size to the vehicles\nthemselves), which unfortunately would be deemed as non-traversable by existing\ntechniques. Motivated by such limitations, this work aims at identifying the\ntraversable from the seemingly non-traversable, vertically challenging terrain\nbased on past kinodynamic vehicle-terrain interactions in a data-driven manner.\nOur new Traverse the Non-Traversable(TNT) traversability estimator can\nefficiently guide a down-stream sampling-based planner containing a\nhigh-precision 6-DoF kinodynamic model, which becomes deployable onboard a\nsmall-scale vehicle. Additionally, the estimated traversability can also be\nused as a costmap to plan global and local paths without sampling. Our\nexperiment results show that TNT can improve planning performance, efficiency,\nand stability by 50%, 26.7%, and 9.2% respectively on a physical robot\nplatform.\n","authors":["Chenhui Pan","Aniket Datar","Anuj Pokhrel","Matthew Choulas","Mohammad Nazeri","Xuesu Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.17479v1.pdf","comment":"for associated video file, see\n https://www.youtube.com/watch?v=Shcalb8sGcA"},{"id":"http://arxiv.org/abs/2409.17470v1","updated":"2024-09-26T02:14:50Z","published":"2024-09-26T02:14:50Z","title":"Tactile Probabilistic Contact Dynamics Estimation of Unknown Objects","summary":" We study the problem of rapidly identifying contact dynamics of unknown\nobjects in partially known environments. The key innovation of our method is a\nnovel formulation of the contact dynamics estimation problem as the joint\nestimation of contact geometries and physical parameters. We leverage DeepSDF,\na compact and expressive neural-network-based geometry representation over a\ndistribution of geometries, and adopt a particle filter to estimate both the\ngeometries in contact and the physical parameters. In addition, we couple the\nestimator with an active exploration strategy that plans information-gathering\nmoves to further expedite online estimation. Through simulation and physical\nexperiments, we show that our method estimates accurate contact dynamics with\nfewer than 30 exploration moves for unknown objects touching partially known\nenvironments.\n","authors":["Jinhoo Kim","Yifan Zhu","Aaron Dollar"],"pdf_url":"https://arxiv.org/pdf/2409.17470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17469v1","updated":"2024-09-26T02:02:58Z","published":"2024-09-26T02:02:58Z","title":"Verti-Selector: Automatic Curriculum Learning for Wheeled Mobility on\n Vertically Challenging Terrain","summary":" Reinforcement Learning (RL) has the potential to enable extreme off-road\nmobility by circumventing complex kinodynamic modeling, planning, and control\nby simulated end-to-end trial-and-error learning experiences. However, most RL\nmethods are sample-inefficient when training in a large amount of manually\ndesigned simulation environments and struggle at generalizing to the real\nworld. To address these issues, we introduce Verti-Selector (VS), an automatic\ncurriculum learning framework designed to enhance learning efficiency and\ngeneralization by selectively sampling training terrain. VS prioritizes\nvertically challenging terrain with higher Temporal Difference (TD) errors when\nrevisited, thereby allowing robots to learn at the edge of their evolving\ncapabilities. By dynamically adjusting the sampling focus, VS significantly\nboosts sample efficiency and generalization within the VW-Chrono simulator\nbuilt on the Chrono multi-physics engine. Furthermore, we provide simulation\nand physical results using VS on a Verti-4-Wheeler platform. These results\ndemonstrate that VS can achieve 23.08% improvement in terms of success rate by\nefficiently sampling during training and robustly generalizing to the real\nworld.\n","authors":["Tong Xu","Chenhui Pan","Xuesu Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.17469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16162v3","updated":"2024-09-26T00:47:25Z","published":"2024-07-23T04:06:23Z","title":"Plant Robots: Harnessing Growth Actuation of Plants for Locomotion and\n Object Manipulation","summary":" Plants display physical displacements during their growth due to\nphotosynthesis, which converts light into chemical energy. This can be\ninterpreted as plants acting as actuators with a built-in power source. This\npaper presents a method to create plant robots that move and perform tasks by\nharnessing the actuation output of plants: displacement and force generated\nfrom the growing process. As the target plant, radish sprouts are employed, and\ntheir displacement and force are characterized, followed by the calculation of\npower and energy densities. Based on the characterization, two different plant\nrobots are designed and fabricated: a rotational robot and a gripper. The\nformer demonstrates ground locomotion, achieving a travel distance of 14.6 mm\nwith an average speed of 0.8 mm/h. The latter demonstrates the picking and\nplacing of an object with a 0.1-g mass by the light-controlled open-close\nmotion of plant fingers. A good agreement between the experimental and model\nvalues is observed in the specific data of the mobile robot, suggesting that\nobtaining the actuation characteristics of plants can enable the design and\nprediction of behavior in plant robots. These results pave the way for the\nrealization of novel types of environmentally friendly and sustainable robots.\n","authors":["Kazuya Murakami","Misao Sato","Momoki Kubota","Jun Shintake"],"pdf_url":"https://arxiv.org/pdf/2407.16162v3.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17443v1","updated":"2024-09-26T00:32:56Z","published":"2024-09-26T00:32:56Z","title":"Cat-and-Mouse Satellite Dynamics: Divergent Adversarial Reinforcement\n Learning for Contested Multi-Agent Space Operations","summary":" As space becomes increasingly crowded and contested, robust autonomous\ncapabilities for multi-agent environments are gaining critical importance.\nCurrent autonomous systems in space primarily rely on optimization-based path\nplanning or long-range orbital maneuvers, which have not yet proven effective\nin adversarial scenarios where one satellite is actively pursuing another. We\nintroduce Divergent Adversarial Reinforcement Learning (DARL), a two-stage\nMulti-Agent Reinforcement Learning (MARL) approach designed to train autonomous\nevasion strategies for satellites engaged with multiple adversarial spacecraft.\nOur method enhances exploration during training by promoting diverse\nadversarial strategies, leading to more robust and adaptable evader models. We\nvalidate DARL through a cat-and-mouse satellite scenario, modeled as a\npartially observable multi-agent capture the flag game where two adversarial\n`cat' spacecraft pursue a single `mouse' evader. DARL's performance is compared\nagainst several benchmarks, including an optimization-based satellite path\nplanner, demonstrating its ability to produce highly robust models for\nadversarial multi-agent space environments.\n","authors":["Cameron Mehlman","Joseph Abramov","Gregory Falco"],"pdf_url":"https://arxiv.org/pdf/2409.17443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17435v1","updated":"2024-09-26T00:05:36Z","published":"2024-09-26T00:05:36Z","title":"Active Vision Might Be All You Need: Exploring Active Vision in Bimanual\n Robotic Manipulation","summary":" Imitation learning has demonstrated significant potential in performing\nhigh-precision manipulation tasks using visual feedback from cameras. However,\nit is common practice in imitation learning for cameras to be fixed in place,\nresulting in issues like occlusion and limited field of view. Furthermore,\ncameras are often placed in broad, general locations, without an effective\nviewpoint specific to the robot's task. In this work, we investigate the\nutility of active vision (AV) for imitation learning and manipulation, in\nwhich, in addition to the manipulation policy, the robot learns an AV policy\nfrom human demonstrations to dynamically change the robot's camera viewpoint to\nobtain better information about its environment and the given task. We\nintroduce AV-ALOHA, a new bimanual teleoperation robot system with AV, an\nextension of the ALOHA 2 robot system, incorporating an additional 7-DoF robot\narm that only carries a stereo camera and is solely tasked with finding the\nbest viewpoint. This camera streams stereo video to an operator wearing a\nvirtual reality (VR) headset, allowing the operator to control the camera pose\nusing head and body movements. The system provides an immersive teleoperation\nexperience, with bimanual first-person control, enabling the operator to\ndynamically explore and search the scene and simultaneously interact with the\nenvironment. We conduct imitation learning experiments of our system both in\nreal-world and in simulation, across a variety of tasks that emphasize\nviewpoint planning. Our results demonstrate the effectiveness of human-guided\nAV for imitation learning, showing significant improvements over fixed cameras\nin tasks with limited visibility. Project website:\nhttps://soltanilara.github.io/av-aloha/\n","authors":["Ian Chuang","Andrew Lee","Dechen Gao","Iman Soltani"],"pdf_url":"https://arxiv.org/pdf/2409.17435v1.pdf","comment":"6 pages, 4 figures"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2409.18097v1","updated":"2024-09-26T17:41:04Z","published":"2024-09-26T17:41:04Z","title":"A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale\n Autonomous Vehicle","summary":" In recent years, several competitions have highlighted the need to\ninvestigate vision-based solutions to address scenarios with functional\ninsufficiencies in perception, world modeling and localization. This article\npresents the Vision-based Lane Keeping System (VbLKS) developed by the\nDEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022.\nThe main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied\nVbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a\ntailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading\nError (LHE), is estimated at a constant lookahead distance employing a\nConvolutional Neural Network (CNN). A training strategy for a compact CNN is\nproposed, emphasizing data generation and augmentation on simulated camera\nimages from a 3D Gazebo simulator, and enabling real-time operation on\nlow-level hardware. A tailored PP-based lateral controller equipped with a\nderivative action and a PP-based velocity reference generation are implemented.\nTuning ranges are established through a systematic time-delay stability\nanalysis. Validation in a representative controlled laboratory setting is\nprovided.\n","authors":["Antonio Gallina","Matteo Grandin","Angelo Cenedese","Mattia Bruschetta"],"pdf_url":"https://arxiv.org/pdf/2409.18097v1.pdf","comment":"16 pages, 23 figures"},{"id":"http://arxiv.org/abs/2409.18010v1","updated":"2024-09-26T16:19:49Z","published":"2024-09-26T16:19:49Z","title":"End-to-end guarantees for indirect data-driven control of bilinear\n systems with finite stochastic data","summary":" In this paper we propose an end-to-end algorithm for indirect data-driven\ncontrol for bilinear systems with stability guarantees. We consider the case\nwhere the collected i.i.d. data is affected by probabilistic noise with\npossibly unbounded support and leverage tools from statistical learning theory\nto derive finite sample identification error bounds. To this end, we solve the\nbilinear identification problem by solving a set of linear and affine\nidentification problems, by a particular choice of a control input during the\ndata collection phase. We provide a priori as well as data-dependent finite\nsample identification error bounds on the individual matrices as well as\nellipsoidal bounds, both of which are structurally suitable for control.\nFurther, we integrate the structure of the derived identification error bounds\nin a robust controller design to obtain an exponentially stable closed-loop. By\nmeans of an extensive numerical study we showcase the interplay between the\ncontroller design and the derived identification error bounds. Moreover, we\nnote appealing connections of our results to indirect data-driven control of\ngeneral nonlinear systems through Koopman operator theory and discuss how our\nresults may be applied in this setup.\n","authors":["Nicolas Chatzikiriakos","Robin Strässer","Frank Allgöwer","Andrea Iannelli"],"pdf_url":"https://arxiv.org/pdf/2409.18010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18009v1","updated":"2024-09-26T16:19:37Z","published":"2024-09-26T16:19:37Z","title":"Control Industrial Automation System with Large Language Models","summary":" Traditional industrial automation systems require specialized expertise to\noperate and complex reprogramming to adapt to new processes. Large language\nmodels offer the intelligence to make them more flexible and easier to use.\nHowever, LLMs' application in industrial settings is underexplored. This paper\nintroduces a framework for integrating LLMs to achieve end-to-end control of\nindustrial automation systems. At the core of the framework are an agent system\ndesigned for industrial tasks, a structured prompting method, and an\nevent-driven information modeling mechanism that provides real-time data for\nLLM inference. The framework supplies LLMs with real-time events on different\ncontext semantic levels, allowing them to interpret the information, generate\nproduction plans, and control operations on the automation system. It also\nsupports structured dataset creation for fine-tuning on this downstream\napplication of LLMs. Our contribution includes a formal system design,\nproof-of-concept implementation, and a method for generating task-specific\ndatasets for LLM fine-tuning and testing. This approach enables a more adaptive\nautomation system that can respond to spontaneous events, while allowing easier\noperation and configuration through natural language for more intuitive\nhuman-machine interaction. We provide demo videos and detailed data on GitHub:\nhttps://github.com/YuchenXia/LLM4IAS\n","authors":["Yuchen Xia","Nasser Jazdi","Jize Zhang","Chaitanya Shah","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2409.18009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17997v1","updated":"2024-09-26T16:07:38Z","published":"2024-09-26T16:07:38Z","title":"Distributed Invariant Unscented Kalman Filter based on Inverse\n Covariance Intersection with Intermittent Measurements","summary":" This paper studies the problem of distributed state estimation (DSE) over\nsensor networks on matrix Lie groups, which is crucial for applications where\nsystem states evolve on Lie groups rather than vector spaces. We propose a\ndiffusion-based distributed invariant Unscented Kalman Filter using the inverse\ncovariance intersection (DIUKF-ICI) method to address target tracking in 3D\nenvironments. Unlike existing distributed UKFs confined to vector spaces, our\napproach extends the distributed UKF framework to Lie groups, enabling local\nestimates to be fused with intermediate information from neighboring agents on\nLie groups. To handle the unknown correlations across local estimates, we\nextend the ICI fusion strategy to matrix Lie groups for the first time and\nintegrate it into the diffusion algorithm. We demonstrate that the estimation\nerror of the proposed method is bounded. Additionally, the algorithm is fully\ndistributed, robust against intermittent measurements, and adaptable to\ntime-varying communication topologies. The effectiveness of the proposed method\nis validated through extensive Monte-Carlo simulations.\n","authors":["Zhian Ruan","Yizhi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17988v1","updated":"2024-09-26T15:57:20Z","published":"2024-09-26T15:57:20Z","title":"Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or\n Low-light Conditions","summary":" The stark contrast in the design philosophy of an event camera makes it\nparticularly ideal for operating under high-speed, high dynamic range and\nlow-light conditions, where standard cameras underperform. Nonetheless, event\ncameras still suffer from some amount of motion blur, especially under these\nchallenging conditions, in contrary to what most think. This is attributed to\nthe limited bandwidth of the event sensor pixel, which is mostly proportional\nto the light intensity. Thus, to ensure that event cameras can truly excel in\nsuch conditions where it has an edge over standard cameras, it is crucial to\naccount for event motion blur in downstream applications, especially\nreconstruction. However, none of the recent works on reconstructing Neural\nRadiance Fields (NeRFs) from events, nor event simulators, have considered the\nfull effects of event motion blur. To this end, we propose, Deblur e-NeRF, a\nnovel method to directly and effectively reconstruct blur-minimal NeRFs from\nmotion-blurred events generated under high-speed motion or low-light\nconditions. The core component of this work is a physically-accurate pixel\nbandwidth model proposed to account for event motion blur under arbitrary speed\nand lighting conditions. We also introduce a novel threshold-normalized total\nvariation loss to improve the regularization of large textureless patches.\nExperiments on real and novel realistically simulated sequences verify our\neffectiveness. Our code, event simulator and synthetic event dataset will be\nopen-sourced.\n","authors":["Weng Fei Low","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17988v1.pdf","comment":"Accepted to ECCV 2024. Project website is accessible at\n https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with\n arXiv:2006.07722 by other authors"},{"id":"http://arxiv.org/abs/2409.17931v1","updated":"2024-09-26T15:08:38Z","published":"2024-09-26T15:08:38Z","title":"Intelligent Energy Management: Remaining Useful Life Prediction and\n Charging Automation System Comprised of Deep Learning and the Internet of\n Things","summary":" Remaining Useful Life (RUL) of battery is an important parameter to know the\nbattery's remaining life and need for recharge. The goal of this research\nproject is to develop machine learning-based models for the battery RUL\ndataset. Different ML models are developed to classify the RUL of the vehicle,\nand the IoT (Internet of Things) concept is simulated for automating the\ncharging system and managing any faults aligning. The graphs plotted depict the\nrelationship between various vehicle parameters using the Blynk IoT platform.\nResults show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent\nUnit (GRU), and hybrid model developed could classify RUL into three classes\nwith 99% more accuracy. The data is fed using the tkinter GUI for simulating\nartificial intelligence (AI)-based charging, and with a pyserial backend, data\ncan be entered into the Esp-32 microcontroller for making charge discharge\npossible with the model's predictions. Also, with an IoT system, the charging\ncan be disconnected, monitored, and analyzed for automation. The results show\nthat an accuracy of 99% can be obtained on models MLP, catboost model and\nsimilar accuracy on GRU model can be obtained, and finally relay-based\ntriggering can be made by prediction through the model used for automating the\ncharging and energy-saving mechanism. By showcasing an exemplary Blynk\nplatform-based monitoring and automation phenomenon, we further present\ninnovative ways of monitoring parameters and automating the system.\n","authors":["Biplov Paneru","Bishwash Paneru","DP Sharma Mainali"],"pdf_url":"https://arxiv.org/pdf/2409.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17916v1","updated":"2024-09-26T15:02:11Z","published":"2024-09-26T15:02:11Z","title":"Observer-Based Discontinuous Communication in the Secondary Control of\n AC Microgrids","summary":" This paper proposes an observer-based event-driven approach to decrease the\noveruse of communication networks. The suggested approach aims to estimate the\nrequired data for sharing between units in line with as much communication\nreduction as possible. In other words, the proposed approach effectively\ndetermines which state variables should be shared (observer concept) among the\nunits during specific time intervals (event-triggered concept). This strategy\nsignificantly reduces the overall communication load. It is shown that the\nestimation error remains bounded and Zeno behavior, characterized by an endless\nnumber of transmissions occurring within a limited time frame, does not occur.\nThe proposed methodology can be systematically applied to any\ncommunication-based secondary controller in alternating current (AC)\nmicrogrids. Simulation results demonstrate a high degree of precision in\nestimating the states under the proposed approach. Also, the secondary\ncontroller performance under the proposed method is evaluated in\nMATLAB/Simulink environment.\n","authors":["Shahabeddin Najafi","Yazdan Batmani","Pouya Shafiee","Charalambos Konstantinou"],"pdf_url":"https://arxiv.org/pdf/2409.17916v1.pdf","comment":"2024 IEEE PES Innovative Smart Grid Technologies Europe (ISGT Europe)"},{"id":"http://arxiv.org/abs/2409.17907v1","updated":"2024-09-26T14:52:51Z","published":"2024-09-26T14:52:51Z","title":"PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR","summary":" LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous\ndriving, offering precise 3D spatial information. Previous signal attacks\nagainst LiDAR systems mainly exploit laser signals. In this paper, we\ninvestigate the possibility of cross-modality signal injection attacks, i.e.,\ninjecting intentional electromagnetic interference (IEMI) to manipulate LiDAR\noutput. Our insight is that the internal modules of a LiDAR, i.e., the laser\nreceiving circuit, the monitoring sensors, and the beam-steering modules, even\nwith strict electromagnetic compatibility (EMC) testing, can still couple with\nthe IEMI attack signals and result in the malfunction of LiDAR systems. Based\non the above attack surfaces, we propose the PhantomLiDAR attack, which\nmanipulates LiDAR output in terms of Points Interference, Points Injection,\nPoints Removal, and even LiDAR Power-Off. We evaluate and demonstrate the\neffectiveness of PhantomLiDAR with both simulated and real-world experiments on\nfive COTS LiDAR systems. We also conduct feasibility experiments in real-world\nmoving scenarios. We provide potential defense measures that can be implemented\nat both the sensor level and the vehicle system level to mitigate the risks\nassociated with IEMI attacks. Video demonstrations can be viewed at\nhttps://sites.google.com/view/phantomlidar.\n","authors":["Zizhi Jin","Qinhong Jiang","Xuancun Lu","Chen Yan","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16899v2","updated":"2024-09-26T14:47:38Z","published":"2024-08-29T20:53:03Z","title":"Network-aware Recommender System via Online Feedback Optimization","summary":" Personalized content on social platforms can exacerbate negative phenomena\nsuch as polarization, partly due to the feedback interactions between\nrecommendations and the users. In this paper, we present a control-theoretic\nrecommender system that explicitly accounts for this feedback loop to mitigate\npolarization. Our approach extends online feedback optimization - a control\nparadigm for steady-state optimization of dynamical systems - to develop a\nrecommender system that trades off users engagement and polarization reduction,\nwhile relying solely on online click data. We establish theoretical guarantees\nfor optimality and stability of the proposed design and validate its\neffectiveness via numerical experiments with a user population governed by\nFriedkin-Johnsen dynamics. Our results show these \"network-aware\"\nrecommendations can significantly reduce polarization while maintaining high\nlevels of user engagement.\n","authors":["Sanjay Chandrasekaran","Giulia De Pasquale","Giuseppe Belgioioso","Florian Dörfler"],"pdf_url":"https://arxiv.org/pdf/2408.16899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17896v1","updated":"2024-09-26T14:47:14Z","published":"2024-09-26T14:47:14Z","title":"Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV\n Attitude Control Under Varying Wind Conditions","summary":" This paper evaluates and compares the performance of model-free and\nmodel-based reinforcement learning for the attitude control of fixed-wing\nunmanned aerial vehicles using PID as a reference point. The comparison focuses\non their ability to handle varying flight dynamics and wind disturbances in a\nsimulated environment. Our results show that the Temporal Difference Model\nPredictive Control agent outperforms both the PID controller and other\nmodel-free reinforcement learning methods in terms of tracking accuracy and\nrobustness over different reference difficulties, particularly in nonlinear\nflight regimes. Furthermore, we introduce actuation fluctuation as a key metric\nto assess energy efficiency and actuator wear, and we test two different\napproaches from the literature: action variation penalty and conditioning for\naction policy smoothness. We also evaluate all control methods when subject to\nstochastic turbulence and gusts separately, so as to measure their effects on\ntracking performance, observe their limitations and outline their implications\non the Markov decision process formalism.\n","authors":["David Olivares","Pierre Fournier","Pavan Vasishta","Julien Marzat"],"pdf_url":"https://arxiv.org/pdf/2409.17896v1.pdf","comment":"Published at ICINCO 2024"},{"id":"http://arxiv.org/abs/2409.17881v1","updated":"2024-09-26T14:28:20Z","published":"2024-09-26T14:28:20Z","title":"Discontinuous Reception with Adjustable Inactivity Timer for IIoT","summary":" Discontinuous reception (DRX) is a key technology for reducing the energy\nconsumption of industrial Internet of Things (IIoT) devices. Specifically, DRX\nallows the devices to operate in a low-power mode when no data reception is\nscheduled, and its effectiveness depends on the proper configuration of the DRX\nparameters. In this paper, we characterize the DRX process departing from a\nsemi-Markov chain modeling. We detail two ways to set DRX parameters to\nminimize the device power consumption while meeting a mean delay constraint.\nThe first method exhaustively searches for the optimal configuration. In\ncontrast, the second method uses a low-complexity metaheuristic to find a\nsub-optimal configuration, thus considering ideal and practical DRX\nconfigurations. Notably, within the DRX parameters, the inactivity timer (IT)\nis a caution time that specifies how long a device remains active after the\nlast information exchange. Traditionally, a device implementing DRX will\nrestart the IT after each data reception as a precedent to a low-power mode.\nThe usual approach lies in restarting the IT whenever new data is received\nduring this cautious period, which might sometimes needlessly extend the active\ntime. Herein, we propose a more efficient method in which the transmit base\nstation (BS) explicitly indicates restarting the timer through the control\nchannel only when appropriate. The decision is taken based on the BS's\nknowledge about its buffer status. We consider Poisson and bursty traffic\nmodels, which are typical in IIoT setups, and verify the suitability of our\nproposal for reducing the energy consumption of the devices without\nsignificantly compromising the communication latency through extensive\nnumerical simulations. Specifically, energy-saving gains of up to 30% can be\nobtained regardless of the arrival rate and delay constraints.\n","authors":["David E. Ruíz-Guirola","Carlos A. Rodríguez-López","Onel L. A. López","Samuel Montejo-Sánchez","Vitalio Alfonso Reguera","Matti Latva-aho"],"pdf_url":"https://arxiv.org/pdf/2409.17881v1.pdf","comment":"IEEE Transactions on Industrial Informatics (2024)"},{"id":"http://arxiv.org/abs/2409.03175v2","updated":"2024-09-26T12:05:28Z","published":"2024-09-05T02:08:30Z","title":"Data-based approaches to learning and control by similarity between\n heterogeneous systems","summary":" This paper proposes basic definitions of similarity and similarity indexes\nbetween admissible behaviors of heterogeneous host and guest systems and\nfurther presents a similarity-based learning control framework by exploiting\nthe offline sampled data. By exploring helpful geometric properties of the\nadmissible behavior and decomposing it into the subspace and offset components,\nthe similarity indexes between two admissible behaviors are defined as the\nprincipal angles between their corresponding subspace components. By\nreconstructing the admissible behaviors leveraging sampled data, an efficient\nstrategy for calculating the similarity indexes is developed, based on which a\nsimilarity-based learning control framework is proposed. It is shown that, with\nthe application of similarity-based learning control, the host system can\ndirectly accomplish the same control tasks by utilizing the successful\nexperience provided by the guest system, without having to undergo the\ntrial-and-error process. All results in this paper are supported by simulation\nexamples.\n","authors":["Chenchao Wang","Deyuan Meng"],"pdf_url":"https://arxiv.org/pdf/2409.03175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17720v1","updated":"2024-09-26T10:43:09Z","published":"2024-09-26T10:43:09Z","title":"Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations\n Between Initial and Final Scenes","summary":" With robots increasingly collaborating with humans in everyday tasks, it is\nimportant to take steps toward robotic systems capable of understanding the\nenvironment. This work focuses on scene understanding to detect pick and place\ntasks given initial and final images from the scene. To this end, a dataset is\ncollected for object detection and pick and place task detection. A YOLOv5\nnetwork is subsequently trained to detect the objects in the initial and final\nscenes. Given the detected objects and their bounding boxes, two methods are\nproposed to detect the pick and place tasks which transform the initial scene\ninto the final scene. A geometric method is proposed which tracks objects'\nmovements in the two scenes and works based on the intersection of the bounding\nboxes which moved within scenes. Contrarily, the CNN-based method utilizes a\nConvolutional Neural Network to classify objects with intersected bounding\nboxes into 5 classes, showing the spatial relationship between the involved\nobjects. The performed pick and place tasks are then derived from analyzing the\nexperiments with both scenes. Results show that the CNN-based method, using a\nVGG16 backbone, outscores the geometric method by roughly 12 percentage points\nin certain scenarios, with an overall success rate of 84.3%.\n","authors":["Seraj Ghasemi","Hamed Hosseini","MohammadHossein Koosheshi","Mehdi Tale Masouleh","Ahmad Kalhor"],"pdf_url":"https://arxiv.org/pdf/2409.17720v1.pdf","comment":"Conference Paper, ICEE 2024, 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17705v1","updated":"2024-09-26T10:21:39Z","published":"2024-09-26T10:21:39Z","title":"On the Output Redundancy of LTI Systems: A Geometric Approach with\n Application to Privacy","summary":" This paper examines the properties of output-redundant systems, that is,\nsystems possessing a larger number of outputs than inputs, through the lenses\nof the geometric approach of Wonham et al. We begin by formulating a simple\noutput allocation synthesis problem, which involves ``concealing\" input\ninformation from a malicious eavesdropper having access to the system output,\nwhile still allowing for a legitimate user to reconstruct it. It is shown that\nthe solvability of this problem requires the availability of a redundant set of\noutputs. This very problem is instrumental to unveiling the fundamental\ngeometric properties of output-redundant systems, which form the basis for our\nsubsequent constructions and results. As a direct application, we demonstrate\nhow output allocation can be employed to effectively protect the information of\ninput information from certain output eavesdroppers with guaranteed results.\n","authors":["Guitao Yang","Alexander J. Gallo","Angelo Barboni","Riccardo M. G. Ferrari","Andrea Serrani","Thomas Parisini"],"pdf_url":"https://arxiv.org/pdf/2409.17705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10668v2","updated":"2024-09-26T09:38:01Z","published":"2024-02-16T13:19:04Z","title":"Data-Driven Abstractions for Control Systems via Random Exploration","summary":" At the intersection of dynamical systems, control theory, and formal methods\nlies the construction of symbolic abstractions: these typically represent\nsimpler, finite-state models whose behavior mimics that of an underlying\nconcrete system but are easier to analyse. Building an abstraction usually\nrequires an accurate knowledge of the underlying model: this knowledge may be\ncostly to gather, especially in real-life applications. We aim to bridge this\ngap by building abstractions based on sampling finite length trajectories. To\nrefine a controller built for the abstraction to one for the concrete system,\nwe newly define a notion of probabilistic alternating simulation, and provide\nProbably Approximately Correct (PAC) guarantees that the constructed\nabstraction includes all behaviors of the concrete system and that it is\nsuitable for control design, for arbitrarily long time horizons, leveraging\nscenario theory. Our method is then tested on several numerical benchmarks.\n","authors":["Rudi Coppola","Andrea Peruffo","Manuel Mazo Jr"],"pdf_url":"https://arxiv.org/pdf/2402.10668v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17672v1","updated":"2024-09-26T09:31:42Z","published":"2024-09-26T09:31:42Z","title":"Semantic model for the description of energy data in the Module Type\n Package","summary":" Modular production systems that employ the Module Type Package (MTP) to\ndescribe module interfaces can, at present, only communicate energy data\nthrough proprietary solutions. Due to this limitation, users face additional\neffort when calculating energy KPIs for modules or determining the energy\nefficiency of modules. To address this issue, we present a model that\nfacilitates energy data to be described semantically and uniformly in the MTP\non the basis of an industrial standard (OPC 34100). MTPs incorporating this\nmodel can transmit semantically consistent energy data from modules to the\nprocess control system, making the data available for further applications,\nsuch as monitoring or optimization.\n","authors":["Leif-Thore Reiche","Felix Gehlhoff","Alexander Fay"],"pdf_url":"https://arxiv.org/pdf/2409.17672v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17613v1","updated":"2024-09-26T08:03:33Z","published":"2024-09-26T08:03:33Z","title":"Stereographic Projection of Probabilistic Frequency-Domain Uncertainty","summary":" This paper investigates the stereographic projection of points along the\nNyquist plots of single input single output (SISO) linear time invariant (LTI)\nsystems subject to probabilistic uncertainty. At each frequency, there\ncorresponds a complex-valued random variable with given probability\ndistribution in the complex plane. The chordal distance between the\nstereographic projections of this complex value and the corresponding value for\na nominal model, as per the well-known Nu-Gap metric of Vinnicombe, is also a\nrandom quantity. The main result provides the cumulative density function (CDF)\nof the chordal distance at a given frequency. Such a stochastic distance\nframework opens up a fresh and a fertile research direction on probabilistic\nrobust control theory.\n","authors":["Anton Nystrom","Venkatraman Renganathan","Michael Cantoni"],"pdf_url":"https://arxiv.org/pdf/2409.17613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04436v2","updated":"2024-09-26T07:43:44Z","published":"2023-09-30T12:54:55Z","title":"Adaptive Control of an Inverted Pendulum by a Reinforcement\n Learning-based LQR Method","summary":" Inverted pendulums constitute one of the popular systems for benchmarking\ncontrol algorithms. Several methods have been proposed for the control of this\nsystem, the majority of which rely on the availability of a mathematical model.\nHowever, deriving a mathematical model using physical parameters or system\nidentification techniques requires manual effort. Moreover, the designed\ncontrollers may perform poorly if system parameters change. To mitigate these\nproblems, recently, some studies used Reinforcement Learning (RL) based\napproaches for the control of inverted pendulum systems. Unfortunately, these\nmethods suffer from slow convergence and local minimum problems. Moreover, they\nmay require hyperparameter tuning which complicates the design process\nsignificantly. To alleviate these problems, the present study proposes an\nLQR-based RL method for adaptive balancing control of an inverted pendulum. As\nshown by numerical experiments, the algorithm stabilizes the system very fast\nwithout requiring a mathematical model or extensive hyperparameter tuning. In\naddition, it can adapt to parametric changes online.\n","authors":["Ugur Yildiran"],"pdf_url":"https://arxiv.org/pdf/2310.04436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15958v2","updated":"2024-09-26T06:41:11Z","published":"2024-03-23T23:27:37Z","title":"Convection-Enabled Boundary Control of a 2D Channel Flow","summary":" Nonlinear convection, the source of turbulence in fluid flows, may hold the\nkey to stabilizing turbulence by solving a specific cubic polynomial equation.\nWe consider the incompressible Navier-Stokes equations in a two-dimensional\nchannel. The tangential and normal velocities are assumed to be periodic in the\nstreamwise direction. The pressure difference between the left and right ends\nof the channel is constant. Moreover, we consider no-slip boundary conditions,\nthat is, zero tangential velocity, at the top and bottom walls of the channel,\nand normal velocity actuation at the top and bottom walls. We design the\nboundary control inputs to achieve global exponential stabilization, in the L2\nsense, of a chosen Poiseuille equilibrium profile for an arbitrarily large\nReynolds number. The key idea behind our approach is to select the boundary\ncontrollers such that they have zero spatial mean (to guarantee mass\nconservation) but non-zero spatial cubic mean. We reveal that, because of\nconvection, the time derivative of the L2 energy of the regulation error is a\ncubic polynomial in the cubic mean of the boundary inputs. Regulation is then\nachieved by solving a specific cubic equation, using the Cardano root formula.\nThe results are illustrated via a numerical example.\n","authors":["Mohamed Camil Belhadjoudja","Miroslav Krstic","Emmanuel Witrant"],"pdf_url":"https://arxiv.org/pdf/2403.15958v2.pdf","comment":"To be presented at the 63rd IEEE Conference on Decision and Control\n (CDC 2024)"},{"id":"http://arxiv.org/abs/2409.13624v2","updated":"2024-09-26T06:40:18Z","published":"2024-09-20T16:34:31Z","title":"Safe stabilization using generalized Lyapunov barrier function","summary":" This paper addresses the safe stabilization problem, focusing on controlling\nthe system state to the origin while avoiding entry into unsafe state sets. The\ncurrent methods for solving this issue rely on smooth Lyapunov and barrier\nfunctions, which do not always ensure the existence of an effective controller\neven when such smooth functions are created. To tackle this challenge, we\nintroduce the concept of a generalized (nonsmooth) Lyapunov barrier function\n(GenLBF), which guarantees the existence of a safe and stable controller. We\noutline a systematic approach for constructing a GenLBF, including a technique\nfor efficiently calculating the upper generalized derivative of the GenLBF.\nUsing the constructed GenLBF, we propose a method for certifying safe\nstabilization of autonomous systems and design a piecewise continuous feedback\ncontrol to achieve safe stabilization of non-autonomous systems. A general\ncontroller refinement strategy is further proposed to help the state trajectory\nescape from undesired local points occurring in systems with special physical\nstructure. A thorough theoretical analysis demonstrates the effectiveness of\nour method in addressing the safe stabilization problem for systems with single\nor multiple bounded unsafe state sets. Extensive simulations of linear and\nnonlinear systems further illustrate the efficacy of the proposed method and\nits superiority over the smooth control Lyapunov barrier function method.\n","authors":["Jianglin Lan","Eldert van Henten","Peter Groot Koerkamp","Congcong Sun"],"pdf_url":"https://arxiv.org/pdf/2409.13624v2.pdf","comment":"19 pages, 14 figures, under review by a journal"},{"id":"http://arxiv.org/abs/2409.17500v1","updated":"2024-09-26T03:12:53Z","published":"2024-09-26T03:12:53Z","title":"GLinSAT: The General Linear Satisfiability Neural Network Layer By\n Accelerated Gradient Descent","summary":" Ensuring that the outputs of neural networks satisfy specific constraints is\ncrucial for applying neural networks to real-life decision-making problems. In\nthis paper, we consider making a batch of neural network outputs satisfy\nbounded and general linear constraints. We first reformulate the neural network\noutput projection problem as an entropy-regularized linear programming problem.\nWe show that such a problem can be equivalently transformed into an\nunconstrained convex optimization problem with Lipschitz continuous gradient\naccording to the duality theorem. Then, based on an accelerated gradient\ndescent algorithm with numerical performance enhancement, we present our\narchitecture, GLinSAT, to solve the problem. To the best of our knowledge, this\nis the first general linear satisfiability layer in which all the operations\nare differentiable and matrix-factorization-free. Despite the fact that we can\nexplicitly perform backpropagation based on automatic differentiation\nmechanism, we also provide an alternative approach in GLinSAT to calculate the\nderivatives based on implicit differentiation of the optimality condition.\nExperimental results on constrained traveling salesman problems, partial graph\nmatching with outliers, predictive portfolio allocation and power system unit\ncommitment demonstrate the advantages of GLinSAT over existing satisfiability\nlayers.\n","authors":["Hongtai Zeng","Chao Yang","Yanzhen Zhou","Cheng Yang","Qinglai Guo"],"pdf_url":"https://arxiv.org/pdf/2409.17500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2409.17488v1","updated":"2024-09-26T02:50:32Z","published":"2024-09-26T02:50:32Z","title":"Optimal control of stochastic reaction networks with entropic control\n cost and emergence of mode-switching strategies","summary":" Controlling the stochastic dynamics of biological populations is a challenge\nthat arises across various biological contexts. However, these dynamics are\ninherently nonlinear and involve a discrete state space, i.e., the number of\nmolecules, cells, or organisms. Additionally, the possibility of extinction has\na significant impact on both the dynamics and control strategies, particularly\nwhen the population size is small. These factors hamper the direct application\nof conventional control theories to biological systems. To address these\nchallenges, we formulate the optimal control problem for stochastic population\ndynamics by utilizing a control cost function based on the Kullback-Leibler\ndivergence. This approach naturally accounts for population-specific factors\nand simplifies the complex nonlinear Hamilton-Jacobi-Bellman equation into a\nlinear form, facilitating efficient computation of optimal solutions. We\ndemonstrate the effectiveness of our approach by applying it to the control of\ninteracting random walkers, Moran processes, and SIR models, and observe the\nmode-switching phenomena in the control strategies. Our approach provides new\nopportunities for applying control theory to a wide range of biological\nproblems.\n","authors":["Shuhei A. Horiguchi","Tetsuya J. Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2409.17488v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.15139v2","updated":"2024-09-26T00:58:52Z","published":"2024-09-23T15:42:53Z","title":"The Top Manifold Connectedness of Quantum Control Landscapes","summary":" The control of quantum systems has been proven to possess trap-free\noptimization landscapes under the satisfaction of proper assumptions. However,\nmany details of the landscape geometry and their influence on search efficiency\nstill need to be fully understood. This paper numerically explores the\npath-connectedness of globally optimal control solutions forming the top\nmanifold of the landscape. We randomly sample a plurality of optimal controls\nin the top manifold to assess the existence of a continuous path at the top of\nthe landscape that connects two arbitrary optimal solutions. It is shown that\nfor different quantum control objectives including state-to-state transition\nprobabilities, observable expectation values and unitary transformations, such\na continuous path can be readily found, implying that these top manifolds are\nfundamentally path-connected. The significance of the latter conjecture lies in\nseeking locations in the top manifold where an ancillary objective can also be\noptimized while maintaining the full optimality of the original objective that\ndefined the landscape.\n","authors":["Yidian Fan","Re-Bing Wu","Tak-San Ho","Gaurav V. Bhole","Herschel Rabitz"],"pdf_url":"https://arxiv.org/pdf/2409.15139v2.pdf","comment":"34 pages, 10 figures"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2407.00312v2","updated":"2024-09-26T17:55:37Z","published":"2024-06-29T04:29:03Z","title":"UDC: A Unified Neural Divide-and-Conquer Framework for Large-Scale\n Combinatorial Optimization Problems","summary":" Single-stage neural combinatorial optimization solvers have achieved\nnear-optimal results on various small-scale combinatorial optimization (CO)\nproblems without needing expert knowledge. However, these solvers exhibit\nsignificant performance degradation when applied to large-scale CO problems.\nRecently, two-stage neural methods with divide-and-conquer strategies have\nshown efficiency in addressing large-scale CO problems. Nevertheless, the\nperformance of these methods highly relies on problem-specific heuristics in\neither the divide or the conquer procedure, which limits their applicability to\ngeneral CO problems. Moreover, these methods employ separate training schemes\nand ignore the interdependencies between the dividing and conquering\nstrategies, which often leads to sub-optimal solutions. To tackle these\ndrawbacks, this article develops a unified neural divide-and-conquer framework\n(i.e., UDC) for solving general large-scale CO problems. UDC offers a\nDivide-Conquer-Reunion (DCR) training method to eliminate the negative impact\nof a sub-optimal dividing policy. Employing a high-efficiency Graph Neural\nNetwork (GNN) for global instance dividing and a fixed-length sub-path solver\nfor conquering divided sub-problems, the proposed UDC framework demonstrates\nextensive applicability, achieving superior performance in 10 representative\nlarge-scale CO problems. The code is available at\nhttps://github.com/CIAM-Group/NCO_code/tree/main/single_objective/UDC-Large-scale-CO-master.\n","authors":["Zhi Zheng","Changliang Zhou","Tong Xialiang","Mingxuan Yuan","Zhenkun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.00312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18101v1","updated":"2024-09-26T17:44:52Z","published":"2024-09-26T17:44:52Z","title":"AI-Powered Augmented Reality for Satellite Assembly, Integration and\n Test","summary":" The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is\nset to transform satellite Assembly, Integration, and Testing (AIT) processes\nby enhancing precision, minimizing human error, and improving operational\nefficiency in cleanroom environments. This paper presents a technical\ndescription of the European Space Agency's (ESA) project \"AI for AR in\nSatellite AIT,\" which combines real-time computer vision and AR systems to\nassist technicians during satellite assembly. Leveraging Microsoft HoloLens 2\nas the AR interface, the system delivers context-aware instructions and\nreal-time feedback, tackling the complexities of object recognition and 6D pose\nestimation in AIT workflows. All AI models demonstrated over 70% accuracy, with\nthe detection model exceeding 95% accuracy, indicating a high level of\nperformance and reliability. A key contribution of this work lies in the\neffective use of synthetic data for training AI models in AR applications,\naddressing the significant challenges of obtaining real-world datasets in\nhighly dynamic satellite environments, as well as the creation of the Segmented\nAnything Model for Automatic Labelling (SAMAL), which facilitates the automatic\nannotation of real data, achieving speeds up to 20 times faster than manual\nhuman annotation. The findings demonstrate the efficacy of AI-driven AR systems\nin automating critical satellite assembly tasks, setting a foundation for\nfuture innovations in the space industry.\n","authors":["Alvaro Patricio","Joao Valente","Atabak Dehban","Ines Cadilha","Daniel Reis","Rodrigo Ventura"],"pdf_url":"https://arxiv.org/pdf/2409.18101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18099v1","updated":"2024-09-26T17:44:20Z","published":"2024-09-26T17:44:20Z","title":"EfficientCrackNet: A Lightweight Model for Crack Segmentation","summary":" Crack detection, particularly from pavement images, presents a formidable\nchallenge in the domain of computer vision due to several inherent complexities\nsuch as intensity inhomogeneity, intricate topologies, low contrast, and noisy\nbackgrounds. Automated crack detection is crucial for maintaining the\nstructural integrity of essential infrastructures, including buildings,\npavements, and bridges. Existing lightweight methods often face challenges\nincluding computational inefficiency, complex crack patterns, and difficult\nbackgrounds, leading to inaccurate detection and impracticality for real-world\napplications. To address these limitations, we propose EfficientCrackNet, a\nlightweight hybrid model combining Convolutional Neural Networks (CNNs) and\ntransformers for precise crack segmentation. EfficientCrackNet integrates\ndepthwise separable convolutions (DSC) layers and MobileViT block to capture\nboth global and local features. The model employs an Edge Extraction Method\n(EEM) and for efficient crack edge detection without pretraining, and\nUltra-Lightweight Subspace Attention Module (ULSAM) to enhance feature\nextraction. Extensive experiments on three benchmark datasets Crack500,\nDeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior\nperformance compared to existing lightweight models, while requiring only 0.26M\nparameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance\nbetween accuracy and computational efficiency, outperforming state-of-the-art\nlightweight models, and providing a robust and adaptable solution for\nreal-world crack segmentation.\n","authors":["Abid Hasan Zim","Aquib Iqbal","Zaid Al-Huda","Asad Malik","Minoru Kuribayash"],"pdf_url":"https://arxiv.org/pdf/2409.18099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12822v3","updated":"2024-09-26T17:39:44Z","published":"2024-06-18T17:43:47Z","title":"Is It Good Data for Multilingual Instruction Tuning or Just Bad\n Multilingual Evaluation for Large Language Models?","summary":" Multilingual large language models are designed, claimed, and expected to\ncater to speakers of varied languages. We hypothesise that the current\npractices of fine-tuning and evaluating these models may not perfectly align\nwith this objective owing to a heavy reliance on translation, which cannot\ncover language-specific knowledge but can introduce translation defects. It\nremains unknown whether the nature of the instruction data has an impact on the\nmodel output; conversely, it is questionable whether translated test sets can\ncapture such nuances. Due to the often coupled practices of using translated\ndata in both stages, such imperfections could have been overlooked. This work\ninvestigates these issues using controlled native or translated data during the\ninstruction tuning and evaluation stages. We show that native or generation\nbenchmarks reveal a notable difference between native and translated\ninstruction data especially when model performance is high, whereas other types\nof test sets cannot. The comparison between round-trip and single-pass\ntranslations reflects the importance of knowledge from language-native\nresources. Finally, we demonstrate that regularization is beneficial to\nbridging this gap on structured but not generative tasks.\n","authors":["Pinzhen Chen","Simon Yu","Zhicheng Guo","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2406.12822v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18092v1","updated":"2024-09-26T17:39:05Z","published":"2024-09-26T17:39:05Z","title":"DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion\n Probabilistic Models","summary":" Perception systems play a crucial role in autonomous driving, incorporating\nmultiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors\nare widely used to capture sparse point clouds of the vehicle's surroundings.\nHowever, such systems struggle to perceive occluded areas and gaps in the scene\ndue to the sparsity of these point clouds and their lack of semantics. To\naddress these challenges, Semantic Scene Completion (SSC) jointly predicts\nunobserved geometry and semantics in the scene given raw LiDAR measurements,\naiming for a more complete scene representation. Building on promising results\nof diffusion models in image generation and super-resolution tasks, we propose\ntheir extension to SSC by implementing the noising and denoising diffusion\nprocesses in the point and semantic spaces individually. To control the\ngeneration, we employ semantic LiDAR point clouds as conditional input and\ndesign local and global regularization losses to stabilize the denoising\nprocess. We evaluate our approach on autonomous driving datasets and our\napproach outperforms the state-of-the-art for SSC.\n","authors":["Helin Cao","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2409.18092v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.16898v2","updated":"2024-09-26T17:38:14Z","published":"2024-09-25T13:08:10Z","title":"AI-driven View Guidance System in Intra-cardiac Echocardiography Imaging","summary":" Intra-cardiac Echocardiography (ICE) is a crucial imaging modality used in\nelectrophysiology (EP) and structural heart disease (SHD) interventions,\nproviding real-time, high-resolution views from within the heart. Despite its\nadvantages, effective manipulation of the ICE catheter requires significant\nexpertise, which can lead to inconsistent outcomes, particularly among less\nexperienced operators. To address this challenge, we propose an AI-driven\nclosed-loop view guidance system with human-in-the-loop feedback, designed to\nassist users in navigating ICE imaging without requiring specialized knowledge.\nOur method models the relative position and orientation vectors between\narbitrary views and clinically defined ICE views in a spatial coordinate\nsystem, guiding users on how to manipulate the ICE catheter to transition from\nthe current view to the desired view over time. Operating in a closed-loop\nconfiguration, the system continuously predicts and updates the necessary\ncatheter manipulations, ensuring seamless integration into existing clinical\nworkflows. The effectiveness of the proposed system is demonstrated through a\nsimulation-based evaluation, achieving an 89% success rate with the 6532 test\ndataset, highlighting its potential to improve the accuracy and efficiency of\nICE imaging procedures.\n","authors":["Jaeyoung Huh","Paul Klein","Gareth Funka-Lea","Puneet Sharma","Ankur Kapoor","Young-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2409.16898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18084v1","updated":"2024-09-26T17:27:15Z","published":"2024-09-26T17:27:15Z","title":"GSON: A Group-based Social Navigation Framework with Large Multimodal\n Model","summary":" As the number of service robots and autonomous vehicles in human-centered\nenvironments grows, their requirements go beyond simply navigating to a\ndestination. They must also take into account dynamic social contexts and\nensure respect and comfort for others in shared spaces, which poses significant\nchallenges for perception and planning. In this paper, we present a group-based\nsocial navigation framework GSON to enable mobile robots to perceive and\nexploit the social group of their surroundings by leveling the visual reasoning\ncapability of the Large Multimodal Model (LMM). For perception, we apply visual\nprompting techniques to zero-shot extract the social relationship among\npedestrians and combine the result with a robust pedestrian detection and\ntracking pipeline to alleviate the problem of low inference speed of the LMM.\nGiven the perception result, the planning system is designed to avoid\ndisrupting the current social structure. We adopt a social structure-based\nmid-level planner as a bridge between global path planning and local motion\nplanning to preserve the global context and reactive response. The proposed\nmethod is validated on real-world mobile robot navigation tasks involving\ncomplex social structure understanding and reasoning. Experimental results\ndemonstrate the effectiveness of the system in these scenarios compared with\nseveral baselines.\n","authors":["Shangyi Luo","Ji Zhu","Peng Sun","Yuhong Deng","Cunjun Yu","Anxing Xiao","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18082v1","updated":"2024-09-26T17:26:16Z","published":"2024-09-26T17:26:16Z","title":"SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language\n Models for Robotic Garment Manipulation","summary":" Automating garment manipulation poses a significant challenge for assistive\nrobotics due to the diverse and deformable nature of garments. Traditional\napproaches typically require separate models for each garment type, which\nlimits scalability and adaptability. In contrast, this paper presents a unified\napproach using vision-language models (VLMs) to improve keypoint prediction\nacross various garment categories. By interpreting both visual and semantic\ninformation, our model enables robots to manage different garment states with a\nsingle model. We created a large-scale synthetic dataset using advanced\nsimulation techniques, allowing scalable training without extensive real-world\ndata. Experimental results indicate that the VLM-based method significantly\nenhances keypoint detection accuracy and task success rates, providing a more\nflexible and general solution for robotic garment manipulation. In addition,\nthis research also underscores the potential of VLMs to unify various garment\nmanipulation tasks within a single framework, paving the way for broader\napplications in home automation and assistive robotics for future.\n","authors":["Xin Li","Siyuan Huang","Qiaojun Yu","Zhengkai Jiang","Ce Hao","Yimeng Zhu","Hongsheng Li","Peng Gao","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18071v1","updated":"2024-09-26T17:18:39Z","published":"2024-09-26T17:18:39Z","title":"FreeEdit: Mask-free Reference-based Image Editing with Multi-modal\n Instruction","summary":" Introducing user-specified visual concepts in image editing is highly\npractical as these concepts convey the user's intent more precisely than\ntext-based descriptions. We propose FreeEdit, a novel approach for achieving\nsuch reference-based image editing, which can accurately reproduce the visual\nconcept from the reference image based on user-friendly language instructions.\nOur approach leverages the multi-modal instruction encoder to encode language\ninstructions to guide the editing process. This implicit way of locating the\nediting area eliminates the need for manual editing masks. To enhance the\nreconstruction of reference details, we introduce the Decoupled Residual\nReferAttention (DRRA) module. This module is designed to integrate fine-grained\nreference features extracted by a detail extractor into the image editing\nprocess in a residual way without interfering with the original self-attention.\nGiven that existing datasets are unsuitable for reference-based image editing\ntasks, particularly due to the difficulty in constructing image triplets that\ninclude a reference image, we curate a high-quality dataset, FreeBench, using a\nnewly developed twice-repainting scheme. FreeBench comprises the images before\nand after editing, detailed editing instructions, as well as a reference image\nthat maintains the identity of the edited object, encompassing tasks such as\nobject addition, replacement, and deletion. By conducting phased training on\nFreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot\nediting through convenient language instructions. We conduct extensive\nexperiments to evaluate the effectiveness of FreeEdit across multiple task\ntypes, demonstrating its superiority over existing methods. The code will be\navailable at: https://freeedit.github.io/.\n","authors":["Runze He","Kai Ma","Linjiang Huang","Shaofei Huang","Jialin Gao","Xiaoming Wei","Jiao Dai","Jizhong Han","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18071v1.pdf","comment":"14 pages, 14 figures, project website: https://freeedit.github.io/"},{"id":"http://arxiv.org/abs/2310.06114v3","updated":"2024-09-26T17:14:09Z","published":"2023-10-09T19:42:22Z","title":"Learning Interactive Real-World Simulators","summary":" Generative models trained on internet data have revolutionized how text,\nimage, and video content can be created. Perhaps the next milestone for\ngenerative models is to simulate realistic experience in response to actions\ntaken by humans, robots, and other interactive agents. Applications of a\nreal-world simulator range from controllable content creation in games and\nmovies, to training embodied agents purely in simulation that can be directly\ndeployed in the real world. We explore the possibility of learning a universal\nsimulator (UniSim) of real-world interaction through generative modeling. We\nfirst make the important observation that natural datasets available for\nlearning a real-world simulator are often rich along different dimensions\n(e.g., abundant objects in image data, densely sampled actions in robotics\ndata, and diverse movements in navigation data). With careful orchestration of\ndiverse datasets, each providing a different aspect of the overall experience,\nwe can simulate the visual outcome of both high-level instructions such as\n\"open the drawer\" and low-level controls from otherwise static scenes and\nobjects. We use the simulator to train both high-level vision-language policies\nand low-level reinforcement learning policies, each of which can be deployed in\nthe real world in zero shot after training purely in simulation. We also show\nthat other types of intelligence such as video captioning models can benefit\nfrom training with simulated experience, opening up even wider applications.\nVideo demos can be found at https://universal-simulator.github.io.\n","authors":["Sherry Yang","Yilun Du","Kamyar Ghasemipour","Jonathan Tompson","Leslie Kaelbling","Dale Schuurmans","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.06114v3.pdf","comment":"https://universal-simulator.github.io"},{"id":"http://arxiv.org/abs/2409.18055v1","updated":"2024-09-26T16:59:01Z","published":"2024-09-26T16:59:01Z","title":"Visual Data Diagnosis and Debiasing with Concept Graphs","summary":" The widespread success of deep learning models today is owed to the curation\nof extensive datasets significant in size and complexity. However, such models\nfrequently pick up inherent biases in the data during the training process,\nleading to unreliable predictions. Diagnosing and debiasing datasets is thus a\nnecessity to ensure reliable model performance. In this paper, we present\nCONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence\nBiases in visual datasets. CONBIAS represents visual datasets as knowledge\ngraphs of concepts, enabling meticulous analysis of spurious concept\nco-occurrences to uncover concept imbalances across the whole dataset.\nMoreover, we show that by employing a novel clique-based concept balancing\nstrategy, we can mitigate these imbalances, leading to enhanced performance on\ndownstream tasks. Extensive experiments show that data augmentation based on a\nbalanced concept distribution augmented by CONBIAS improves generalization\nperformance across multiple datasets compared to state-of-the-art methods. We\nwill make our code and data publicly available.\n","authors":["Rwiddhi Chakraborty","Yinong Wang","Jialu Gao","Runkai Zheng","Cheng Zhang","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2409.18055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18053v1","updated":"2024-09-26T16:58:04Z","published":"2024-09-26T16:58:04Z","title":"DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving","summary":" We present a novel autonomous driving framework, DualAD, designed to imitate\nhuman reasoning during driving. DualAD comprises two layers: a rule-based\nmotion planner at the bottom layer that handles routine driving tasks requiring\nminimal reasoning, and an upper layer featuring a rule-based text encoder that\nconverts driving scenarios from absolute states into text description. This\ntext is then processed by a large language model (LLM) to make driving\ndecisions. The upper layer intervenes in the bottom layer's decisions when\npotential danger is detected, mimicking human reasoning in critical situations.\nClosed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained\nmodel, significantly outperforms rule-based motion planners that lack reasoning\nabilities. Our experiments also highlight the effectiveness of the text\nencoder, which considerably enhances the model's scenario understanding.\nAdditionally, the integrated DualAD model improves with stronger LLMs,\nindicating the framework's potential for further enhancement. We make code and\nbenchmarks publicly available.\n","authors":["Dingrui Wang","Marc Kaufeld","Johannes Betz"],"pdf_url":"https://arxiv.org/pdf/2409.18053v1.pdf","comment":"Autonomous Driving, Large Language Models (LLMs), Human Reasoning,\n Critical Scenario"},{"id":"http://arxiv.org/abs/2409.18052v1","updated":"2024-09-26T16:55:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems - which account for almost all current\nAI - can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborates on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18047v1","updated":"2024-09-26T16:48:21Z","published":"2024-09-26T16:48:21Z","title":"HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams","summary":" This paper presents a novel approach to multi-robot planning and\ncollaboration. We demonstrate a cognitive strategy for robots in human-robot\nteams that incorporates metacognition, natural language communication, and\nexplainability. The system is embodied using the HARMONIC architecture that\nflexibly integrates cognitive and control capabilities across the team. We\nevaluate our approach through simulation experiments involving a joint search\ntask by a team of heterogeneous robots (a UGV and a drone) and a human. We\ndetail the system's handling of complex, real-world scenarios, effective action\ncoordination between robots with different capabilities, and natural\nhuman-robot communication. This work demonstrates that the robots' ability to\nreason about plans, goals, and attitudes, and to provide explanations for\nactions and decisions are essential prerequisites for realistic human-robot\nteaming.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18047v1.pdf","comment":"Submitted to ICRA 2025 Conference, Atlanta, GA, USA"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18037v1","updated":"2024-09-26T16:42:13Z","published":"2024-09-26T16:42:13Z","title":"HARMONIC: A Framework for Explanatory Cognitive Robots","summary":" We present HARMONIC, a framework for implementing cognitive robots that\ntransforms general-purpose robots into trusted teammates capable of complex\ndecision-making, natural communication and human-level explanation. The\nframework supports interoperability between a strategic (cognitive) layer for\nhigh-level decision-making and a tactical (robot) layer for low-level control\nand execution. We describe the core features of the framework and our initial\nimplementation, in which HARMONIC was deployed on a simulated UGV and drone\ninvolved in a multi-robot search and retrieval task.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18037v1.pdf","comment":"Accepted for presentation at ICRA@40. 23-26 September 2024,\n Rotterdam, Netherlands"},{"id":"http://arxiv.org/abs/2409.16626v2","updated":"2024-09-26T16:41:27Z","published":"2024-09-25T05:11:58Z","title":"Ascend HiFloat8 Format for Deep Learning","summary":" This preliminary white paper proposes a novel 8-bit floating-point data\nformat HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered\nprecision. For normal value encoding, it provides 7 exponent values with 3-bit\nmantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with\n1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7\nextra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades).\nMeanwhile, HiF8 encodes all the special values except that positive zero and\nnegative zero are represented by only one bit-pattern. Thanks to the better\nbalance between precision and dynamic range, HiF8 can be simultaneously used in\nboth forward and backward passes of AI training. In this paper, we will\ndescribe the definition and rounding methods of HiF8, as well as the tentative\ntraining and inference solutions. To demonstrate the efficacy of HiF8, massive\nsimulation results on various neural networks, including traditional neural\nnetworks and large language models (LLMs), will also be presented.\n","authors":["Yuanyong Luo","Zhongxing Zhang","Richard Wu","Hu Liu","Ying Jin","Kai Zheng","Minmin Wang","Zhanying He","Guipeng Hu","Luyao Chen","Tianchi Hu","Junsong Wang","Minqi Chen","Mikhaylov Dmitry","Korviakov Vladimir","Bobrin Maxim","Yuhao Hu","Guanfu Chen","Zeyi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.16626v2.pdf","comment":"13 Pages, 4 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2409.13731v3","updated":"2024-09-26T16:34:35Z","published":"2024-09-10T02:00:28Z","title":"KAG: Boosting LLMs in Professional Domains via Knowledge Augmented\n Generation","summary":" The recently developed retrieval-augmented generation (RAG) technology has\nenabled the efficient construction of domain-specific applications. However, it\nalso has limitations, including the gap between vector similarity and the\nrelevance of knowledge reasoning, as well as insensitivity to knowledge logic,\nsuch as numerical values, temporal relations, expert rules, and others, which\nhinder the effectiveness of professional knowledge services. In this work, we\nintroduce a professional domain knowledge service framework called Knowledge\nAugmented Generation (KAG). KAG is designed to address the aforementioned\nchallenges with the motivation of making full use of the advantages of\nknowledge graph(KG) and vector retrieval, and to improve generation and\nreasoning performance by bidirectionally enhancing large language models (LLMs)\nand KGs through five key aspects: (1) LLM-friendly knowledge representation,\n(2) mutual-indexing between knowledge graphs and original chunks, (3)\nlogical-form-guided hybrid reasoning engine, (4) knowledge alignment with\nsemantic reasoning, and (5) model capability enhancement for KAG. We compared\nKAG with existing RAG methods in multihop question answering and found that it\nsignificantly outperforms state-of-theart methods, achieving a relative\nimprovement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We\nhave successfully applied KAG to two professional knowledge Q&A tasks of Ant\nGroup, including E-Government Q&A and E-Health Q&A, achieving significant\nimprovement in professionalism compared to RAG methods.\n","authors":["Lei Liang","Mengshu Sun","Zhengke Gui","Zhongshu Zhu","Zhouyu Jiang","Ling Zhong","Yuan Qu","Peilong Zhao","Zhongpu Bo","Jin Yang","Huaidong Xiong","Lin Yuan","Jun Xu","Zaoyang Wang","Zhiqiang Zhang","Wen Zhang","Huajun Chen","Wenguang Chen","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.13731v3.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2409.18028v1","updated":"2024-09-26T16:34:35Z","published":"2024-09-26T16:34:35Z","title":"Compositional Hardness of Code in Large Language Models -- A\n Probabilistic Perspective","summary":" A common practice in large language model (LLM) usage for complex analytical\ntasks such as code generation, is to sample a solution for the entire task\nwithin the model's context window. Previous works have shown that subtask\ndecomposition within the model's context (chain of thought), is beneficial for\nsolving such tasks. In this work, we point a limitation of LLMs' ability to\nperform several sub-tasks within the same context window - an in-context\nhardness of composition, pointing to an advantage for distributing a decomposed\nproblem in a multi-agent system of LLMs. The hardness of composition is\nquantified by a generation complexity metric, i.e., the number of LLM\ngenerations required to sample at least one correct solution. We find a gap\nbetween the generation complexity of solving a compositional problem within the\nsame context relative to distributing it among multiple agents, that increases\nexponentially with the solution's length. We prove our results theoretically\nand demonstrate them empirically.\n","authors":["Yotam Wolf","Binyamin Rothberg","Dorin Shteyman","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2409.18028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18017v1","updated":"2024-09-26T16:25:48Z","published":"2024-09-26T16:25:48Z","title":"Transferring disentangled representations: bridging the gap between\n synthetic and real images","summary":" Developing meaningful and efficient representations that separate the\nfundamental structure of the data generation mechanism is crucial in\nrepresentation learning. However, Disentangled Representation Learning has not\nfully shown its potential on real images, because of correlated generative\nfactors, their resolution and limited access to ground truth labels.\nSpecifically on the latter, we investigate the possibility of leveraging\nsynthetic data to learn general-purpose disentangled representations applicable\nto real data, discussing the effect of fine-tuning and what properties of\ndisentanglement are preserved after the transfer. We provide an extensive\nempirical study to address these issues. In addition, we propose a new\ninterpretable intervention-based metric, to measure the quality of factors\nencoding in the representation. Our results indicate that some level of\ndisentanglement, transferring a representation from synthetic to real data, is\npossible and effective.\n","authors":["Jacopo Dapueto","Nicoletta Noceti","Francesca Odone"],"pdf_url":"https://arxiv.org/pdf/2409.18017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18014v1","updated":"2024-09-26T16:22:59Z","published":"2024-09-26T16:22:59Z","title":"Role-RL: Online Long-Context Processing with Role Reinforcement Learning\n for Distinct LLMs in Their Optimal Roles","summary":" Large language models (LLMs) with long-context processing are still\nchallenging because of their implementation complexity, training efficiency and\ndata sparsity. To address this issue, a new paradigm named Online Long-context\nProcessing (OLP) is proposed when we process a document of unlimited length,\nwhich typically occurs in the information reception and organization of diverse\nstreaming media such as automated news reporting, live e-commerce, and viral\nshort videos. Moreover, a dilemma was often encountered when we tried to select\nthe most suitable LLM from a large number of LLMs amidst explosive growth\naiming for outstanding performance, affordable prices, and short response\ndelays. In view of this, we also develop Role Reinforcement Learning (Role-RL)\nto automatically deploy different LLMs in their respective roles within the OLP\npipeline according to their actual performance. Extensive experiments are\nconducted on our OLP-MINI dataset and it is found that OLP with Role-RL\nframework achieves OLP benchmark with an average recall rate of 93.2% and the\nLLM cost saved by 79.4%. The code and dataset are publicly available at:\nhttps://anonymous.4open.science/r/Role-RL.\n","authors":["Lewei He","Tianyu Shi","Pengran Huang","Bingzhi Chen","Qianglong Chen","Jiahui Pan"],"pdf_url":"https://arxiv.org/pdf/2409.18014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18009v1","updated":"2024-09-26T16:19:37Z","published":"2024-09-26T16:19:37Z","title":"Control Industrial Automation System with Large Language Models","summary":" Traditional industrial automation systems require specialized expertise to\noperate and complex reprogramming to adapt to new processes. Large language\nmodels offer the intelligence to make them more flexible and easier to use.\nHowever, LLMs' application in industrial settings is underexplored. This paper\nintroduces a framework for integrating LLMs to achieve end-to-end control of\nindustrial automation systems. At the core of the framework are an agent system\ndesigned for industrial tasks, a structured prompting method, and an\nevent-driven information modeling mechanism that provides real-time data for\nLLM inference. The framework supplies LLMs with real-time events on different\ncontext semantic levels, allowing them to interpret the information, generate\nproduction plans, and control operations on the automation system. It also\nsupports structured dataset creation for fine-tuning on this downstream\napplication of LLMs. Our contribution includes a formal system design,\nproof-of-concept implementation, and a method for generating task-specific\ndatasets for LLM fine-tuning and testing. This approach enables a more adaptive\nautomation system that can respond to spontaneous events, while allowing easier\noperation and configuration through natural language for more intuitive\nhuman-machine interaction. We provide demo videos and detailed data on GitHub:\nhttps://github.com/YuchenXia/LLM4IAS\n","authors":["Yuchen Xia","Nasser Jazdi","Jize Zhang","Chaitanya Shah","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2409.18009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17995v1","updated":"2024-09-26T16:07:20Z","published":"2024-09-26T16:07:20Z","title":"Joint Localization and Planning using Diffusion","summary":" Diffusion models have been successfully applied to robotics problems such as\nmanipulation and vehicle path planning. In this work, we explore their\napplication to end-to-end navigation -- including both perception and planning\n-- by considering the problem of jointly performing global localization and\npath planning in known but arbitrary 2D environments. In particular, we\nintroduce a diffusion model which produces collision-free paths in a global\nreference frame given an egocentric LIDAR scan, an arbitrary map, and a desired\ngoal position. To this end, we implement diffusion in the space of paths in\nSE(2), and describe how to condition the denoising process on both obstacles\nand sensor observations. In our evaluation, we show that the proposed\nconditioning techniques enable generalization to realistic maps of considerably\ndifferent appearance than the training environment, demonstrate our model's\nability to accurately describe ambiguous solutions, and run extensive\nsimulation experiments showcasing our model's use as a real-time, end-to-end\nlocalization and planning stack.\n","authors":["L. Lao Beyer","S. Karaman"],"pdf_url":"https://arxiv.org/pdf/2409.17995v1.pdf","comment":"7 pages, 9 figures. Submitted to ICRA 2025, under review"},{"id":"http://arxiv.org/abs/2409.17994v1","updated":"2024-09-26T16:06:38Z","published":"2024-09-26T16:06:38Z","title":"CRoP: Context-wise Robust Static Human-Sensing Personalization","summary":" The advancement in deep learning and internet-of-things have led to diverse\nhuman sensing applications. However, distinct patterns in human sensing,\ninfluenced by various factors or contexts, challenge generic neural network\nmodel's performance due to natural distribution shifts. To address this,\npersonalization tailors models to individual users. Yet most personalization\nstudies overlook intra-user heterogeneity across contexts in sensory data,\nlimiting intra-user generalizability. This limitation is especially critical in\nclinical applications, where limited data availability hampers both\ngeneralizability and personalization. Notably, intra-user sensing attributes\nare expected to change due to external factors such as treatment progression,\nfurther complicating the challenges.This work introduces CRoP, a novel static\npersonalization approach using an off-the-shelf pre-trained model and pruning\nto optimize personalization and generalization. CRoP shows superior\npersonalization effectiveness and intra-user robustness across four\nhuman-sensing datasets, including two from real-world health domains,\nhighlighting its practical and social impact. Additionally, to support CRoP's\ngeneralization ability and design choices, we provide empirical justification\nthrough gradient inner product analysis, ablation studies, and comparisons\nagainst state-of-the-art baselines.\n","authors":["Sawinder Kaur","Avery Gump","Jingyu Xin","Yi Xiao","Harshit Sharma","Nina R Benway","Jonathan L Preston","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2409.17994v1.pdf","comment":"31 pages, 10 figues and 13 tables"},{"id":"http://arxiv.org/abs/2409.16427v2","updated":"2024-09-26T15:56:08Z","published":"2024-09-24T19:47:21Z","title":"HAICOSYSTEM: An Ecosystem for Sandboxing Safety Risks in Human-AI\n Interactions","summary":" AI agents are increasingly autonomous in their interactions with human users\nand tools, leading to increased interactional safety risks. We present\nHAICOSYSTEM, a framework examining AI agent safety within diverse and complex\nsocial interactions. HAICOSYSTEM features a modular sandbox environment that\nsimulates multi-turn interactions between human users and AI agents, where the\nAI agents are equipped with a variety of tools (e.g., patient management\nplatforms) to navigate diverse scenarios (e.g., a user attempting to access\nother patients' profiles). To examine the safety of AI agents in these\ninteractions, we develop a comprehensive multi-dimensional evaluation framework\nthat uses metrics covering operational, content-related, societal, and legal\nrisks. Through running 1840 simulations based on 92 scenarios across seven\ndomains (e.g., healthcare, finance, education), we demonstrate that HAICOSYSTEM\ncan emulate realistic user-AI interactions and complex tool use by AI agents.\nOur experiments show that state-of-the-art LLMs, both proprietary and\nopen-sourced, exhibit safety risks in over 50\\% cases, with models generally\nshowing higher risks when interacting with simulated malicious users. Our\nfindings highlight the ongoing challenge of building agents that can safely\nnavigate complex interactions, particularly when faced with malicious users. To\nfoster the AI agent safety ecosystem, we release a code platform that allows\npractitioners to create custom scenarios, simulate interactions, and evaluate\nthe safety and performance of their agents.\n","authors":["Xuhui Zhou","Hyunwoo Kim","Faeze Brahman","Liwei Jiang","Hao Zhu","Ximing Lu","Frank Xu","Bill Yuchen Lin","Yejin Choi","Niloofar Mireshghallah","Ronan Le Bras","Maarten Sap"],"pdf_url":"https://arxiv.org/pdf/2409.16427v2.pdf","comment":"Both the second and third authors contributed equally"},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14950v2","updated":"2024-09-26T15:45:13Z","published":"2023-12-08T15:57:18Z","title":"TypeFly: Flying Drones with Large Language Model","summary":" Recent advancements in robot control using large language models (LLMs) have\ndemonstrated significant potential, primarily due to LLMs' capabilities to\nunderstand natural language commands and generate executable plans in various\nlanguages. However, in real-time and interactive applications involving mobile\nrobots, particularly drones, the sequential token generation process inherent\nto LLMs introduces substantial latency, i.e. response time, in control plan\ngeneration.\n In this paper, we present a system called ChatFly that tackles this problem\nusing a combination of a novel programming language called MiniSpec and its\nruntime to reduce the plan generation time and drone response time. That is,\ninstead of asking an LLM to write a program (robotic plan) in the popular but\nverbose Python, ChatFly gets it to do it in MiniSpec specially designed for\ntoken efficiency and stream interpretation. Using a set of challenging drone\ntasks, we show that design choices made by ChatFly can reduce up to 62%\nresponse time and provide a more consistent user experience, enabling\nresponsive and intelligent LLM-based drone control with efficient completion.\n","authors":["Guojun Chen","Xiaojing Yu","Neiwen Ling","Lin Zhong"],"pdf_url":"https://arxiv.org/pdf/2312.14950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17954v1","updated":"2024-09-26T15:30:54Z","published":"2024-09-26T15:30:54Z","title":"Enhancing elusive clues in knowledge learning by contrasting attention\n of language models","summary":" Causal language models acquire vast amount of knowledge from general text\ncorpus during pretraining, but the efficiency of knowledge learning is known to\nbe unsatisfactory, especially when learning from knowledge-dense and\nsmall-sized corpora. The deficiency can come from long-distance dependencies\nwhich are hard to capture by language models, and overfitting to co-occurrence\npatterns and distracting clues in the training text. To address these issues,\nthe paper proposes a method to enhance knowledge learning during language model\npretraining, by enhancing elusive but important clues in text discovered by the\nlanguage model themselves. We found that larger language models pay more\nattention to non-obvious but important clues, which are often overlooked by\nsmaller language models. Therefore, we can identify these clues by contrasting\nthe attention weights of large and small language models. We use the identified\nclues as a guide to perform token-dropout data augmentation on the training\ntext, and observed a significant boost in both small and large models'\nperformance in fact memorization. This shows that the behavior contrast between\nmore and less-performant language models contains important clues for knowledge\nlearning, and it can be ``amplified\" for a straight-forward improvement in\nknowledge learning efficiency.\n","authors":["Jian Gao","Xiao Zhang","Ji Wu","Miao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17954v1.pdf","comment":"7 pages and 17 figures"},{"id":"http://arxiv.org/abs/2312.14115v4","updated":"2024-09-26T15:30:00Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Visual Question Answering for Autonomous Driving","summary":" We introduce LingoQA, a novel dataset and benchmark for visual question\nanswering in autonomous driving. The dataset contains 28K unique short video\nscenarios, and 419K annotations. Evaluating state-of-the-art vision-language\nmodels on our benchmark shows that their performance is below human\ncapabilities, with GPT-4V responding truthfully to 59.6% of the questions\ncompared to 96.6% for humans. For evaluation, we propose a truthfulness\nclassifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation\ncoefficient to human evaluations, surpassing existing techniques like METEOR,\nBLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run\nextensive ablation studies to understand its performance. We release our\ndataset and benchmark as an evaluation platform for vision-language models in\nautonomous driving.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v4.pdf","comment":"Accepted to ECCV 2024. Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14500v2","updated":"2024-09-26T15:26:43Z","published":"2024-09-22T15:53:19Z","title":"TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with\n Tabular Node Features","summary":" Tabular machine learning is an important field for industry and science. In\nthis field, table rows are usually treated as independent data samples, but\nadditional information about relations between them is sometimes available and\ncan be used to improve predictive performance. Such information can be\nnaturally modeled with a graph, thus tabular machine learning may benefit from\ngraph machine learning methods. However, graph machine learning models are\ntypically evaluated on datasets with homogeneous node features, which have\nlittle in common with heterogeneous mixtures of numerical and categorical\nfeatures present in tabular datasets. Thus, there is a critical difference\nbetween the data used in tabular and graph machine learning studies, which does\nnot allow one to understand how successfully graph models can be transferred to\ntabular data. To bridge this gap, we propose a new benchmark of diverse graphs\nwith heterogeneous tabular node features and realistic prediction tasks. We use\nthis benchmark to evaluate a vast set of models, including simple methods\npreviously overlooked in the literature. Our experiments show that graph neural\nnetworks (GNNs) can indeed often bring gains in predictive performance for\ntabular data, but standard tabular models also can be adapted to work with\ngraph data by using simple feature preprocessing, which sometimes enables them\nto compete with and even outperform GNNs. Based on our empirical study, we\nprovide insights for researchers and practitioners in both tabular and graph\nmachine learning fields.\n","authors":["Gleb Bazhenov","Oleg Platonov","Liudmila Prokhorenkova"],"pdf_url":"https://arxiv.org/pdf/2409.14500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17946v1","updated":"2024-09-26T15:20:37Z","published":"2024-09-26T15:20:37Z","title":"Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge\n Distillation","summary":" Despite being widely applied due to their exceptional capabilities, Large\nLanguage Models (LLMs) have been proven to be vulnerable to backdoor attacks.\nThese attacks introduce targeted vulnerabilities into LLMs by poisoning\ntraining samples and full-parameter fine-tuning. However, this kind of backdoor\nattack is limited since they require significant computational resources,\nespecially as the size of LLMs increases. Besides, parameter-efficient\nfine-tuning (PEFT) offers an alternative but the restricted parameter updating\nmay impede the alignment of triggers with target labels. In this study, we\nfirst verify that backdoor attacks with PEFT may encounter challenges in\nachieving feasible performance. To address these issues and improve the\neffectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack\nalgorithm from weak to strong based on contrastive knowledge distillation\n(W2SAttack). Specifically, we poison small-scale language models through\nfull-parameter fine-tuning to serve as the teacher model. The teacher model\nthen covertly transfers the backdoor to the large-scale student model through\ncontrastive knowledge distillation, which employs PEFT. Theoretical analysis\nreveals that W2SAttack has the potential to augment the effectiveness of\nbackdoor attacks. We demonstrate the superior performance of W2SAttack on\nclassification tasks across four language models, four backdoor attack\nalgorithms, and two different architectures of teacher models. Experimental\nresults indicate success rates close to 100% for backdoor attacks targeting\nPEFT.\n","authors":["Shuai Zhao","Leilei Gan","Zhongliang Guo","Xiaobao Wu","Luwei Xiao","Xiaoyu Xu","Cong-Duy Nguyen","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2409.17946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17931v1","updated":"2024-09-26T15:08:38Z","published":"2024-09-26T15:08:38Z","title":"Intelligent Energy Management: Remaining Useful Life Prediction and\n Charging Automation System Comprised of Deep Learning and the Internet of\n Things","summary":" Remaining Useful Life (RUL) of battery is an important parameter to know the\nbattery's remaining life and need for recharge. The goal of this research\nproject is to develop machine learning-based models for the battery RUL\ndataset. Different ML models are developed to classify the RUL of the vehicle,\nand the IoT (Internet of Things) concept is simulated for automating the\ncharging system and managing any faults aligning. The graphs plotted depict the\nrelationship between various vehicle parameters using the Blynk IoT platform.\nResults show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent\nUnit (GRU), and hybrid model developed could classify RUL into three classes\nwith 99% more accuracy. The data is fed using the tkinter GUI for simulating\nartificial intelligence (AI)-based charging, and with a pyserial backend, data\ncan be entered into the Esp-32 microcontroller for making charge discharge\npossible with the model's predictions. Also, with an IoT system, the charging\ncan be disconnected, monitored, and analyzed for automation. The results show\nthat an accuracy of 99% can be obtained on models MLP, catboost model and\nsimilar accuracy on GRU model can be obtained, and finally relay-based\ntriggering can be made by prediction through the model used for automating the\ncharging and energy-saving mechanism. By showcasing an exemplary Blynk\nplatform-based monitoring and automation phenomenon, we further present\ninnovative ways of monitoring parameters and automating the system.\n","authors":["Biplov Paneru","Bishwash Paneru","DP Sharma Mainali"],"pdf_url":"https://arxiv.org/pdf/2409.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17928v1","updated":"2024-09-26T15:07:30Z","published":"2024-09-26T15:07:30Z","title":"Pioneering Reliable Assessment in Text-to-Image Knowledge Editing:\n Leveraging a Fine-Grained Dataset and an Innovative Criterion","summary":" During pre-training, the Text-to-Image (T2I) diffusion models encode factual\nknowledge into their parameters. These parameterized facts enable realistic\nimage generation, but they may become obsolete over time, thereby\nmisrepresenting the current state of the world. Knowledge editing techniques\naim to update model knowledge in a targeted way. However, facing the dual\nchallenges posed by inadequate editing datasets and unreliable evaluation\ncriterion, the development of T2I knowledge editing encounter difficulties in\neffectively generalizing injected knowledge. In this work, we design a T2I\nknowledge editing framework by comprehensively spanning on three phases: First,\nwe curate a dataset \\textbf{CAKE}, comprising paraphrase and multi-object test,\nto enable more fine-grained assessment on knowledge generalization. Second, we\npropose a novel criterion, \\textbf{adaptive CLIP threshold}, to effectively\nfilter out false successful images under the current criterion and achieve\nreliable editing evaluation. Finally, we introduce \\textbf{MPE}, a simple but\neffective approach for T2I knowledge editing. Instead of tuning parameters, MPE\nprecisely recognizes and edits the outdated part of the conditioning\ntext-prompt to accommodate the up-to-date knowledge. A straightforward\nimplementation of MPE (Based on in-context learning) exhibits better overall\nperformance than previous model editors. We hope these efforts can further\npromote faithful evaluation of T2I knowledge editing methods.\n","authors":["Hengrui Gu","Kaixiong Zhou","Yili Wang","Ruobing Wang","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17928v1.pdf","comment":"EMNLP24 Findings"},{"id":"http://arxiv.org/abs/2409.17922v1","updated":"2024-09-26T15:05:15Z","published":"2024-09-26T15:05:15Z","title":"Navigation in a simplified Urban Flow through Deep Reinforcement\n Learning","summary":" The increasing number of unmanned aerial vehicles (UAVs) in urban\nenvironments requires a strategy to minimize their environmental impact, both\nin terms of energy efficiency and noise reduction. In order to reduce these\nconcerns, novel strategies for developing prediction models and optimization of\nflight planning, for instance through deep reinforcement learning (DRL), are\nneeded. Our goal is to develop DRL algorithms capable of enabling the\nautonomous navigation of UAVs in urban environments, taking into account the\npresence of buildings and other UAVs, optimizing the trajectories in order to\nreduce both energetic consumption and noise. This is achieved using fluid-flow\nsimulations which represent the environment in which UAVs navigate and training\nthe UAV as an agent interacting with an urban environment. In this work, we\nconsider a domain domain represented by a two-dimensional flow field with\nobstacles, ideally representing buildings, extracted from a three-dimensional\nhigh-fidelity numerical simulation. The presented methodology, using PPO+LSTM\ncells, was validated by reproducing a simple but fundamental problem in\nnavigation, namely the Zermelo's problem, which deals with a vessel navigating\nin a turbulent flow, travelling from a starting point to a target location,\noptimizing the trajectory. The current method shows a significant improvement\nwith respect to both a simple PPO and a TD3 algorithm, with a success rate (SR)\nof the PPO+LSTM trained policy of 98.7%, and a crash rate (CR) of 0.1%,\noutperforming both PPO (SR = 75.6%, CR=18.6%) and TD3 (SR=77.4% and CR=14.5%).\nThis is the first step towards DRL strategies which will guide UAVs in a\nthree-dimensional flow field using real-time signals, making the navigation\nefficient in terms of flight time and avoiding damages to the vehicle.\n","authors":["Federica Tonti","Jean Rabault","Ricardo Vinuesa"],"pdf_url":"https://arxiv.org/pdf/2409.17922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15228v3","updated":"2024-09-26T14:57:52Z","published":"2024-09-23T17:22:09Z","title":"A Comprehensive Framework for Evaluating API-oriented Code Generation in\n Large Language Models","summary":" Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as\npowerful tools for code generation, significantly enhancing productivity and\naccelerating software development. However, existing benchmarks primarily focus\non general code generation without considering API-oriented code generation,\ni.e., generating code that invokes APIs from specific libraries. Given the\ngrowing demand for API-oriented code generation, there is a pressing need for a\nsystematic and automated approach to evaluate LLM on API-oriented code\ngeneration. To address this gap, we propose AutoAPIEval, a lightweight and\nautomated framework designed to evaluate the capabilities of LLMs in\nAPI-oriented code generation. Our framework works with any library that\nprovides API documentation and focuses on two unit tasks: API recommendation\nand code example generation, along with four metrics to evaluate the generated\nAPIs and code examples, such as the proportion of incorrect API recommendations\nfor Task 1, and the proportion of code examples where no specific API is\ninvoked and uncompilable/unexecutable code examples for Task 2. In addition, we\nconducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder)\nand Java Runtime Environment 8 to demonstrate the framework's effectiveness.\nOur findings reveal substantial variability in LLM performance across tasks,\nwith ChatGPT adhering better to instructions, while sharing similar\neffectiveness in code example generation with its counterparts (i.e., MagiCoder\nand DeekSeek Coder). We also identify key factors associated with code quality,\nsuch as API popularity and model confidence, and build classifiers that achieve\nhigh accuracy in detecting incorrect API recommendations and erroneous code\nexamples. Retrieval-augmented generation enhances the quality of code generated\nby LLMs, though its effectiveness varies across different LLMs.\n","authors":["Yixi Wu","Pengfei He","Zehao Wang","Shaowei Wang","Yuan Tian","Tse-Hsun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.15228v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17907v1","updated":"2024-09-26T14:52:51Z","published":"2024-09-26T14:52:51Z","title":"PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR","summary":" LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous\ndriving, offering precise 3D spatial information. Previous signal attacks\nagainst LiDAR systems mainly exploit laser signals. In this paper, we\ninvestigate the possibility of cross-modality signal injection attacks, i.e.,\ninjecting intentional electromagnetic interference (IEMI) to manipulate LiDAR\noutput. Our insight is that the internal modules of a LiDAR, i.e., the laser\nreceiving circuit, the monitoring sensors, and the beam-steering modules, even\nwith strict electromagnetic compatibility (EMC) testing, can still couple with\nthe IEMI attack signals and result in the malfunction of LiDAR systems. Based\non the above attack surfaces, we propose the PhantomLiDAR attack, which\nmanipulates LiDAR output in terms of Points Interference, Points Injection,\nPoints Removal, and even LiDAR Power-Off. We evaluate and demonstrate the\neffectiveness of PhantomLiDAR with both simulated and real-world experiments on\nfive COTS LiDAR systems. We also conduct feasibility experiments in real-world\nmoving scenarios. We provide potential defense measures that can be implemented\nat both the sensor level and the vehicle system level to mitigate the risks\nassociated with IEMI attacks. Video demonstrations can be viewed at\nhttps://sites.google.com/view/phantomlidar.\n","authors":["Zizhi Jin","Qinhong Jiang","Xuancun Lu","Chen Yan","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17904v1","updated":"2024-09-26T14:51:40Z","published":"2024-09-26T14:51:40Z","title":"Learning to Love Edge Cases in Formative Math Assessment: Using the\n AMMORE Dataset and Chain-of-Thought Prompting to Improve Grading Accuracy","summary":" This paper introduces AMMORE, a new dataset of 53,000 math open-response\nquestion-answer pairs from Rori, a learning platform used by students in\nseveral African countries and conducts two experiments to evaluate the use of\nlarge language models (LLM) for grading particularly challenging student\nanswers. The AMMORE dataset enables various potential analyses and provides an\nimportant resource for researching student math acquisition in understudied,\nreal-world, educational contexts. In experiment 1 we use a variety of\nLLM-driven approaches, including zero-shot, few-shot, and chain-of-thought\nprompting, to grade the 1% of student answers that a rule-based classifier\nfails to grade accurately. We find that the best-performing approach --\nchain-of-thought prompting -- accurately scored 92% of these edge cases,\neffectively boosting the overall accuracy of the grading from 98.7% to 99.9%.\nIn experiment 2, we aim to better understand the consequential validity of the\nimproved grading accuracy, by passing grades generated by the best-performing\nLLM-based approach to a Bayesian Knowledge Tracing (BKT) model, which estimated\nstudent mastery of specific lessons. We find that relatively modest\nimprovements in model accuracy at the individual question level can lead to\nsignificant changes in the estimation of student mastery. Where the rules-based\nclassifier currently used to grade student, answers misclassified the mastery\nstatus of 6.9% of students across their completed lessons, using the LLM\nchain-of-thought approach this misclassification rate was reduced to 2.6% of\nstudents. Taken together, these findings suggest that LLMs could be a valuable\ntool for grading open-response questions in K-12 mathematics education,\npotentially enabling encouraging wider adoption of open-ended questions in\nformative assessment.\n","authors":["Owen Henkel","Hannah Horne-Robinson","Maria Dyshel","Nabil Ch","Baptiste Moreau-Pernet","Ralph Abood"],"pdf_url":"https://arxiv.org/pdf/2409.17904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09577v2","updated":"2024-09-26T14:34:53Z","published":"2024-04-15T08:38:43Z","title":"Transformers, Contextualism, and Polysemy","summary":" The transformer architecture, introduced by Vaswani et al. (2017), is at the\nheart of the remarkable recent progress in the development of language models,\nincluding widely-used chatbots such as Chat-GPT and Claude. In this paper, I\nargue that we can extract from the way the transformer architecture works a\ntheory of the relationship between context and meaning. I call this the\ntransformer theory, and I argue that it is novel with regard to two related\nphilosophical debates: the contextualism debate regarding the extent of\ncontext-sensitivity across natural language, and the polysemy debate regarding\nhow polysemy should be captured within an account of word meaning.\n","authors":["Jumbly Grindrod"],"pdf_url":"https://arxiv.org/pdf/2404.09577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10588v5","updated":"2024-09-26T14:34:06Z","published":"2024-09-16T14:56:27Z","title":"Opponent Shaping for Antibody Development","summary":" Anti-viral therapies are typically designed to target the current strains of\na virus. Game theoretically, this corresponds to a short-sighted, or myopic,\nresponse. However, therapy-induced selective pressures act on viral antigens to\ndrive the emergence of mutated strains, against which initial therapies have\nreduced efficacy. Building on a computational model of binding between\nantibodies and viral antigens (the Absolut! framework), we design and implement\na genetic simulation of such viral evolutionary escape. Crucially, this allows\nour antibody optimisation algorithm to consider and influence the entire escape\ncurve of the virus, i.e. to guide (or ''shape'') the viral evolution. This is\ninspired by opponent shaping which, in general-sum learning, accounts for the\nadaptation of the co-player rather than playing a myopic best response. Hence\nwe call the optimised antibodies shapers. Within our simulations, we\ndemonstrate that our shapers target both current and simulated future viral\nvariants, outperforming the antibodies chosen in a myopic way. Furthermore, we\nshow that shapers exert specific evolutionary pressure on the virus compared to\nmyopic antibodies. Altogether, shapers modify the evolutionary trajectories of\nviral strains and minimise the viral escape compared to their myopic\ncounterparts. While this is a simplified model, we hope that our proposed\nparadigm will enable the discovery of better long-lived vaccines and antibody\ntherapies in the future, enabled by rapid advancements in the capabilities of\nsimulation tools. Our code is available at\nhttps://github.com/olakalisz/antibody-shapers.\n","authors":["Sebastian Towers","Aleksandra Kalisz","Philippe A. Robert","Alicia Higueruelo","Francesca Vianello","Ming-Han Chloe Tsai","Harrison Steel","Jakob N. Foerster"],"pdf_url":"https://arxiv.org/pdf/2409.10588v5.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.17876v1","updated":"2024-09-26T14:23:44Z","published":"2024-09-26T14:23:44Z","title":"Why Companies \"Democratise\" Artificial Intelligence: The Case of Open\n Source Software Donations","summary":" Companies claim to \"democratise\" artificial intelligence (AI) when they\ndonate AI open source software (OSS) to non-profit foundations or release AI\nmodels, among others, but what does this term mean and why do they do it? As\nthe impact of AI on society and the economy grows, understanding the commercial\nincentives behind AI democratisation efforts is crucial for ensuring these\nefforts serve broader interests beyond commercial agendas. Towards this end,\nthis study employs a mixed-methods approach to investigate commercial\nincentives for 43 AI OSS donations to the Linux Foundation. It makes\ncontributions to both research and practice. It contributes a taxonomy of both\nindividual and organisational social, economic, and technological incentives\nfor AI democratisation. In particular, it highlights the role of democratising\nthe governance and control rights of an OSS project (i.e., from one company to\nopen governance) as a structural enabler for downstream goals, such as\nattracting external contributors, reducing development costs, and influencing\nindustry standards, among others. Furthermore, OSS donations are often\nchampioned by individual developers within companies, highlighting the\nimportance of the bottom-up incentives for AI democratisation. The taxonomy\nprovides a framework and toolkit for discerning incentives for other AI\ndemocratisation efforts, such as the release of AI models. The paper concludes\nwith a discussion of future research directions.\n","authors":["Cailean Osborne"],"pdf_url":"https://arxiv.org/pdf/2409.17876v1.pdf","comment":"30 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2310.01807v2","updated":"2024-09-26T14:21:10Z","published":"2023-10-03T05:40:56Z","title":"Discrete, compositional, and symbolic representations through attractor\n dynamics","summary":" Symbolic systems are powerful frameworks for modeling cognitive processes as\nthey encapsulate the rules and relationships fundamental to many aspects of\nhuman reasoning and behavior. Central to these models are systematicity,\ncompositionality, and productivity, making them invaluable in both cognitive\nscience and artificial intelligence. However, certain limitations remain. For\ninstance, the integration of structured symbolic processes and latent\nsub-symbolic processes has been implemented at the computational level through\nfiat methods such as quantization or softmax sampling, which assume, rather\nthan derive, the operations underpinning discretization and symbolicization. In\nthis work, we introduce a novel neural stochastic dynamical systems model that\nintegrates attractor dynamics with symbolic representations to model cognitive\nprocesses akin to the probabilistic language of thought (PLoT). Our model\nsegments the continuous representational space into discrete basins, with\nattractor states corresponding to symbolic sequences, that reflect the\nsemanticity and compositionality characteristic of symbolic systems through\nunsupervised learning, rather than relying on pre-defined primitives. Moreover,\nlike PLoT, our model learns to sample a diverse distribution of attractor\nstates that reflect the mutual information between the input data and the\nsymbolic encodings. This approach establishes a unified framework that\nintegrates both symbolic and sub-symbolic processing through neural dynamics, a\nneuro-plausible substrate with proven expressivity in AI, offering a more\ncomprehensive model that mirrors the complex duality of cognitive operations.\n","authors":["Andrew Nam","Eric Elmoznino","Nikolay Malkin","James McClelland","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2310.01807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17874v1","updated":"2024-09-26T14:20:14Z","published":"2024-09-26T14:20:14Z","title":"DarkSAM: Fooling Segment Anything Model to Segment Nothing","summary":" Segment Anything Model (SAM) has recently gained much attention for its\noutstanding generalization to unseen data and tasks. Despite its promising\nprospect, the vulnerabilities of SAM, especially to universal adversarial\nperturbation (UAP) have not been thoroughly investigated yet. In this paper, we\npropose DarkSAM, the first prompt-free universal attack framework against SAM,\nincluding a semantic decoupling-based spatial attack and a texture\ndistortion-based frequency attack. We first divide the output of SAM into\nforeground and background. Then, we design a shadow target strategy to obtain\nthe semantic blueprint of the image as the attack target. DarkSAM is dedicated\nto fooling SAM by extracting and destroying crucial object features from images\nin both spatial and frequency domains. In the spatial domain, we disrupt the\nsemantics of both the foreground and background in the image to confuse SAM. In\nthe frequency domain, we further enhance the attack effectiveness by distorting\nthe high-frequency components (i.e., texture information) of the image.\nConsequently, with a single UAP, DarkSAM renders SAM incapable of segmenting\nobjects across diverse images with varying prompts. Experimental results on\nfour datasets for SAM and its two variant models demonstrate the powerful\nattack capability and transferability of DarkSAM.\n","authors":["Ziqi Zhou","Yufei Song","Minghui Li","Shengshan Hu","Xianlong Wang","Leo Yu Zhang","Dezhong Yao","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2409.17874v1.pdf","comment":"This paper has been accepted by the 38th Annual Conference on Neural\n Information Processing Systems (NeurIPS'24)"},{"id":"http://arxiv.org/abs/2409.17870v1","updated":"2024-09-26T14:17:58Z","published":"2024-09-26T14:17:58Z","title":"Efficient Arbitrary Precision Acceleration for Large Language Models on\n GPU Tensor Cores","summary":" Large language models (LLMs) have been widely applied but face challenges in\nefficient inference. While quantization methods reduce computational demands,\nultra-low bit quantization with arbitrary precision is hindered by limited GPU\nTensor Core support and inefficient memory management, leading to suboptimal\nacceleration. To address these challenges, we propose a comprehensive\nacceleration scheme for arbitrary precision LLMs. At its core, we introduce a\nnovel bipolar-INT data format that facilitates parallel computing and supports\nsymmetric quantization, effectively reducing data redundancy. Building on this,\nwe implement an arbitrary precision matrix multiplication scheme that\ndecomposes and recovers matrices at the bit level, enabling flexible precision\nwhile maximizing GPU Tensor Core utilization. Furthermore, we develop an\nefficient matrix preprocessing method that optimizes data layout for subsequent\ncomputations. Finally, we design a data recovery-oriented memory management\nsystem that strategically utilizes fast shared memory, significantly enhancing\nkernel execution speed and minimizing memory access latency. Experimental\nresults demonstrate our approach's effectiveness, with up to 13\\times speedup\nin matrix multiplication compared to NVIDIA's CUTLASS. When integrated into\nLLMs, we achieve up to 6.7\\times inference acceleration. These improvements\nsignificantly enhance LLM inference efficiency, enabling broader and more\nresponsive applications of LLMs.\n","authors":["Shaobo Ma","Chao Fang","Haikuo Shao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05208v3","updated":"2024-09-26T14:16:01Z","published":"2023-10-08T15:49:36Z","title":"ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot\n Coordination","summary":" Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement\nlearning (MARL) challenge that aims to train an ego agent to work with diverse,\nunseen partners during deployment. The significant difference between the\ndeployment-time partners' distribution and the training partners' distribution\ndetermined by the training algorithm makes ZSC a unique out-of-distribution\n(OOD) generalization challenge. The potential distribution gap between\nevaluation and deployment-time partners leads to inadequate evaluation, which\nis exacerbated by the lack of appropriate evaluation metrics. In this paper, we\npresent ZSC-Eval, the first evaluation toolkit and benchmark for ZSC\nalgorithms. ZSC-Eval consists of: 1) Generation of evaluation partner\ncandidates through behavior-preferring rewards to approximate deployment-time\npartners' distribution; 2) Selection of evaluation partners by Best-Response\nDiversity (BR-Div); 3) Measurement of generalization performance with various\nevaluation partners via the Best-Response Proximity (BR-Prox) metric. We use\nZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football\nenvironments and get novel empirical findings. We also conduct a human\nexperiment of current ZSC algorithms to verify the ZSC-Eval's consistency with\nhuman evaluation. ZSC-Eval is now available at\nhttps://github.com/sjtu-marl/ZSC-Eval.\n","authors":["Xihuai Wang","Shao Zhang","Wenhao Zhang","Wentao Dong","Jingxiao Chen","Ying Wen","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.05208v3.pdf","comment":"Accepted in NeurIPS 2024 Dataset and Benchmark Track"},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17841v1","updated":"2024-09-26T13:45:36Z","published":"2024-09-26T13:45:36Z","title":"Machine Learning-based vs Deep Learning-based Anomaly Detection in\n Multivariate Time Series for Spacecraft Attitude Sensors","summary":" In the framework of Failure Detection, Isolation and Recovery (FDIR) on\nspacecraft, new AI-based approaches are emerging in the state of the art to\novercome the limitations commonly imposed by traditional threshold checking.\n The present research aims at characterizing two different approaches to the\nproblem of stuck values detection in multivariate time series coming from\nspacecraft attitude sensors. The analysis reveals the performance differences\nin the two approaches, while commenting on their interpretability and\ngeneralization to different scenarios.\n","authors":["R. Gallon","F. Schiemenz","A. Krstova","A. Menicucci","E. Gill"],"pdf_url":"https://arxiv.org/pdf/2409.17841v1.pdf","comment":"Accepted for the ESA SPAICE Conference 2024"},{"id":"http://arxiv.org/abs/2409.17840v1","updated":"2024-09-26T13:44:22Z","published":"2024-09-26T13:44:22Z","title":"Detecting and Measuring Confounding Using Causal Mechanism Shifts","summary":" Detecting and measuring confounding effects from data is a key challenge in\ncausal inference. Existing methods frequently assume causal sufficiency,\ndisregarding the presence of unobserved confounding variables. Causal\nsufficiency is both unrealistic and empirically untestable. Additionally,\nexisting methods make strong parametric assumptions about the underlying causal\ngenerative process to guarantee the identifiability of confounding variables.\nRelaxing the causal sufficiency and parametric assumptions and leveraging\nrecent advancements in causal discovery and confounding analysis with\nnon-i.i.d. data, we propose a comprehensive approach for detecting and\nmeasuring confounding. We consider various definitions of confounding and\nintroduce tailored methodologies to achieve three objectives: (i) detecting and\nmeasuring confounding among a set of variables, (ii) separating observed and\nunobserved confounding effects, and (iii) understanding the relative strengths\nof confounding bias between different sets of variables. We present useful\nproperties of a confounding measure and present measures that satisfy those\nproperties. Empirical results support the theoretical analysis.\n","authors":["Abbavaram Gowtham Reddy","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2409.17840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17836v1","updated":"2024-09-26T13:38:33Z","published":"2024-09-26T13:38:33Z","title":"Language Models as Zero-shot Lossless Gradient Compressors: Towards\n General Neural Parameter Prior Models","summary":" Despite the widespread use of statistical prior models in various fields,\nsuch models for neural network gradients have long been overlooked. The\ninherent challenge stems from their high-dimensional structures and complex\ninterdependencies, which complicate effective modeling. In this work, we\ndemonstrate the potential of large language models (LLMs) to act as gradient\npriors in a zero-shot setting. We examine the property by considering lossless\ngradient compression -- a critical application in distributed learning -- that\ndepends heavily on precise probability modeling. To achieve this, we introduce\nLM-GC, a novel method that integrates LLMs with arithmetic coding. Our\ntechnique converts plain gradients into text-like formats, enhancing token\nefficiency by up to 38 times compared to their plain representations. We ensure\nthat this data conversion maintains a close alignment with the structure of\nplain gradients and the symbols commonly recognized by LLMs. Our experiments\nindicate that LM-GC surpasses existing state-of-the-art lossless compression\nmethods, improving compression rates by 10\\% up to 17.2\\% across various\ndatasets and architectures. Additionally, our approach shows promising\ncompatibility with lossy compression techniques such as quantization and\nsparsification. These findings highlight the significant potential of LLMs as a\nmodel for effectively handling gradients. We will release the source code upon\npublication.\n","authors":["Hui-Po Wang","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2409.17836v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2305.01899v2","updated":"2024-09-26T13:34:35Z","published":"2023-05-03T05:16:54Z","title":"Empowering Agrifood System with Artificial Intelligence: A Survey of the\n Progress, Challenges and Opportunities","summary":" With the world population rapidly increasing, transforming our agrifood\nsystems to be more productive, efficient, safe, and sustainable is crucial to\nmitigate potential food shortages. Recently, artificial intelligence (AI)\ntechniques such as deep learning (DL) have demonstrated their strong abilities\nin various areas, including language, vision, remote sensing (RS), and agrifood\nsystems applications. However, the overall impact of AI on agrifood systems\nremains unclear. In this paper, we thoroughly review how AI techniques can\ntransform agrifood systems and contribute to the modern agrifood industry.\nFirstly, we summarize the data acquisition methods in agrifood systems,\nincluding acquisition, storage, and processing techniques. Secondly, we present\na progress review of AI methods in agrifood systems, specifically in\nagriculture, animal husbandry, and fishery, covering topics such as agrifood\nclassification, growth monitoring, yield prediction, and quality assessment.\nFurthermore, we highlight potential challenges and promising research\nopportunities for transforming modern agrifood systems with AI. We hope this\nsurvey could offer an overall picture to newcomers in the field and serve as a\nstarting point for their further research. The project website is\nhttps://github.com/Frenkie14/Agrifood-Survey.\n","authors":["Tao Chen","Liang Lv","Di Wang","Jing Zhang","Yue Yang","Zeyang Zhao","Chen Wang","Xiaowei Guo","Hao Chen","Qingye Wang","Yufei Xu","Qiming Zhang","Bo Du","Liangpei Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2305.01899v2.pdf","comment":"Accepted by ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2409.16934v2","updated":"2024-09-26T13:22:37Z","published":"2024-09-25T13:45:23Z","title":"Investigating OCR-Sensitive Neurons to Improve Entity Recognition in\n Historical Documents","summary":" This paper investigates the presence of OCR-sensitive neurons within the\nTransformer architecture and their influence on named entity recognition (NER)\nperformance on historical documents. By analysing neuron activation patterns in\nresponse to clean and noisy text inputs, we identify and then neutralise\nOCR-sensitive neurons to improve model performance. Based on two open access\nlarge language models (Llama2 and Mistral), experiments demonstrate the\nexistence of OCR-sensitive regions and show improvements in NER performance on\nhistorical newspapers and classical commentaries, highlighting the potential of\ntargeted neuron modulation to improve models' performance on noisy text.\n","authors":["Emanuela Boros","Maud Ehrmann"],"pdf_url":"https://arxiv.org/pdf/2409.16934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17819v1","updated":"2024-09-26T13:15:18Z","published":"2024-09-26T13:15:18Z","title":"Inference-Time Language Model Alignment via Integrated Value Guidance","summary":" Large language models are typically fine-tuned to align with human\npreferences, but tuning large models is computationally intensive and complex.\nIn this work, we introduce $\\textit{Integrated Value Guidance}$ (IVG), a method\nthat uses implicit and explicit value functions to guide language model\ndecoding at token and chunk-level respectively, efficiently aligning large\nlanguage models purely at inference time. This approach circumvents the\ncomplexities of direct fine-tuning and outperforms traditional methods.\nEmpirically, we demonstrate the versatility of IVG across various tasks. In\ncontrolled sentiment generation and summarization tasks, our method\nsignificantly improves the alignment of large models using inference-time\nguidance from $\\texttt{gpt2}$-based value functions. Moreover, in a more\nchallenging instruction-following benchmark AlpacaEval 2.0, we show that both\nspecifically tuned and off-the-shelf value functions greatly improve the\nlength-controlled win rates of large models against $\\texttt{gpt-4-turbo}$\n(e.g., $19.51\\% \\rightarrow 26.51\\%$ for $\\texttt{Mistral-7B-Instruct-v0.2}$\nand $25.58\\% \\rightarrow 33.75\\%$ for $\\texttt{Mixtral-8x7B-Instruct-v0.1}$\nwith Tulu guidance).\n","authors":["Zhixuan Liu","Zhanhui Zhou","Yuanfu Wang","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2409.17819v1.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17815v1","updated":"2024-09-26T13:12:13Z","published":"2024-09-26T13:12:13Z","title":"DREAMS: A python framework to train deep learning models with model card\n reporting for medical and health applications","summary":" Electroencephalography (EEG) data provides a non-invasive method for\nresearchers and clinicians to observe brain activity in real time. The\nintegration of deep learning techniques with EEG data has significantly\nimproved the ability to identify meaningful patterns, leading to valuable\ninsights for both clinical and research purposes. However, most of the\nframeworks so far, designed for EEG data analysis, are either too focused on\npre-processing or in deep learning methods per, making their use for both\nclinician and developer communities problematic. Moreover, critical issues such\nas ethical considerations, biases, uncertainties, and the limitations inherent\nin AI models for EEG data analysis are frequently overlooked, posing challenges\nto the responsible implementation of these technologies. In this paper, we\nintroduce a comprehensive deep learning framework tailored for EEG data\nprocessing, model training and report generation. While constructed in way to\nbe adapted and developed further by AI developers, it enables to report,\nthrough model cards, the outcome and specific information of use for both\ndevelopers and clinicians. In this way, we discuss how this framework can, in\nthe future, provide clinical researchers and developers with the tools needed\nto create transparent and accountable AI models for EEG data analysis and\ndiagnosis.\n","authors":["Rabindra Khadka","Pedro G Lind","Anis Yazidi","Asma Belhadi"],"pdf_url":"https://arxiv.org/pdf/2409.17815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16106v2","updated":"2024-09-26T13:05:36Z","published":"2024-09-24T14:07:47Z","title":"Scenario of Use Scheme: Threat Model Specification for Speaker Privacy\n Protection in the Medical Domain","summary":" Speech recordings are being more frequently used to detect and monitor\ndisease, leading to privacy concerns. Beyond cryptography, protection of speech\ncan be addressed by approaches, such as perturbation, disentanglement, and\nre-synthesis, that eliminate sensitive information of the speaker, leaving the\ninformation necessary for medical analysis purposes. In order for such privacy\nprotective approaches to be developed, clear and systematic specifications of\nassumptions concerning medical settings and the needs of medical professionals\nare necessary. In this paper, we propose a Scenario of Use Scheme that\nincorporates an Attacker Model, which characterizes the adversary against whom\nthe speaker's privacy must be defended, and a Protector Model, which specifies\nthe defense. We discuss the connection of the scheme with previous work on\nspeech privacy. Finally, we present a concrete example of a specified Scenario\nof Use and a set of experiments about protecting speaker data against gender\ninference attacks while maintaining utility for Parkinson's detection.\n","authors":["Mehtab Ur Rahman","Martha Larson","Louis ten Bosch","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2409.16106v2.pdf","comment":"Accepted and published at SPSC Symposium 2024 4th Symposium on\n Security and Privacy in Speech Communication. Interspeech 2024"},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17791v1","updated":"2024-09-26T12:37:26Z","published":"2024-09-26T12:37:26Z","title":"Self-supervised Preference Optimization: Enhance Your Language Model\n with Preference Degree Awareness","summary":" Recently, there has been significant interest in replacing the reward model\nin Reinforcement Learning with Human Feedback (RLHF) methods for Large Language\nModels (LLMs), such as Direct Preference Optimization (DPO) and its variants.\nThese approaches commonly use a binary cross-entropy mechanism on pairwise\nsamples, i.e., minimizing and maximizing the loss based on preferred or\ndis-preferred responses, respectively. However, while this training strategy\nomits the reward model, it also overlooks the varying preference degrees within\ndifferent responses. We hypothesize that this is a key factor hindering LLMs\nfrom sufficiently understanding human preferences. To address this problem, we\npropose a novel Self-supervised Preference Optimization (SPO) framework, which\nconstructs a self-supervised preference degree loss combined with the alignment\nloss, thereby helping LLMs improve their ability to understand the degree of\npreference. Extensive experiments are conducted on two widely used datasets of\ndifferent tasks. The results demonstrate that SPO can be seamlessly integrated\nwith existing preference optimization methods and significantly boost their\nperformance to achieve state-of-the-art performance. We also conduct detailed\nanalyses to offer comprehensive insights into SPO, which verifies its\neffectiveness. The code is available at https://github.com/lijian16/SPO.\n","authors":["Jian Li","Haojing Huang","Yujia Zhang","Pengfei Xu","Xi Chen","Rui Song","Lida Shi","Jingwen Wang","Hao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17791v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17788v1","updated":"2024-09-26T12:33:34Z","published":"2024-09-26T12:33:34Z","title":"Ophthalmic Biomarker Detection with Parallel Prediction of Transformer\n and Convolutional Architecture","summary":" Ophthalmic diseases represent a significant global health issue,\nnecessitating the use of advanced precise diagnostic tools. Optical Coherence\nTomography (OCT) imagery which offers high-resolution cross-sectional images of\nthe retina has become a pivotal imaging modality in ophthalmology.\nTraditionally physicians have manually detected various diseases and biomarkers\nfrom such diagnostic imagery. In recent times, deep learning techniques have\nbeen extensively used for medical diagnostic tasks enabling fast and precise\ndiagnosis. This paper presents a novel approach for ophthalmic biomarker\ndetection using an ensemble of Convolutional Neural Network (CNN) and Vision\nTransformer. While CNNs are good for feature extraction within the local\ncontext of the image, transformers are known for their ability to extract\nfeatures from the global context of the image. Using an ensemble of both\ntechniques allows us to harness the best of both worlds. Our method has been\nimplemented on the OLIVES dataset to detect 6 major biomarkers from the OCT\nimages and shows significant improvement of the macro averaged F1 score on the\ndataset.\n","authors":["Md. Touhidul Islam","Md. Abtahi Majeed Chowdhury","Mahmudul Hasan","Asif Quadir","Lutfa Aktar"],"pdf_url":"https://arxiv.org/pdf/2409.17788v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2409.14590v2","updated":"2024-09-26T12:29:45Z","published":"2024-09-22T20:47:04Z","title":"Explainable AI needs formal notions of explanation correctness","summary":" The use of machine learning (ML) in critical domains such as medicine poses\nrisks and requires regulation. One requirement is that decisions of ML systems\nin high-risk applications should be human-understandable. The field of\n\"explainable artificial intelligence\" (XAI) seemingly addresses this need.\nHowever, in its current form, XAI is unfit to provide quality control for ML;\nit itself needs scrutiny. Popular XAI methods cannot reliably answer important\nquestions about ML models, their training data, or a given test input. We\nrecapitulate results demonstrating that popular XAI methods systematically\nattribute importance to input features that are independent of the prediction\ntarget. This limits their utility for purposes such as model and data\n(in)validation, model improvement, and scientific discovery. We argue that the\nfundamental reason for this limitation is that current XAI methods do not\naddress well-defined problems and are not evaluated against objective criteria\nof explanation correctness. Researchers should formally define the problems\nthey intend to solve first and then design methods accordingly. This will lead\nto notions of explanation correctness that can be theoretically verified and\nobjective metrics of explanation performance that can be assessed using\nground-truth data.\n","authors":["Stefan Haufe","Rick Wilming","Benedict Clark","Rustam Zhumagambetov","Danny Panknin","Ahcène Boubekki"],"pdf_url":"https://arxiv.org/pdf/2409.14590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17777v1","updated":"2024-09-26T12:15:13Z","published":"2024-09-26T12:15:13Z","title":"Harnessing Shared Relations via Multimodal Mixup Contrastive Learning\n for Multimodal Classification","summary":" Deep multimodal learning has shown remarkable success by leveraging\ncontrastive learning to capture explicit one-to-one relations across\nmodalities. However, real-world data often exhibits shared relations beyond\nsimple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive\nLearning approach to capture nuanced shared relations inherent in multimodal\ndata. Our key contribution is a Mixup-based contrastive loss that learns robust\nrepresentations by aligning mixed samples from one modality with their\ncorresponding samples from other modalities thereby capturing shared relations\nbetween them. For multimodal classification tasks, we introduce a framework\nthat integrates a fusion module with unimodal prediction modules for auxiliary\nsupervision during training, complemented by our proposed Mixup-based\ncontrastive loss. Through extensive experiments on diverse datasets (N24News,\nROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures\nshared multimodal relations and generalizes across domains. It outperforms\nstate-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving\ncomparable performance on Food-101. Our work highlights the significance of\nlearning shared relations for robust multimodal learning, opening up promising\navenues for future research.\n","authors":["Raja Kumar","Raghav Singhal","Pranamya Kulkarni","Deval Mehta","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2409.17777v1.pdf","comment":"RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9\n Tables"},{"id":"http://arxiv.org/abs/2409.17774v1","updated":"2024-09-26T12:11:28Z","published":"2024-09-26T12:11:28Z","title":"Faithfulness and the Notion of Adversarial Sensitivity in NLP\n Explanations","summary":" Faithfulness is arguably the most critical metric to assess the reliability\nof explainable AI. In NLP, current methods for faithfulness evaluation are\nfraught with discrepancies and biases, often failing to capture the true\nreasoning of models. We introduce Adversarial Sensitivity as a novel approach\nto faithfulness evaluation, focusing on the explainer's response when the model\nis under adversarial attack. Our method accounts for the faithfulness of\nexplainers by capturing sensitivity to adversarial input changes. This work\naddresses significant limitations in existing evaluation techniques, and\nfurthermore, quantifies faithfulness from a crucial yet underexplored paradigm.\n","authors":["Supriya Manna","Niladri Sett"],"pdf_url":"https://arxiv.org/pdf/2409.17774v1.pdf","comment":"Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP"},{"id":"http://arxiv.org/abs/2309.16928v3","updated":"2024-09-26T12:09:22Z","published":"2023-09-29T02:04:24Z","title":"Learning to Receive Help: Intervention-Aware Concept Embedding Models","summary":" Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures\nby constructing and explaining their predictions using a set of high-level\nconcepts. A special property of these models is that they permit concept\ninterventions, wherein users can correct mispredicted concepts and thus improve\nthe model's performance. Recent work, however, has shown that intervention\nefficacy can be highly dependent on the order in which concepts are intervened\non and on the model's architecture and training hyperparameters. We argue that\nthis is rooted in a CBM's lack of train-time incentives for the model to be\nappropriately receptive to concept interventions. To address this, we propose\nIntervention-aware Concept Embedding models (IntCEMs), a novel CBM-based\narchitecture and training paradigm that improves a model's receptiveness to\ntest-time interventions. Our model learns a concept intervention policy in an\nend-to-end fashion from where it can sample meaningful intervention\ntrajectories at train-time. This conditions IntCEMs to effectively select and\nreceive concept interventions when deployed at test-time. Our experiments show\nthat IntCEMs significantly outperform state-of-the-art concept-interpretable\nmodels when provided with test-time concept interventions, demonstrating the\neffectiveness of our approach.\n","authors":["Mateo Espinosa Zarlenga","Katherine M. Collins","Krishnamurthy Dvijotham","Adrian Weller","Zohreh Shams","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2309.16928v3.pdf","comment":"Accepted as a spotlight at the Thirty-seventh Conference on Neural\n Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2409.17767v1","updated":"2024-09-26T12:02:36Z","published":"2024-09-26T12:02:36Z","title":"Federated Learning under Attack: Improving Gradient Inversion for Batch\n of Images","summary":" Federated Learning (FL) has emerged as a machine learning approach able to\npreserve the privacy of user's data. Applying FL, clients train machine\nlearning models on a local dataset and a central server aggregates the learned\nparameters coming from the clients, training a global machine learning model\nwithout sharing user's data. However, the state-of-the-art shows several\napproaches to promote attacks on FL systems. For instance, inverting or leaking\ngradient attacks can find, with high precision, the local dataset used during\nthe training phase of the FL. This paper presents an approach, called Deep\nLeakage from Gradients with Feedback Blending (DLG-FB), which is able to\nimprove the inverting gradient attack, considering the spatial correlation that\ntypically exists in batches of images. The performed evaluation shows an\nimprovement of 19.18% and 48,82% in terms of attack success rate and the number\nof iterations per attacked image, respectively.\n","authors":["Luiz Leite","Yuri Santo","Bruno L. Dalmazo","André Riker"],"pdf_url":"https://arxiv.org/pdf/2409.17767v1.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17757v1","updated":"2024-09-26T11:46:58Z","published":"2024-09-26T11:46:58Z","title":"Integrating Hierarchical Semantic into Iterative Generation Model for\n Entailment Tree Explanation","summary":" Manifestly and logically displaying the line of reasoning from evidence to\nanswer is significant to explainable question answering (QA). The entailment\ntree exhibits the lines structurally, which is different from the\nself-explanation principle in large-scale language models. Existing methods\nrarely consider the semantic association of sentences between and within\nhierarchies within the tree structure, which is prone to apparent mistakes in\ncombinations. In this work, we propose an architecture of integrating the\nHierarchical Semantics of sentences under the framework of Controller-Generator\n(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between\nhypotheses and facts, discriminates the facts involved in tree constructions,\nand optimizes single-step entailments. To the best of our knowledge, We are the\nfirst to notice hierarchical semantics of sentences between the same layer and\nadjacent layers to yield improvements. The proposed method achieves comparable\nperformance on all three settings of the EntailmentBank dataset. The\ngeneralization results on two out-of-domain datasets also demonstrate the\neffectiveness of our method.\n","authors":["Qin Wang","Jianzhou Feng","Yiming Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04259v2","updated":"2024-09-26T11:42:35Z","published":"2024-08-08T06:57:49Z","title":"EfficientRAG: Efficient Retriever for Multi-Hop Question Answering","summary":" Retrieval-augmented generation (RAG) methods encounter difficulties when\naddressing complex questions like multi-hop queries. While iterative retrieval\nmethods improve performance by gathering additional information, current\napproaches often rely on multiple calls of large language models (LLMs). In\nthis paper, we introduce EfficientRAG, an efficient retriever for multi-hop\nquestion answering. EfficientRAG iteratively generates new queries without the\nneed for LLM calls at each iteration and filters out irrelevant information.\nExperimental results demonstrate that EfficientRAG surpasses existing RAG\nmethods on three open-domain multi-hop question-answering datasets.\n","authors":["Ziyuan Zhuang","Zhiyang Zhang","Sitao Cheng","Fangkai Yang","Jia Liu","Shujian Huang","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.04259v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2001.07495v5","updated":"2024-09-26T11:42:25Z","published":"2020-01-21T13:05:31Z","title":"Unsupervisedly Learned Representations: Should the Quest be Over?","summary":" After four decades of research there still exists a Classification accuracy\ngap of about 20% between our best Unsupervisedly Learned Representations\nmethods and the accuracy rates achieved by intelligent animals. It thus may\nwell be that we are looking in the wrong direction. A possible solution to this\npuzzle is presented. We demonstrate that Reinforcement Learning can learn\nrepresentations which achieve the same accuracy as that of animals. Our main\nmodest contribution lies in the observations that: a. when applied to a real\nworld environment Reinforcement Learning does not require labels, and thus may\nbe legitimately considered as Unsupervised Learning, and b. in contrast, when\nReinforcement Learning is applied in a simulated environment it does inherently\nrequire labels and should thus be generally be considered as Supervised\nLearning. The corollary of these observations is that further search for\nUnsupervised Learning competitive paradigms which may be trained in simulated\nenvironments may be futile.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2001.07495v5.pdf","comment":"To be published at The 6th International Conference on Machine\n Learning, Optimization and Data Science - LOD 2020"},{"id":"http://arxiv.org/abs/2409.17755v1","updated":"2024-09-26T11:40:07Z","published":"2024-09-26T11:40:07Z","title":"SECURE: Semantics-aware Embodied Conversation under Unawareness for\n Lifelong Robot Learning","summary":" This paper addresses a challenging interactive task learning scenario we call\nrearrangement under unawareness: to manipulate a rigid-body environment in a\ncontext where the robot is unaware of a concept that's key to solving the\ninstructed task. We propose SECURE, an interactive task learning framework\ndesigned to solve such problems by fixing a deficient domain model using\nembodied conversation. Through dialogue, the robot discovers and then learns to\nexploit unforeseen possibilities. Using SECURE, the robot not only learns from\nthe user's corrective feedback when it makes a mistake, but it also learns to\nmake strategic dialogue decisions for revealing useful evidence about novel\nconcepts for solving the instructed task. Together, these abilities allow the\nrobot to generalise to subsequent tasks using newly acquired knowledge. We\ndemonstrate that a robot that is semantics-aware -- that is, it exploits the\nlogical consequences of both sentence and discourse semantics in the learning\nand inference process -- learns to solve rearrangement under unawareness more\neffectively than a robot that lacks such capabilities.\n","authors":["Rimvydas Rubavicius","Peter David Fagan","Alex Lascarides","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.17755v1.pdf","comment":"10 pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.17754v1","updated":"2024-09-26T11:36:08Z","published":"2024-09-26T11:36:08Z","title":"Byzantine-Robust Aggregation for Securing Decentralized Federated\n Learning","summary":" Federated Learning (FL) emerges as a distributed machine learning approach\nthat addresses privacy concerns by training AI models locally on devices.\nDecentralized Federated Learning (DFL) extends the FL paradigm by eliminating\nthe central server, thereby enhancing scalability and robustness through the\navoidance of a single point of failure. However, DFL faces significant\nchallenges in optimizing security, as most Byzantine-robust algorithms proposed\nin the literature are designed for centralized scenarios. In this paper, we\npresent a novel Byzantine-robust aggregation algorithm to enhance the security\nof Decentralized Federated Learning environments, coined WFAgg. This proposal\nhandles the adverse conditions and strength robustness of dynamic decentralized\ntopologies at the same time by employing multiple filters to identify and\nmitigate Byzantine attacks. Experimental results demonstrate the effectiveness\nof the proposed algorithm in maintaining model accuracy and convergence in the\npresence of various Byzantine attack scenarios, outperforming state-of-the-art\ncentralized Byzantine-robust aggregation schemes (such as Multi-Krum or\nClustering). These algorithms are evaluated on an IID image classification\nproblem in both centralized and decentralized scenarios.\n","authors":["Diego Cajaraville-Aboy","Ana Fernández-Vilas","Rebeca P. Díaz-Redondo","Manuel Fernández-Veiga"],"pdf_url":"https://arxiv.org/pdf/2409.17754v1.pdf","comment":"18 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.01008v3","updated":"2024-09-26T11:35:22Z","published":"2023-12-13T17:05:37Z","title":"Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models","summary":" Text-to-image diffusion models have demonstrated unprecedented capabilities\nfor flexible and realistic image synthesis. Nevertheless, these models rely on\na time-consuming sampling procedure, which has motivated attempts to reduce\ntheir latency. When improving efficiency, researchers often use the original\ndiffusion model to train an additional network designed specifically for fast\nimage generation. In contrast, our approach seeks to reduce latency directly,\nwithout any retraining, fine-tuning, or knowledge distillation. In particular,\nwe find the repeated calculation of attention maps to be costly yet redundant,\nand instead suggest reusing them during sampling. Our specific reuse strategies\nare based on ODE theory, which implies that the later a map is reused, the\nsmaller the distortion in the final image. We empirically compare these reuse\nstrategies with few-step sampling procedures of comparable latency, finding\nthat reuse generates images that are closer to those produced by the original\nhigh-latency diffusion model.\n","authors":["Rosco Hunter","Łukasz Dudziak","Mohamed S. Abdelfattah","Abhinav Mehrotra","Sourav Bhattacharya","Hongkai Wen"],"pdf_url":"https://arxiv.org/pdf/2401.01008v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10712v3","updated":"2024-09-26T11:15:14Z","published":"2024-02-16T14:15:15Z","title":"An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient\n Language Model Inference","summary":" The development of state-of-the-art generative large language models (LLMs)\ndisproportionately relies on English-centric tokenizers, vocabulary and\npre-training data. Despite the fact that some LLMs have multilingual\ncapabilities, recent studies have shown that their inference efficiency\ndeteriorates when generating text in languages other than English. This results\nin increased inference time and costs. Cross-lingual vocabulary adaptation\n(CVA) methods have been proposed for adapting models to a target language\naiming to improve downstream performance. However, the effectiveness of these\nmethods on increasing inference efficiency of generative LLMs has yet to be\nexplored. In this paper, we perform an empirical study of five CVA methods on\nfour generative LLMs (including monolingual and multilingual models) across\nfour typologically-diverse languages and four natural language understanding\ntasks. We find that CVA substantially contributes to LLM inference speedups of\nup to 271.5\\%. We also show that adapting LLMs that have been pre-trained on\nmore balanced multilingual data results in downstream performance comparable to\nthe original models.\n","authors":["Atsuki Yamaguchi","Aline Villavicencio","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2402.10712v3.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2406.12442v2","updated":"2024-09-26T11:15:14Z","published":"2024-06-18T09:46:44Z","title":"Abstraction-of-Thought Makes Language Models Better Reasoners","summary":" Abstract reasoning, the ability to reason from the abstract essence of a\nproblem, serves as a key to generalization in human reasoning. However,\neliciting language models to perform reasoning with abstraction remains\nunexplored. This paper seeks to bridge this gap by introducing a novel\nstructured reasoning format called Abstraction-of-Thought (AoT). The uniqueness\nof AoT lies in its explicit requirement for varying levels of abstraction\nwithin the reasoning process. This approach could elicit language models to\nfirst contemplate on the abstract level before incorporating concrete details,\nwhich is overlooked by the prevailing step-by-step Chain-of-Thought (CoT)\nmethod. To align models with the AoT format, we present AoT Collection, a\ngeneric finetuning dataset consisting of 348k high-quality samples with AoT\nreasoning processes, collected via an automated and scalable pipeline. We\nfinetune a wide range of language models with AoT Collection and conduct\nextensive evaluations on 23 unseen tasks from the challenging benchmark\nBig-Bench Hard. Experimental results indicate that models aligned to AoT\nreasoning format substantially outperform those aligned to CoT in many\nreasoning tasks.\n","authors":["Ruixin Hong","Hongming Zhang","Xiaoman Pan","Dong Yu","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12442v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17728v1","updated":"2024-09-26T10:57:02Z","published":"2024-09-26T10:57:02Z","title":"AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with\n Alternative Modality Masking","summary":" Camera-LiDAR fusion models significantly enhance perception performance in\nautonomous driving. The fusion mechanism leverages the strengths of each\nmodality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR\nfusion models utilize pre-trained backbones for efficient training. However, we\nargue that directly loading single-modal pre-trained camera and LiDAR backbones\ninto camera-LiDAR fusion models introduces similar feature redundancy across\nmodalities due to the nature of the fusion mechanism. Unfortunately, existing\npruning methods are developed explicitly for single-modal models, and thus,\nthey struggle to effectively identify these specific redundant parameters in\ncamera-LiDAR fusion models. In this paper, to address the issue above on\ncamera-LiDAR fusion models, we propose a novelty pruning framework Alternative\nModality Masking Pruning (AlterMOMA), which employs alternative masking on each\nmodality and identifies the redundant parameters. Specifically, when one\nmodality parameters are masked (deactivated), the absence of features from the\nmasked backbone compels the model to reactivate previous redundant features of\nthe other modality backbone. Therefore, these redundant features and relevant\nredundant parameters can be identified via the reactivation process. The\nredundant parameters can be pruned by our proposed importance score evaluation\nfunction, Alternative Evaluation (AlterEva), which is based on the observation\nof the loss changes when certain modality parameters are activated and\ndeactivated. Extensive experiments on the nuScene and KITTI datasets\nencompassing diverse tasks, baseline models, and pruning algorithms showcase\nthat AlterMOMA outperforms existing pruning methods, attaining state-of-the-art\nperformance.\n","authors":["Shiqi Sun","Yantao Lu","Ning Liu","Bo Jiang","JinChao Chen","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17728v1.pdf","comment":"17 pages, 3 figures, Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.08160v2","updated":"2024-09-26T10:54:32Z","published":"2024-08-15T13:49:14Z","title":"General-purpose Clothes Manipulation with Semantic Keypoints","summary":" Clothes manipulation is a critical skill for household robots. Recent\nadvancements have been made in task-specific clothes manipulation, such as\nfolding, flattening, and hanging. However, due to clothes' complex geometries\nand deformability, creating a general-purpose robot system that can manipulate\na diverse range of clothes in many ways remains challenging. Since clothes are\ntypically designed with specific structures, we propose identifying these\nspecific features like ``left sleeve'' as semantic keypoints. Semantic\nkeypoints can provide semantic cues for task planning and geometric cues for\nlow-level action generation. With this insight, we develop a hierarchical\nlearning framework using the large language model (LLM) for general-purpose\nCLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation\nexperiments show that CLASP outperforms baseline methods on both seen and\nunseen tasks across various clothes manipulation tasks. Real-world experiments\nshow that CLASP can be directly deployed in the real world and applied to a\nwide variety of clothes.\n","authors":["Yuhong Deng","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.08160v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1904.04579v6","updated":"2024-09-26T10:53:16Z","published":"2019-04-09T10:30:23Z","title":"A Concept-Value Network as a Brain Model","summary":" This paper suggests a statistical framework for describing the relations\nbetween the physical and conceptual entities of a brain-like model. Features\nand concept instances are put into context, where the paper suggests that\nfeatures may be the electrical wiring, although chemical connections are also\npossible. With this idea, the actual length of the connection is important,\nbecause it is related to firing rates and neuron synchronization, but the\nsignal type is less important. The paper then suggests that concepts are neuron\ngroups that link feature sets and concept instances are determined by chemical\nsignals from those groups. Therefore, features become the static horizontal\nframework of the neural system and concepts are vertically interconnected\ncombinations of these. With regards to functionality, the neuron is then\nconsidered to be functional and the more horizontal memory structures can even\nbe glial. This would also suggest that features can be distributed entities and\nnot concentrated to a single area. Another aspect could be signal 'breaks' that\ncompartmentalise a pattern and may help with neural binding.\n","authors":["Kieran Greer"],"pdf_url":"https://arxiv.org/pdf/1904.04579v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19454v2","updated":"2024-09-26T10:34:40Z","published":"2024-04-30T11:10:34Z","title":"Augmented neural forms with parametric boundary-matching operators for\n solving ordinary differential equations","summary":" Approximating solutions of ordinary and partial differential equations\nconstitutes a significant challenge. Based on functional expressions that\ninherently depend on neural networks, neural forms are specifically designed to\nprecisely satisfy the prescribed initial or boundary conditions of the problem,\nwhile providing the approximate solutions in closed form. Departing from the\nimportant class of ordinary differential equations, the present work aims to\nrefine and validate the neural forms methodology, paving the ground for further\ndevelopments in more challenging fields. The main contributions are as follows.\nFirst, it introduces a formalism for systematically crafting proper neural\nforms with adaptable boundary matches that are amenable to optimization.\nSecond, it describes a novel technique for converting problems with Neumann or\nRobin conditions into equivalent problems with parametric Dirichlet conditions.\nThird, it outlines a method for determining an upper bound on the absolute\ndeviation from the exact solution. The proposed augmented neural forms approach\nwas tested on a set of diverse problems, encompassing first- and second-order\nordinary differential equations, as well as first-order systems. Stiff\ndifferential equations have been considered as well. The resulting solutions\nwere subjected to assessment against existing exact solutions, solutions\nderived through the common penalized neural method, and solutions obtained via\ncontemporary numerical analysis methods. The reported results demonstrate that\nthe augmented neural forms not only satisfy the boundary and initial conditions\nexactly, but also provide closed-form solutions that facilitate high-quality\ninterpolation and controllable overall precision. These attributes are\nessential for expanding the application field of neural forms to more\nchallenging problems that are described by partial differential equations.\n","authors":["Adam D. Kypriadis","Isaac E. Lagaris","Aristidis Likas","Konstantinos E. Parsopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.19454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17896v2","updated":"2024-09-26T10:33:53Z","published":"2024-07-25T09:36:37Z","title":"SR-CurvANN: Advancing 3D Surface Reconstruction through Curvature-Aware\n Neural Networks","summary":" Incomplete or missing data in three-dimensional (3D) models can lead to\nerroneous or flawed renderings, limiting their usefulness in applications such\nas visualization, geometric computation, and 3D printing. Conventional\nsurface-repair techniques often fail to infer complex geometric details in\nmissing areas. Neural networks successfully address hole-filling tasks in 2D\nimages using inpainting techniques. The combination of surface reconstruction\nalgorithms, guided by the model's curvature properties and the creativity of\nneural networks in the inpainting processes should provide realistic results in\nthe hole completion task. In this paper, we propose a novel method entitled\nSR-CurvANN (Surface Reconstruction Based on Curvature-Aware Neural Networks)\nthat incorporates neural network-based 2D inpainting to effectively reconstruct\n3D surfaces. We train the neural networks with images that represent planar\nrepresentations of the curvature at vertices of hundreds of 3D models. Once the\nmissing areas have been inferred, a coarse-to-fine surface deformation process\nensures that the surface fits the reconstructed curvature image. Our proposal\nmakes it possible to learn and generalize patterns from a wide variety of\ntraining 3D models, generating comprehensive inpainted curvature images and\nsurfaces. Experiments conducted on 959 models with several holes have\ndemonstrated that SR-CurvANN excels in the shape completion process, filling\nholes with a remarkable level of realism and precision.\n","authors":["Marina Hernández-Bautista","Francisco J. Melero"],"pdf_url":"https://arxiv.org/pdf/2407.17896v2.pdf","comment":"Major changes in title, paper structure, text and figures. Improved\n results. 23 pages, 14 figures. Decision about submission not taken yet"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17702v1","updated":"2024-09-26T10:16:08Z","published":"2024-09-26T10:16:08Z","title":"Episodic Memory Verbalization using Hierarchical Representations of\n Life-Long Robot Experience","summary":" Verbalization of robot experience, i.e., summarization of and question\nanswering about a robot's past, is a crucial ability for improving human-robot\ninteraction. Previous works applied rule-based systems or fine-tuned deep\nmodels to verbalize short (several-minute-long) streams of episodic data,\nlimiting generalization and transferability. In our work, we apply large\npretrained models to tackle this task with zero or few examples, and\nspecifically focus on verbalizing life-long experiences. For this, we derive a\ntree-like data structure from episodic memory (EM), with lower levels\nrepresenting raw perception and proprioception data, and higher levels\nabstracting events to natural language concepts. Given such a hierarchical\nrepresentation built from the experience stream, we apply a large language\nmodel as an agent to interactively search the EM given a user's query,\ndynamically expanding (initially collapsed) tree nodes to find the relevant\ninformation. The approach keeps computational costs low even when scaling to\nmonths of robot experience data. We evaluate our method on simulated household\nrobot data, human egocentric videos, and real-world robot recordings,\ndemonstrating its flexibility and scalability.\n","authors":["Leonard Bärmann","Chad DeChant","Joana Plewnia","Fabian Peller-Konrad","Daniel Bauer","Tamim Asfour","Alex Waibel"],"pdf_url":"https://arxiv.org/pdf/2409.17702v1.pdf","comment":"Code, data and demo videos at https://hierarchical-emv.github.io"},{"id":"http://arxiv.org/abs/2409.17699v1","updated":"2024-09-26T10:12:19Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17698v1","updated":"2024-09-26T10:09:10Z","published":"2024-09-26T10:09:10Z","title":"The application of GPT-4 in grading design university students'\n assignment and providing feedback: An exploratory study","summary":" This study aims to investigate whether GPT-4 can effectively grade\nassignments for design university students and provide useful feedback. In\ndesign education, assignments do not have a single correct answer and often\ninvolve solving an open-ended design problem. This subjective nature of design\nprojects often leads to grading problems,as grades can vary between different\nraters,for instance instructor from engineering background or architecture\nbackground. This study employs an iterative research approach in developing a\nCustom GPT with the aim of achieving more reliable results and testing whether\nit can provide design students with constructive feedback. The findings\ninclude: First,through several rounds of iterations the inter-reliability\nbetween GPT and human raters reached a level that is generally accepted by\neducators. This indicates that by providing accurate prompts to GPT,and\ncontinuously iterating to build a Custom GPT, it can be used to effectively\ngrade students' design assignments, serving as a reliable complement to human\nraters. Second, the intra-reliability of GPT's scoring at different times is\nbetween 0.65 and 0.78. This indicates that, with adequate instructions, a\nCustom GPT gives consistent results which is a precondition for grading\nstudents. As consistency and comparability are the two main rules to ensure the\nreliability of educational assessment, this study has looked at whether a\nCustom GPT can be developed that adheres to these two rules. We finish the\npaper by testing whether Custom GPT can provide students with useful feedback\nand reflecting on how educators can develop and iterate a Custom GPT to serve\nas a complementary rater.\n","authors":["Qian Huang","Thijs Willems","King Wang Poon"],"pdf_url":"https://arxiv.org/pdf/2409.17698v1.pdf","comment":"25 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2409.15867v3","updated":"2024-09-26T09:56:58Z","published":"2024-09-24T08:41:01Z","title":"In-Context Ensemble Improves Video-Language Models for Low-Level\n Workflow Understanding from Human Demonstrations","summary":" A Standard Operating Procedure (SOP) defines a low-level, step-by-step\nwritten guide for a business software workflow based on a video demonstration.\nSOPs are a crucial step toward automating end-to-end software workflows.\nManually creating SOPs can be time-consuming. Recent advancements in large\nvideo-language models offer the potential for automating SOP generation by\nanalyzing recordings of human demonstrations. However, current large\nvideo-language models face challenges with zero-shot SOP generation. We explore\nin-context learning with video-language models for SOP generation. We report\nthat in-context learning sometimes helps video-language models at SOP\ngeneration. We then propose an in-context ensemble learning to further enhance\nthe capabilities of the models in SOP generation.\n","authors":["Moucheng Xu","Evangelos Chatzaroulas","Luc McCutcheon","Abdul Ahad","Hamzah Azeem","Janusz Marecki","Ammar Anwar"],"pdf_url":"https://arxiv.org/pdf/2409.15867v3.pdf","comment":"multimodal in-context ensemble learning, video-language models, SOP\n generation, pseudo-labels, in-context learning, prompt engineering"},{"id":"http://arxiv.org/abs/2409.17691v1","updated":"2024-09-26T09:56:13Z","published":"2024-09-26T09:56:13Z","title":"Efficient Bias Mitigation Without Privileged Information","summary":" Deep neural networks trained via empirical risk minimisation often exhibit\nsignificant performance disparities across groups, particularly when group and\ntask labels are spuriously correlated (e.g., \"grassy background\" and \"cows\").\nExisting bias mitigation methods that aim to address this issue often either\nrely on group labels for training or validation, or require an extensive\nhyperparameter search. Such data and computational requirements hinder the\npractical deployment of these methods, especially when datasets are too large\nto be group-annotated, computational resources are limited, and models are\ntrained through already complex pipelines. In this paper, we propose Targeted\nAugmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework\nthat leverages the entire training history of a helper model to identify\nspurious samples, and generate a group-balanced training set from which a\nrobust model can be trained. We show that TAB improves worst-group performance\nwithout any group information or model selection, outperforming existing\nmethods while maintaining overall accuracy.\n","authors":["Mateo Espinosa Zarlenga","Swami Sankaranarayanan","Jerone T. A. Andrews","Zohreh Shams","Mateja Jamnik","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.17691v1.pdf","comment":"Accepted at the 18th European Conference on Computer Vision (ECCV\n 2024) as an Oral presentation"},{"id":"http://arxiv.org/abs/2405.06802v2","updated":"2024-09-26T09:52:20Z","published":"2024-05-10T20:29:25Z","title":"Leveraging summary of radiology reports with transformers","summary":" Two fundamental problems in health-care stem from patient handoff and triage.\nDoctors are often required to perform complex findings summarization to\nfacilitate efficient communication with specialists and decision making on the\nurgency of each case. To address these challenges, we present a state of the\nart radiology report summarization model utilizing adjusted bidirectional\nencoder representation from transformers BERTtoBERT encoder and decoder\narchitecture. We also provide a data processing pipeline for future models\ndeveloped on the the MIMIC CXR dataset. Our approach includes a novel method\nfor augmenting medical data and a comprehensive performance analysis. Our best\nperforming model achieved a recall oriented understudy for gisting evaluation L\nF1 score of 58.75/100, outperforming specialized checkpoints with more\nsophisticated attention mechanisms. We also provide a data processing pipeline\nfor future models developed on the MIMIC chest X-ray dataset. The model\nintroduced in this paper demonstrates significantly improved capacity in\nradiology report summarization, highlighting the potential for ensuring better\nclinical workflows and enhanced patient care.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.05181v3","updated":"2024-09-26T09:52:13Z","published":"2023-12-08T17:08:03Z","title":"Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable\n Tensor Collections","summary":" Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining\ndata, model, and pipeline parallelism, to use large GPU clusters efficiently.\nLong-running jobs may experience changes to their GPU allocation: (i) resource\nelasticity during training adds or removes GPUs; (ii) hardware maintenance may\nrequire redeployment on different GPUs; and (iii) GPU failures force jobs to\nrun with fewer devices. Current DL frameworks tie jobs to a set of GPUs and\nthus lack support for these scenarios. In particular, they cannot change the\nmulti-dimensional parallelism of an already-running job in an efficient and\nmodel-independent way.\n We describe Scalai, a state management library for DL systems that enables\njobs to change their parallelism dynamically after the GPU allocation is\nupdated at runtime. Scalai achieves this through a new abstraction, a\nparallelizable tensor collection (PTC), that externalizes the job state during\ntraining. After a GPU change, Scalai uses the PTC to transform the job state:\nthe PTC repartitions the dataset state under data parallelism and exposes it to\nDL workers through a virtual file system; and the PTC obtains the model state\nas partitioned checkpoints and transforms them to reflect the new\nparallelization configuration. For efficiency, Scalai executes PTC\ntransformations in parallel with minimum data movement between workers. Our\nexperiments show that Scalai enables DL jobs to support dynamic parallelization\nwith low overhead.\n","authors":["Marcel Wagenländer","Guo Li","Bo Zhao","Luo Mai","Peter Pietzuch"],"pdf_url":"https://arxiv.org/pdf/2312.05181v3.pdf","comment":"The 30th Symposium on Operating Systems Principles (SOSP24)"},{"id":"http://arxiv.org/abs/2409.17687v1","updated":"2024-09-26T09:51:29Z","published":"2024-09-26T09:51:29Z","title":"Graph Edit Distance with General Costs Using Neural Set Divergence","summary":" Graph Edit Distance (GED) measures the (dis-)similarity between two given\ngraphs, in terms of the minimum-cost edit sequence that transforms one graph to\nthe other. However, the exact computation of GED is NP-Hard, which has recently\nmotivated the design of neural methods for GED estimation. However, they do not\nexplicitly account for edit operations with different costs. In response, we\npropose GRAPHEDX, a neural GED estimator that can work with general costs\nspecified for the four edit operations, viz., edge deletion, edge addition,\nnode deletion and node addition. We first present GED as a quadratic assignment\nproblem (QAP) that incorporates these four costs. Then, we represent each graph\nas a set of node and edge embeddings and use them to design a family of neural\nset divergence surrogates. We replace the QAP terms corresponding to each\noperation with their surrogates. Computing such neural set divergence require\naligning nodes and edges of the two graphs. We learn these alignments using a\nGumbel-Sinkhorn permutation generator, additionally ensuring that the node and\nedge alignments are consistent with each other. Moreover, these alignments are\ncognizant of both the presence and absence of edges between node-pairs.\nExperiments on several datasets, under a variety of edit cost settings, show\nthat GRAPHEDX consistently outperforms state-of-the-art methods and heuristics\nin terms of prediction error.\n","authors":["Eeshaan Jain","Indradyumna Roy","Saswat Meher","Soumen Chakrabarti","Abir De"],"pdf_url":"https://arxiv.org/pdf/2409.17687v1.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17685v1","updated":"2024-09-26T09:51:08Z","published":"2024-09-26T09:51:08Z","title":"Artificial Data Point Generation in Clustered Latent Space for Small\n Medical Datasets","summary":" One of the growing trends in machine learning is the use of data generation\ntechniques, since the performance of machine learning models is dependent on\nthe quantity of the training dataset. However, in many medical applications,\ncollecting large datasets is challenging due to resource constraints, which\nleads to overfitting and poor generalization. This paper introduces a novel\nmethod, Artificial Data Point Generation in Clustered Latent Space (AGCL),\ndesigned to enhance classification performance on small medical datasets\nthrough synthetic data generation. The AGCL framework involves feature\nextraction, K-means clustering, cluster evaluation based on a class separation\nmetric, and the generation of synthetic data points from clusters with distinct\nclass representations. This method was applied to Parkinson's disease\nscreening, utilizing facial expression data, and evaluated across multiple\nmachine learning classifiers. Experimental results demonstrate that AGCL\nsignificantly improves classification accuracy compared to baseline, GN and\nkNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and\ncross-validation accuracy of 90.90% in majority voting over different emotions,\nconfirming its effectiveness in augmenting small datasets.\n","authors":["Yasaman Haghbin","Hadi Moradi","Reshad Hosseini"],"pdf_url":"https://arxiv.org/pdf/2409.17685v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17684v1","updated":"2024-09-26T09:51:07Z","published":"2024-09-26T09:51:07Z","title":"Preserving logical and functional dependencies in synthetic tabular data","summary":" Dependencies among attributes are a common aspect of tabular data. However,\nwhether existing tabular data generation algorithms preserve these dependencies\nwhile generating synthetic data is yet to be explored. In addition to the\nexisting notion of functional dependencies, we introduce the notion of logical\ndependencies among the attributes in this article. Moreover, we provide a\nmeasure to quantify logical dependencies among attributes in tabular data.\nUtilizing this measure, we compare several state-of-the-art synthetic data\ngeneration algorithms and test their capability to preserve logical and\nfunctional dependencies on several publicly available datasets. We demonstrate\nthat currently available synthetic tabular data generation algorithms do not\nfully preserve functional dependencies when they generate synthetic datasets.\nIn addition, we also showed that some tabular synthetic data generation models\ncan preserve inter-attribute logical dependencies. Our review and comparison of\nthe state-of-the-art reveal research needs and opportunities to develop\ntask-specific synthetic tabular data generation models.\n","authors":["Chaithra Umesh","Kristian Schultz","Manjunath Mahendra","Saparshi Bej","Olaf Wolkenhauer"],"pdf_url":"https://arxiv.org/pdf/2409.17684v1.pdf","comment":"Submitted to Pattern Recognition Journal"},{"id":"http://arxiv.org/abs/2409.17683v1","updated":"2024-09-26T09:49:27Z","published":"2024-09-26T09:49:27Z","title":"Zero- and Few-shot Named Entity Recognition and Text Expansion in\n Medication Prescriptions using ChatGPT","summary":" Introduction: Medication prescriptions are often in free text and include a\nmix of two languages, local brand names, and a wide range of idiosyncratic\nformats and abbreviations. Large language models (LLMs) have shown promising\nability to generate text in response to input prompts. We use ChatGPT 3.5 to\nautomatically structure and expand medication statements in discharge summaries\nand thus make them easier to interpret for people and machines. Methods:\nNamed-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and\nfew-shot setting with different prompt strategies. 100 medication statements\nwere manually annotated and curated. NER performance was measured by using\nstrict and partial matching. For the task EX, two experts interpreted the\nresults by assessing semantic equivalence between original and expanded\nstatements. The model performance was measured by precision, recall, and F1\nscore. Results: For NER, the best-performing prompt reached an average F1 score\nof 0.94 in the test set. For EX, the few-shot prompt showed superior\nperformance among other prompts, with an average F1 score of 0.87. Conclusion:\nOur study demonstrates good performance for NER and EX tasks in free-text\nmedication statements using ChatGPT. Compared to a zero-shot baseline, a\nfew-shot approach prevented the system from hallucinating, which would be\nunacceptable when processing safety-relevant medication data.\n","authors":["Natthanaphop Isaradech","Andrea Riedel","Wachiranun Sirikul","Markus Kreuzthaler","Stefan Schulz"],"pdf_url":"https://arxiv.org/pdf/2409.17683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13197v2","updated":"2024-09-26T09:40:31Z","published":"2022-08-28T10:47:32Z","title":"IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided\n Feature Extraction","summary":" Disruption prediction has made rapid progress in recent years, especially in\nmachine learning (ML)-based methods. Understanding why a predictor makes a\ncertain prediction can be as crucial as the prediction's accuracy for future\ntokamak disruption predictors. The purpose of most disruption predictors is\naccuracy or cross-machine capability. However, if a disruption prediction model\ncan be interpreted, it can tell why certain samples are classified as\ndisruption precursors. This allows us to tell the types of incoming disruption\nand gives us insight into the mechanism of disruption. This paper designs a\ndisruption predictor called Interpretable Disruption Predictor based On\nPhysics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction\nperformance of the model is effectively improved by extracting physics-guided\nfeatures. A high-performance model is required to ensure the validity of the\ninterpretation results. The interpretability study of IDP-PGFE provides an\nunderstanding of J-TEXT disruption and is generally consistent with existing\ncomprehension of disruption. IDP-PGFE has been applied to the disruption due to\ncontinuously increasing density towards density limit experiments on J-TEXT.\nThe time evolution of the PGFE features contribution demonstrates that the\napplication of ECRH triggers radiation-caused disruption, which lowers the\ndensity at disruption. While the application of RMP indeed raises the density\nlimit in J-TEXT. The interpretability study guides intuition on the physical\nmechanisms of density limit disruption that RMPs affect not only the MHD\ninstabilities but also the radiation profile, which delays density limit\ndisruption.\n","authors":["Chengshuo Shen","Wei Zheng","Yonghua Ding","Xinkun Ai","Fengming Xue","Yu Zhong","Nengchao Wang","Li Gao","Zhipeng Chen","Zhoujun Yang","Zhongyong Chen","Yuan Pan","J-TEXT team"],"pdf_url":"https://arxiv.org/pdf/2208.13197v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.10433v4","updated":"2024-09-26T09:33:29Z","published":"2024-03-15T16:11:15Z","title":"AI-enhanced Collective Intelligence","summary":" Current societal challenges exceed the capacity of humans operating either\nalone or collectively. As AI evolves, its role within human collectives will\nvary from an assistive tool to a participatory member. Humans and AI possess\ncomplementary capabilities that, together, can surpass the collective\nintelligence of either humans or AI in isolation. However, the interactions in\nhuman-AI systems are inherently complex, involving intricate processes and\ninterdependencies. This review incorporates perspectives from complex network\nscience to conceptualize a multilayer representation of human-AI collective\nintelligence, comprising cognition, physical, and information layers. Within\nthis multilayer network, humans and AI agents exhibit varying characteristics;\nhumans differ in diversity from surface-level to deep-level attributes, while\nAI agents range in degrees of functionality and anthropomorphism. We explore\nhow agents' diversity and interactions influence the system's collective\nintelligence and analyze real-world instances of AI-enhanced collective\nintelligence. We conclude by considering potential challenges and future\ndevelopments in this field.\n","authors":["Hao Cui","Taha Yasseri"],"pdf_url":"https://arxiv.org/pdf/2403.10433v4.pdf","comment":"43 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.13503v2","updated":"2024-09-26T09:26:05Z","published":"2024-09-20T13:44:00Z","title":"SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous\n Federated Learning Framework","summary":" Traditional federated learning (FL) frameworks rely heavily on terrestrial\nnetworks, where coverage limitations and increasing bandwidth congestion\nsignificantly hinder model convergence. Fortunately, the advancement of\nlow-Earth orbit (LEO) satellite networks offers promising new communication\navenues to augment traditional terrestrial FL. Despite this potential, the\nlimited satellite-ground communication bandwidth and the heterogeneous\noperating environments of ground devices-including variations in data,\nbandwidth, and computing power-pose substantial challenges for effective and\nrobust satellite-assisted FL. To address these challenges, we propose SatFed, a\nresource-efficient satellite-assisted heterogeneous FL framework. SatFed\nimplements freshness-based model prioritization queues to optimize the use of\nhighly constrained satellite-ground bandwidth, ensuring the transmission of the\nmost critical models. Additionally, a multigraph is constructed to capture\nreal-time heterogeneous relationships between devices, including data\ndistribution, terrestrial bandwidth, and computing capability. This multigraph\nenables SatFed to aggregate satellite-transmitted models into peer guidance,\nenhancing local training in heterogeneous environments. Extensive experiments\nwith real-world LEO satellite networks demonstrate that SatFed achieves\nsuperior performance and robustness compared to state-of-the-art benchmarks.\n","authors":["Yuxin Zhang","Zheng Lin","Zhe Chen","Zihan Fang","Wenjun Zhu","Xianhao Chen","Jin Zhao","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2409.13503v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17661v1","updated":"2024-09-26T09:20:12Z","published":"2024-09-26T09:20:12Z","title":"A Fuzzy-based Approach to Predict Human Interaction by Functional\n Near-Infrared Spectroscopy","summary":" The paper introduces a Fuzzy-based Attention (Fuzzy Attention Layer)\nmechanism, a novel computational approach to enhance the interpretability and\nefficacy of neural models in psychological research. The proposed Fuzzy\nAttention Layer mechanism is integrated as a neural network layer within the\nTransformer Encoder model to facilitate the analysis of complex psychological\nphenomena through neural signals, such as those captured by functional\nNear-Infrared Spectroscopy (fNIRS). By leveraging fuzzy logic, the Fuzzy\nAttention Layer is capable of learning and identifying interpretable patterns\nof neural activity. This capability addresses a significant challenge when\nusing Transformer: the lack of transparency in determining which specific brain\nactivities most contribute to particular predictions. Our experimental results\ndemonstrated on fNIRS data from subjects engaged in social interactions\ninvolving handholding reveal that the Fuzzy Attention Layer not only learns\ninterpretable patterns of neural activity but also enhances model performance.\nAdditionally, the learned patterns provide deeper insights into the neural\ncorrelates of interpersonal touch and emotional exchange. The application of\nour model shows promising potential in deciphering the subtle complexities of\nhuman social behaviors, thereby contributing significantly to the fields of\nsocial neuroscience and psychological AI.\n","authors":["Xiaowei Jiang","Liang Ou","Yanan Chen","Na Ao","Yu-Cheng Chang","Thomas Do","Chin-Teng Lin"],"pdf_url":"https://arxiv.org/pdf/2409.17661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12753v2","updated":"2024-09-26T09:17:10Z","published":"2024-04-19T09:59:44Z","title":"AutoScraper: A Progressive Understanding Web Agent for Web Scraper\n Generation","summary":" Web scraping is a powerful technique that extracts data from websites,\nenabling automated data collection, enhancing data analysis capabilities, and\nminimizing manual data entry efforts. Existing methods, wrappers-based methods\nsuffer from limited adaptability and scalability when faced with a new website,\nwhile language agents, empowered by large language models (LLMs), exhibit poor\nreusability in diverse web environments. In this work, we introduce the\nparadigm of generating web scrapers with LLMs and propose AutoScraper, a\ntwo-stage framework that can handle diverse and changing web environments more\nefficiently. AutoScraper leverages the hierarchical structure of HTML and\nsimilarity across different web pages for generating web scrapers. Besides, we\npropose a new executability metric for better measuring the performance of web\nscraper generation tasks. We conduct comprehensive experiments with multiple\nLLMs and demonstrate the effectiveness of our framework. Resources of this\npaper can be found at \\url{https://github.com/EZ-hwh/AutoScraper}\n","authors":["Wenhao Huang","Zhouhong Gu","Chenghao Peng","Zhixu Li","Jiaqing Liang","Yanghua Xiao","Liqian Wen","Zulong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12753v2.pdf","comment":"19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17659v1","updated":"2024-09-26T09:14:16Z","published":"2024-09-26T09:14:16Z","title":"Hierarchical End-to-End Autonomous Driving: Integrating BEV Perception\n with Deep Reinforcement Learning","summary":" End-to-end autonomous driving offers a streamlined alternative to the\ntraditional modular pipeline, integrating perception, prediction, and planning\nwithin a single framework. While Deep Reinforcement Learning (DRL) has recently\ngained traction in this domain, existing approaches often overlook the critical\nconnection between feature extraction of DRL and perception. In this paper, we\nbridge this gap by mapping the DRL feature extraction network directly to the\nperception phase, enabling clearer interpretation through semantic\nsegmentation. By leveraging Bird's-Eye-View (BEV) representations, we propose a\nnovel DRL-based end-to-end driving framework that utilizes multi-sensor inputs\nto construct a unified three-dimensional understanding of the environment. This\nBEV-based system extracts and translates critical environmental features into\nhigh-level abstract states for DRL, facilitating more informed control.\nExtensive experimental evaluations demonstrate that our approach not only\nenhances interpretability but also significantly outperforms state-of-the-art\nmethods in autonomous driving control tasks, reducing the collision rate by\n20%.\n","authors":["Siyi Lu","Lei He","Shengbo Eben Li","Yugong Luo","Jianqiang Wang","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14816v2","updated":"2024-09-26T09:11:28Z","published":"2024-09-23T08:46:15Z","title":"VARADE: a Variational-based AutoRegressive model for Anomaly Detection\n on the Edge","summary":" Detecting complex anomalies on massive amounts of data is a crucial task in\nIndustry 4.0, best addressed by deep learning. However, available solutions are\ncomputationally demanding, requiring cloud architectures prone to latency and\nbandwidth issues. This work presents VARADE, a novel solution implementing a\nlight autoregressive framework based on variational inference, which is best\nsuited for real-time execution on the edge. The proposed approach was validated\non a robotic arm, part of a pilot production line, and compared with several\nstate-of-the-art algorithms, obtaining the best trade-off between anomaly\ndetection accuracy, power consumption and inference frequency on two different\nedge platforms.\n","authors":["Alessio Mascolini","Sebastiano Gaiardelli","Francesco Ponzio","Nicola Dall'Ora","Enrico Macii","Sara Vinco","Santa Di Cataldo","Franco Fummi"],"pdf_url":"https://arxiv.org/pdf/2409.14816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17656v1","updated":"2024-09-26T09:07:20Z","published":"2024-09-26T09:07:20Z","title":"Prototype based Masked Audio Model for Self-Supervised Learning of Sound\n Event Detection","summary":" A significant challenge in sound event detection (SED) is the effective\nutilization of unlabeled data, given the limited availability of labeled data\ndue to high annotation costs. Semi-supervised algorithms rely on labeled data\nto learn from unlabeled data, and the performance is constrained by the quality\nand size of the former. In this paper, we introduce the Prototype based Masked\nAudio Model~(PMAM) algorithm for self-supervised representation learning in\nSED, to better exploit unlabeled data. Specifically, semantically rich\nframe-level pseudo labels are constructed from a Gaussian mixture model (GMM)\nbased prototypical distribution modeling. These pseudo labels supervise the\nlearning of a Transformer-based masked audio model, in which binary\ncross-entropy loss is employed instead of the widely used InfoNCE loss, to\nprovide independent loss contributions from different prototypes, which is\nimportant in real scenarios in which multiple labels may apply to unsupervised\ndata frames. A final stage of fine-tuning with just a small amount of labeled\ndata yields a very high performing SED model. On like-for-like tests using the\nDESED task, our method achieves a PSDS1 score of 62.5\\%, surpassing current\nstate-of-the-art models and demonstrating the superiority of the proposed\ntechnique.\n","authors":["Pengfei Cai","Yan Song","Nan Jiang","Qing Gu","Ian McLoughlin"],"pdf_url":"https://arxiv.org/pdf/2409.17656v1.pdf","comment":"Submitted to ICASSP2025; The code for this paper will be available at\n https://github.com/cai525/Transformer4SED after the paper is accepted"},{"id":"http://arxiv.org/abs/2409.17655v1","updated":"2024-09-26T09:06:56Z","published":"2024-09-26T09:06:56Z","title":"AssistantX: An LLM-Powered Proactive Assistant in Collaborative\n Human-Populated Environment","summary":" The increasing demand for intelligent assistants in human-populated\nenvironments has motivated significant research in autonomous robotic systems.\nTraditional service robots and virtual assistants, however, struggle with\nreal-world task execution due to their limited capacity for dynamic reasoning\nand interaction, particularly when human collaboration is required. Recent\ndevelopments in Large Language Models have opened new avenues for improving\nthese systems, enabling more sophisticated reasoning and natural interaction\ncapabilities. In this paper, we introduce AssistantX, an LLM-powered proactive\nassistant designed to operate autonomously in a physical office environment.\nUnlike conventional service robots, AssistantX leverages a novel multi-agent\narchitecture, PPDR4X, which provides advanced inference capabilities and\ncomprehensive collaboration awareness. By effectively bridging the gap between\nvirtual operations and physical interactions, AssistantX demonstrates robust\nperformance in managing complex real-world scenarios. Our evaluation highlights\nthe architecture's effectiveness, showing that AssistantX can respond to clear\ninstructions, actively retrieve supplementary information from memory, and\nproactively seek collaboration from team members to ensure successful task\ncompletion. More details and videos can be found at\nhttps://assistantx-agent.github.io/AssistantX/.\n","authors":["Nan Sun","Bo Mao","Yongchang Li","Lumeng Ma","Di Guo","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17655v1.pdf","comment":"6 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.17652v1","updated":"2024-09-26T09:00:30Z","published":"2024-09-26T09:00:30Z","title":"FactorSim: Generative Simulation via Factorized Representation","summary":" Generating simulations to train intelligent agents in game-playing and\nrobotics from natural language input, from user input or task documentation,\nremains an open-ended challenge. Existing approaches focus on parts of this\nchallenge, such as generating reward functions or task hyperparameters. Unlike\nprevious work, we introduce FACTORSIM that generates full simulations in code\nfrom language input that can be used to train agents. Exploiting the structural\nmodularity specific to coded simulations, we propose to use a factored\npartially observable Markov decision process representation that allows us to\nreduce context dependence during each step of the generation. For evaluation,\nwe introduce a generative simulation benchmark that assesses the generated\nsimulation code's accuracy and effectiveness in facilitating zero-shot\ntransfers in reinforcement learning settings. We show that FACTORSIM\noutperforms existing methods in generating simulations regarding prompt\nalignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation.\nWe also demonstrate its effectiveness in generating robotic tasks.\n","authors":["Fan-Yun Sun","S. I. Harini","Angela Yi","Yihan Zhou","Alex Zook","Jonathan Tremblay","Logan Cross","Jiajun Wu","Nick Haber"],"pdf_url":"https://arxiv.org/pdf/2409.17652v1.pdf","comment":"neurips 2024, project website:\n https://cs.stanford.edu/~sunfanyun/factorsim/"},{"id":"http://arxiv.org/abs/2311.18576v5","updated":"2024-09-26T08:57:49Z","published":"2023-11-30T14:15:39Z","title":"Fixed-length Dense Descriptor for Efficient Fingerprint Matching","summary":" In fingerprint matching, fixed-length descriptors generally offer greater\nefficiency compared to minutiae set, but the recognition accuracy is not as\ngood as that of the latter. Although much progress has been made in deep\nlearning based fixed-length descriptors recently, they often fall short when\ndealing with incomplete or partial fingerprints, diverse fingerprint poses, and\nsignificant background noise. In this paper, we propose a three-dimensional\nrepresentation called Fixed-length Dense Descriptor (FDD) for efficient\nfingerprint matching. FDD features great spatial properties, enabling it to\ncapture the spatial relationships of the original fingerprints, thereby\nenhancing interpretability and robustness. Our experiments on various\nfingerprint datasets reveal that FDD outperforms other fixed-length\ndescriptors, especially in matching fingerprints of different areas,\ncross-modal fingerprint matching, and fingerprint matching with background\nnoise.\n","authors":["Zhiyu Pan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v5.pdf","comment":"Accepted by WIFS 2024"},{"id":"http://arxiv.org/abs/2409.17650v1","updated":"2024-09-26T08:56:54Z","published":"2024-09-26T08:56:54Z","title":"Digital Twin Ecosystem for Oncology Clinical Operations","summary":" Artificial Intelligence (AI) and Large Language Models (LLMs) hold\nsignificant promise in revolutionizing healthcare, especially in clinical\napplications. Simultaneously, Digital Twin technology, which models and\nsimulates complex systems, has gained traction in enhancing patient care.\nHowever, despite the advances in experimental clinical settings, the potential\nof AI and digital twins to streamline clinical operations remains largely\nuntapped. This paper introduces a novel digital twin framework specifically\ndesigned to enhance oncology clinical operations. We propose the integration of\nmultiple specialized digital twins, such as the Medical Necessity Twin, Care\nNavigator Twin, and Clinical History Twin, to enhance workflow efficiency and\npersonalize care for each patient based on their unique data. Furthermore, by\nsynthesizing multiple data sources and aligning them with the National\nComprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care\nPath, a continuously evolving knowledge base that enables these digital twins\nto provide precise, tailored clinical recommendations.\n","authors":["Himanshu Pandey","Akhil Amod"," Shivang","Kshitij Jaggi","Ruchi Garg","Abheet Jain","Vinayak Tantia"],"pdf_url":"https://arxiv.org/pdf/2409.17650v1.pdf","comment":"Pre Print"},{"id":"http://arxiv.org/abs/2409.17642v1","updated":"2024-09-26T08:45:15Z","published":"2024-09-26T08:45:15Z","title":"AI Delegates with a Dual Focus: Ensuring Privacy and Strategic\n Self-Disclosure","summary":" Large language model (LLM)-based AI delegates are increasingly utilized to\nact on behalf of users, assisting them with a wide range of tasks through\nconversational interfaces. Despite their advantages, concerns arise regarding\nthe potential risk of privacy leaks, particularly in scenarios involving social\ninteractions. While existing research has focused on protecting privacy by\nlimiting the access of AI delegates to sensitive user information, many social\nscenarios require disclosing private details to achieve desired outcomes,\nnecessitating a balance between privacy protection and disclosure. To address\nthis challenge, we conduct a pilot study to investigate user preferences for AI\ndelegates across various social relations and task scenarios, and then propose\na novel AI delegate system that enables privacy-conscious self-disclosure. Our\nuser study demonstrates that the proposed AI delegate strategically protects\nprivacy, pioneering its use in diverse and dynamic social interactions.\n","authors":["Xi Chen","Zhiyang Zhang","Fangkai Yang","Xiaoting Qin","Chao Du","Xi Cheng","Hangxin Liu","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17640v1","updated":"2024-09-26T08:44:38Z","published":"2024-09-26T08:44:38Z","title":"T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training\n on an Assistant Task for a Target Task","summary":" Long text summarization, gradually being essential for efficiently processing\nlarge volumes of information, stays challenging for Large Language Models\n(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced\ntraining datasets and the high requirement of contextual details dealing. To\naddress the issue, we design a novel zero-shot transfer learning framework,\nabbreviated as T3, to iteratively training a baseline LLM on an assistant task\nfor the target task, where the former should own richer data resources and\nshare structural or semantic similarity with the latter. In practice, T3 is\napproached to deal with the long text summarization task by utilizing question\nanswering as the assistant task, and further validated its effectiveness on the\nBBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14%\nimprovement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore\ncompared to three baseline LLMs, demonstrating its potential for more\nassistant-target task combinations.\n","authors":["Xindi Tong","Yujin Zhu","Shijian Fan","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17634v1","updated":"2024-09-26T08:31:27Z","published":"2024-09-26T08:31:27Z","title":"P4Q: Learning to Prompt for Quantization in Visual-language Models","summary":" Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence\nin various visual and multimodal tasks, yet the deployment of VLMs on\ndownstream application platforms remains challenging due to their prohibitive\nrequirements of training samples and computing resources. Fine-tuning and\nquantization of VLMs can substantially reduce the sample and computation costs,\nwhich are in urgent need. There are two prevailing paradigms in quantization,\nQuantization-Aware Training (QAT) can effectively quantize large-scale VLMs but\nincur a huge training cost, while low-bit Post-Training Quantization (PTQ)\nsuffers from a notable performance drop. We propose a method that balances\nfine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which\nwe design a lightweight architecture to leverage contrastive loss supervision\nto enhance the recognition performance of a PTQ model. Our method can\neffectively reduce the gap between image features and text features caused by\nlow-bit quantization, based on learnable prompts to reorganize textual\nrepresentations and a low-bit adapter to realign the distributions of image and\ntext features. We also introduce a distillation loss based on cosine similarity\npredictions to distill the quantized model using a full-precision teacher.\nExtensive experimental results demonstrate that our P4Q method outperforms\nprior arts, even achieving comparable results to its full-precision\ncounterparts. For instance, our 8-bit P4Q can theoretically compress the\nCLIP-ViT/B-32 by 4 $\\times$ while achieving 66.94\\% Top-1 accuracy,\noutperforming the learnable prompt fine-tuned full-precision model by 2.24\\%\nwith negligible additional parameters on the ImageNet dataset.\n","authors":["Huixin Sun","Runqi Wang","Yanjing Li","Xianbin Cao","Xiaolong Jiang","Yao Hu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14545v2","updated":"2024-09-26T08:29:00Z","published":"2023-06-26T09:35:56Z","title":"Time and State Dependent Neural Delay Differential Equations","summary":" Discontinuities and delayed terms are encountered in the governing equations\nof a large class of problems ranging from physics and engineering to medicine\nand economics. These systems cannot be properly modelled and simulated with\nstandard Ordinary Differential Equations (ODE), or data-driven approximations\nsuch as Neural Ordinary Differential Equations (NODE). To circumvent this\nissue, latent variables are typically introduced to solve the dynamics of the\nsystem in a higher dimensional space and obtain the solution as a projection to\nthe original space. However, this solution lacks physical interpretability. In\ncontrast, Delay Differential Equations (DDEs), and their data-driven\napproximated counterparts, naturally appear as good candidates to characterize\nsuch systems. In this work we revisit the recently proposed Neural DDE by\nintroducing Neural State-Dependent DDE (SDDDE), a general and flexible\nframework that can model multiple and state- and time-dependent delays. We show\nthat our method is competitive and outperforms other continuous-class models on\na wide variety of delayed dynamical systems. Code is available at the\nrepository\n\\href{https://github.com/thibmonsel/Time-and-State-Dependent-Neural-Delay-Differential-Equations}{here}.\n","authors":["Thibault Monsel","Onofrio Semeraro","Lionel Mathelin","Guillaume Charpiat"],"pdf_url":"https://arxiv.org/pdf/2306.14545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17629v1","updated":"2024-09-26T08:23:04Z","published":"2024-09-26T08:23:04Z","title":"Hand-object reconstruction via interaction-aware graph attention\n mechanism","summary":" Estimating the poses of both a hand and an object has become an important\narea of research due to the growing need for advanced vision computing. The\nprimary challenge involves understanding and reconstructing how hands and\nobjects interact, such as contact and physical plausibility. Existing\napproaches often adopt a graph neural network to incorporate spatial\ninformation of hand and object meshes. However, these approaches have not fully\nexploited the potential of graphs without modification of edges within and\nbetween hand- and object-graphs. We propose a graph-based refinement method\nthat incorporates an interaction-aware graph-attention mechanism to account for\nhand-object interactions. Using edges, we establish connections among closely\ncorrelated nodes, both within individual graphs and across different graphs.\nExperiments demonstrate the effectiveness of our proposed method with notable\nimprovements in the realm of physical plausibility.\n","authors":["Taeyun Woo","Tae-Kyun Kim","Jinah Park"],"pdf_url":"https://arxiv.org/pdf/2409.17629v1.pdf","comment":"7 pages, Accepted by ICIP 2024"},{"id":"http://arxiv.org/abs/2409.17622v1","updated":"2024-09-26T08:16:59Z","published":"2024-09-26T08:16:59Z","title":"Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric\n GNNs","summary":" Geometric graph neural networks (GNNs) have emerged as powerful tools for\nmodeling molecular geometry. However, they encounter limitations in effectively\ncapturing long-range interactions in large molecular systems. To address this\nchallenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs\nto expand the scope of their capabilities by incorporating mesh points\nalongside atoms and reimaging traditional mathematical operations in a\ntrainable manner. Neural P$^3$M exhibits flexibility across a wide range of\nmolecular systems and demonstrates remarkable accuracy in predicting energies\nand forces, outperforming on benchmarks such as the MD22 dataset. It also\nachieves an average improvement of 22% on the OE62 dataset while integrating\nwith various architectures.\n","authors":["Yusong Wang","Chaoran Cheng","Shaoning Li","Yuxuan Ren","Bin Shao","Ge Liu","Pheng-Ann Heng","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.17622v1.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2311.09802v2","updated":"2024-09-26T08:15:50Z","published":"2023-11-16T11:26:21Z","title":"Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs","summary":" Two lines of approaches are adopted for complex reasoning with LLMs. One line\nof work prompts LLMs with various reasoning structures, while the structural\noutputs can be naturally regarded as intermediate reasoning steps. Another line\nof work adopt LLM-free declarative solvers to do the reasoning task, rendering\nhigher reasoning accuracy but lacking interpretability due to the black-box\nnature of the solvers. Aiming to resolve the trade-off between answer accuracy\nand interpretability, we present a simple extension to the latter line of work.\nSpecifically, we showcase that the intermediate search logs generated by Prolog\ninterpreters can be accessed and interpreted into human-readable reasoning\nproofs. As long as LLMs correctly translate problem descriptions into Prolog\nrepresentations, the corresponding reasoning proofs are ensured to be causal\nand reliable. On two logical reasoning and one arithmetic reasoning datasets,\nour framework obtains significant improvements in terms of both answer accuracy\nand reasoning proof accuracy. Our code is released at\nhttps://github.com/DAMO-NLP-SG/CaRing\n","authors":["Sen Yang","Xin Li","Leyang Cui","Lidong Bing","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2311.09802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14208v2","updated":"2024-09-26T08:12:59Z","published":"2024-06-20T11:26:06Z","title":"SeCoKD: Aligning Large Language Models for In-Context Learning with\n Fewer Shots","summary":" Previous studies have shown that demonstrations can significantly help Large\nLanguage Models (LLMs ) perform better on the given tasks. However, this\nso-called In-Context Learning ( ICL ) ability is very sensitive to the\npresenting context, and often dozens of demonstrations are needed. In this\nwork, we investigate if we can reduce the shot number while still maintaining a\ncompetitive performance. We present SeCoKD, a self-Knowledge Distillation ( KD\n) training framework that aligns the student model with a heavily prompted\nvariation, thereby increasing the utilization of a single demonstration. We\nexperiment with the SeCoKD across three LLMs and six benchmarks focusing mainly\non reasoning tasks. Results show that our method outperforms the base model and\nSupervised Fine-tuning ( SFT ), especially in zero-shot and one-shot settings\nby 30% and 10%, respectively. Moreover, SeCoKD brings little negative artifacts\nwhen evaluated on new tasks, which is more robust than Supervised Fine-tuning.\n","authors":["Weixing Wang","Haojin Yang","Christoph Meinel"],"pdf_url":"https://arxiv.org/pdf/2406.14208v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04428v2","updated":"2024-09-26T07:53:04Z","published":"2024-09-06T17:48:44Z","title":"Hybrid Spiking Neural Networks for Low-Power Intra-Cortical\n Brain-Machine Interfaces","summary":" Intra-cortical brain-machine interfaces (iBMIs) have the potential to\ndramatically improve the lives of people with paraplegia by restoring their\nability to perform daily activities. However, current iBMIs suffer from\nscalability and mobility limitations due to bulky hardware and wiring. Wireless\niBMIs offer a solution but are constrained by a limited data rate. To overcome\nthis challenge, we are investigating hybrid spiking neural networks for\nembedded neural decoding in wireless iBMIs. The networks consist of a temporal\nconvolution-based compression followed by recurrent processing and a final\ninterpolation back to the original sequence length. As recurrent units, we\nexplore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons,\nand a combination of both - spiking GRUs (sGRUs) and analyze their differences\nin terms of accuracy, footprint, and activation sparsity. To that end, we train\ndecoders on the \"Nonhuman Primate Reaching with Multichannel Sensorimotor\nCortex Electrophysiology\" dataset and evaluate it using the NeuroBench\nframework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural\nDecoding. Our approach achieves high accuracy in predicting velocities of\nprimate reaching movements from multichannel primary motor cortex recordings\nwhile maintaining a low number of synaptic operations, surpassing the current\nbaseline models in the NeuroBench framework. This work highlights the potential\nof hybrid neural networks to facilitate wireless iBMIs with high decoding\nprecision and a substantial increase in the number of monitored neurons, paving\nthe way toward more advanced neuroprosthetic technologies.\n","authors":["Alexandru Vasilache","Jann Krausse","Klaus Knobloch","Juergen Becker"],"pdf_url":"https://arxiv.org/pdf/2409.04428v2.pdf","comment":"This work has been accepted at the 2024 IEEE Biomedical Circuits and\n Systems Conference"},{"id":"http://arxiv.org/abs/2409.17607v1","updated":"2024-09-26T07:47:50Z","published":"2024-09-26T07:47:50Z","title":"Dirichlet-Based Coarse-to-Fine Example Selection For Open-Set Annotation","summary":" Active learning (AL) has achieved great success by selecting the most\nvaluable examples from unlabeled data. However, they usually deteriorate in\nreal scenarios where open-set noise gets involved, which is studied as open-set\nannotation (OSA). In this paper, we owe the deterioration to the unreliable\npredictions arising from softmax-based translation invariance and propose a\nDirichlet-based Coarse-to-Fine Example Selection (DCFS) strategy accordingly.\nOur method introduces simplex-based evidential deep learning (EDL) to break\ntranslation invariance and distinguish known and unknown classes by considering\nevidence-based data and distribution uncertainty simultaneously. Furthermore,\nhard known-class examples are identified by model discrepancy generated from\ntwo classifier heads, where we amplify and alleviate the model discrepancy\nrespectively for unknown and known classes. Finally, we combine the discrepancy\nwith uncertainties to form a two-stage strategy, selecting the most informative\nexamples from known classes. Extensive experiments on various openness ratio\ndatasets demonstrate that DCFS achieves state-of-art performance.\n","authors":["Ye-Wen Wang","Chen-Chen Zong","Ming-Kun Xie","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17602v1","updated":"2024-09-26T07:36:49Z","published":"2024-09-26T07:36:49Z","title":"Open Digital Rights Enforcement Framework (ODRE): from descriptive to\n enforceable policies","summary":" From centralised platforms to decentralised ecosystems, like Data Spaces,\nsharing data has become a paramount challenge. For this reason, the definition\nof data usage policies has become crucial in these domains, highlighting the\nnecessity of effective policy enforcement mechanisms. The Open Digital Rights\nLanguage (ODRL) is a W3C standard ontology designed to describe data usage\npolicies, however, it lacks built-in enforcement capabilities, limiting its\npractical application. This paper introduces the Open Digital Rights\nEnforcement (ODRE) framework, whose goal is to provide ODRL with enforcement\ncapabilities. The ODRE framework proposes a novel approach to express ODRL\npolicies that integrates the descriptive ontology terms of ODRL with other\nlanguages that allow behaviour specification, such as dynamic data handling or\nfunction evaluation. The framework includes an enforcement algorithm for ODRL\npolicies and two open-source implementations in Python and Java. The ODRE\nframework is also designed to support future extensions of ODRL to specific\ndomain scenarios. In addition, current limitations of ODRE, ODRL, and current\nchallenges are reported. Finally, to demonstrate the enforcement capabilities\nof the implementations, their performance, and their extensibility features,\nseveral experiments have been carried out with positive results.\n","authors":["Andrea Cimmino","Juan Cano-Benito","Raúl García-Castro"],"pdf_url":"https://arxiv.org/pdf/2409.17602v1.pdf","comment":"20 pages, 3 Figures, Submitted to Computers & Security journal"},{"id":"http://arxiv.org/abs/2409.17601v1","updated":"2024-09-26T07:35:23Z","published":"2024-09-26T07:35:23Z","title":"TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for\n Multimodal Contrastive Learning","summary":" Pre-trained large models for multimodal contrastive learning, such as CLIP,\nhave been widely recognized in the industry as highly susceptible to\ndata-poisoned backdoor attacks. This poses significant risks to downstream\nmodel training. In response to such potential threats, finetuning offers a\nsimpler and more efficient defense choice compared to retraining large models\nwith augmented data. In the supervised learning domain, fine-tuning defense\nstrategies can achieve excellent defense performance. However, in the\nunsupervised and semi-supervised domain, we find that when CLIP faces some\ncomplex attack techniques, the existing fine-tuning defense strategy,\nCleanCLIP, has some limitations on defense performance. The synonym\nsubstitution of its text-augmentation is insufficient to enhance the text\nfeature space. To compensate for this weakness, we improve it by proposing a\nfine-grained \\textbf{T}ext \\textbf{A}lignment \\textbf{C}leaner (TA-Cleaner) to\ncut off feature connections of backdoor triggers. We randomly select a few\nsamples for positive and negative subtext generation at each epoch of\nCleanCLIP, and align the subtexts to the images to strengthen the text\nself-supervision. We evaluate the effectiveness of our TA-Cleaner against six\nattack algorithms and conduct comprehensive zero-shot classification tests on\nImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves\nstate-of-the-art defensiveness among finetuning-based defense techniques. Even\nwhen faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms\nCleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\\% and 63.88\\%,\nrespectively.\n","authors":["Yuan Xun","Siyuan Liang","Xiaojun Jia","Xinwei Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17596v1","updated":"2024-09-26T07:22:38Z","published":"2024-09-26T07:22:38Z","title":"Subjective and Objective Quality-of-Experience Evaluation Study for Live\n Video Streaming","summary":" In recent years, live video streaming has gained widespread popularity across\nvarious social media platforms. Quality of experience (QoE), which reflects\nend-users' satisfaction and overall experience, plays a critical role for media\nservice providers to optimize large-scale live compression and transmission\nstrategies to achieve perceptually optimal rate-distortion trade-off. Although\nmany QoE metrics for video-on-demand (VoD) have been proposed, there remain\nsignificant challenges in developing QoE metrics for live video streaming. To\nbridge this gap, we conduct a comprehensive study of subjective and objective\nQoE evaluations for live video streaming. For the subjective QoE study, we\nintroduce the first live video streaming QoE dataset, TaoLive QoE, which\nconsists of $42$ source videos collected from real live broadcasts and $1,155$\ncorresponding distorted ones degraded due to a variety of streaming\ndistortions, including conventional streaming distortions such as compression,\nstalling, as well as live streaming-specific distortions like frame skipping,\nvariable frame rate, etc. Subsequently, a human study was conducted to derive\nsubjective QoE scores of videos in the TaoLive QoE dataset. For the objective\nQoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well\nas publicly available QoE datasets for VoD scenarios, highlighting that current\nmodels struggle to accurately assess video QoE, particularly for live content.\nHence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates\nmulti-scale semantic features and optical flow-based motion features to\npredicting a retrospective QoE score, eliminating reliance on statistical\nquality of service (QoS) features.\n","authors":["Zehao Zhu","Wei Sun","Jun Jia","Wei Wu","Sibin Deng","Kai Li","Ying Chen","Xiongkuo Min","Jia Wang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.17596v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17592v1","updated":"2024-09-26T07:19:12Z","published":"2024-09-26T07:19:12Z","title":"Deep Manifold Part 1: Anatomy of Neural Network Manifold","summary":" Based on the numerical manifold method principle, we developed a mathematical\nframework of a neural network manifold: Deep Manifold and discovered that\nneural networks: 1) is numerical computation combining forward and inverse; 2)\nhave near infinite degrees of freedom; 3) exponential learning capacity with\ndepth; 4) have self-progressing boundary conditions; 5) has training hidden\nbottleneck. We also define two concepts: neural network learning space and deep\nmanifold space and introduce two concepts: neural network intrinsic pathway and\nfixed point. We raise three fundamental questions: 1). What is the training\ncompletion definition; 2). where is the deep learning convergence point (neural\nnetwork fixed point); 3). How important is token timestamp in training data\ngiven negative time is critical in inverse problem.\n","authors":["Max Y. Ma","Gen-Hua Shi"],"pdf_url":"https://arxiv.org/pdf/2409.17592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17589v1","updated":"2024-09-26T07:12:04Z","published":"2024-09-26T07:12:04Z","title":"Improving Fast Adversarial Training via Self-Knowledge Guidance","summary":" Adversarial training has achieved remarkable advancements in defending\nagainst adversarial attacks. Among them, fast adversarial training (FAT) is\ngaining attention for its ability to achieve competitive robustness with fewer\ncomputing resources. Existing FAT methods typically employ a uniform strategy\nthat optimizes all training data equally without considering the influence of\ndifferent examples, which leads to an imbalanced optimization. However, this\nimbalance remains unexplored in the field of FAT. In this paper, we conduct a\ncomprehensive study of the imbalance issue in FAT and observe an obvious class\ndisparity regarding their performances. This disparity could be embodied from a\nperspective of alignment between clean and robust accuracy. Based on the\nanalysis, we mainly attribute the observed misalignment and disparity to the\nimbalanced optimization in FAT, which motivates us to optimize different\ntraining data adaptively to enhance robustness. Specifically, we take disparity\nand misalignment into consideration. First, we introduce self-knowledge guided\nregularization, which assigns differentiated regularization weights to each\nclass based on its training state, alleviating class disparity. Additionally,\nwe propose self-knowledge guided label relaxation, which adjusts label\nrelaxation according to the training accuracy, alleviating the misalignment and\nimproving robustness. By combining these methods, we formulate the\nSelf-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge\nduring training to enhance the adversarial robustness without compromising\ntraining efficiency. Extensive experiments on four standard datasets\ndemonstrate that the SKG-FAT improves the robustness and preserves competitive\nclean accuracy, outperforming the state-of-the-art methods.\n","authors":["Chengze Jiang","Junkai Wang","Minjing Dong","Jie Gui","Xinli Shi","Yuan Cao","Yuan Yan Tang","James Tin-Yau Kwok"],"pdf_url":"https://arxiv.org/pdf/2409.17589v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2409.17587v1","updated":"2024-09-26T07:07:08Z","published":"2024-09-26T07:07:08Z","title":"Multimodal Banking Dataset: Understanding Client Needs through Event\n Sequences","summary":" Financial organizations collect a huge amount of data about clients that\ntypically has a temporal (sequential) structure and is collected from various\nsources (modalities). Due to privacy issues, there are no large-scale\nopen-source multimodal datasets of event sequences, which significantly limits\nthe research in this area. In this paper, we present the industrial-scale\npublicly available multimodal banking dataset, MBD, that contains more than\n1.5M corporate clients with several modalities: 950M bank transactions, 1B geo\nposition events, 5M embeddings of dialogues with technical support and monthly\naggregated purchases of four bank's products. All entries are properly\nanonymized from real proprietary bank data. Using this dataset, we introduce a\nnovel benchmark with two business tasks: campaigning (purchase prediction in\nthe next month) and matching of clients. We provide numerical results that\ndemonstrate the superiority of our multi-modal baselines over single-modal\ntechniques for each task. As a result, the proposed dataset can open new\nperspectives and facilitate the future development of practically important\nlarge-scale multimodal algorithms for event sequences.\n HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD\n Github Link: https://github.com/Dzhambo/MBD\n","authors":["Mollaev Dzhambulat","Alexander Kostin","Postnova Maria","Ivan Karpukhin","Ivan A Kireev","Gleb Gusev","Andrey Savchenko"],"pdf_url":"https://arxiv.org/pdf/2409.17587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2406.10267v2","updated":"2024-09-26T06:57:27Z","published":"2024-06-11T09:24:18Z","title":"Unused information in token probability distribution of generative LLM:\n improving LLM reading comprehension through calculation of expected values","summary":" LLM text decoding is key component for perceived LLM quality. We demonstrate\ntwo experiments showing that decoding methods could be improved by manipulation\nof token probabilities. First, we test few LLM on SummEval summary scoring\ndataset, to measure reading comprehension. We compare scores from greedy\ndecoding to expected values over the next token distribution. We scale logits\nby large temperature to increase the entropy of scores. This allows strong\nimprovement of performance on SummEval (in terms of correlations to human\njudgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from\n20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part\nof the gain seems related to positional bias. Secondly, we use\nprobability-based tree sampling algorithm, to examine all most probable\ngenerations for given prompt.\n","authors":["Krystian Zawistowski"],"pdf_url":"https://arxiv.org/pdf/2406.10267v2.pdf","comment":"7 pages, 1 figure, presented at FEDCSIS 2024 conference,"},{"id":"http://arxiv.org/abs/2409.17581v1","updated":"2024-09-26T06:57:22Z","published":"2024-09-26T06:57:22Z","title":"A Scalable Data-Driven Framework for Systematic Analysis of SEC 10-K\n Filings Using Large Language Models","summary":" The number of companies listed on the NYSE has been growing exponentially,\ncreating a significant challenge for market analysts, traders, and stockholders\nwho must monitor and assess the performance and strategic shifts of a large\nnumber of companies regularly. There is an increasing need for a fast,\ncost-effective, and comprehensive method to evaluate the performance and detect\nand compare many companies' strategy changes efficiently. We propose a novel\ndata-driven approach that leverages large language models (LLMs) to\nsystematically analyze and rate the performance of companies based on their SEC\n10-K filings. These filings, which provide detailed annual reports on a\ncompany's financial performance and strategic direction, serve as a rich source\nof data for evaluating various aspects of corporate health, including\nconfidence, environmental sustainability, innovation, and workforce management.\nWe also introduce an automated system for extracting and preprocessing 10-K\nfilings. This system accurately identifies and segments the required sections\nas outlined by the SEC, while also isolating key textual content that contains\ncritical information about the company. This curated data is then fed into\nCohere's Command-R+ LLM to generate quantitative ratings across various\nperformance metrics. These ratings are subsequently processed and visualized to\nprovide actionable insights. The proposed scheme is then implemented on an\ninteractive GUI as a no-code solution for running the data pipeline and\ncreating the visualizations. The application showcases the rating results and\nprovides year-on-year comparisons of company performance.\n","authors":["Syed Affan Daimi","Asma Iqbal"],"pdf_url":"https://arxiv.org/pdf/2409.17581v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17580v1","updated":"2024-09-26T06:53:29Z","published":"2024-09-26T06:53:29Z","title":"Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case\n Study","summary":" Extracting meaningful insights from large and complex datasets poses\nsignificant challenges, particularly in ensuring the accuracy and relevance of\nretrieved information. Traditional data retrieval methods such as sequential\nsearch and index-based retrieval often fail when handling intricate and\ninterconnected data structures, resulting in incomplete or misleading outputs.\nTo overcome these limitations, we introduce Structured-GraphRAG, a versatile\nframework designed to enhance information retrieval across structured datasets\nin natural language queries. Structured-GraphRAG utilizes multiple knowledge\ngraphs, which represent data in a structured format and capture complex\nrelationships between entities, enabling a more nuanced and comprehensive\nretrieval of information. This graph-based approach reduces the risk of errors\nin language model outputs by grounding responses in a structured format,\nthereby enhancing the reliability of results. We demonstrate the effectiveness\nof Structured-GraphRAG by comparing its performance with that of a recently\npublished method using traditional retrieval-augmented generation. Our findings\nshow that Structured-GraphRAG significantly improves query processing\nefficiency and reduces response times. While our case study focuses on soccer\ndata, the framework's design is broadly applicable, offering a powerful tool\nfor data analysis and enhancing language model applications across various\nstructured domains.\n","authors":["Zahra Sepasdar","Sushant Gautam","Cise Midoglu","Michael A. Riegler","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2409.17580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17572v1","updated":"2024-09-26T06:40:45Z","published":"2024-09-26T06:40:45Z","title":"Dr. GPT in Campus Counseling: Understanding Higher Education Students'\n Opinions on LLM-assisted Mental Health Services","summary":" In response to the increasing mental health challenges faced by college\nstudents, we sought to understand their perspectives on how AI applications,\nparticularly Large Language Models (LLMs), can be leveraged to enhance their\nmental well-being. Through pilot interviews with ten diverse students, we\nexplored their opinions on the use of LLMs across five fictional scenarios:\nGeneral Information Inquiry, Initial Screening, Reshaping Patient-Expert\nDynamics, Long-term Care, and Follow-up Care. Our findings revealed that\nstudents' acceptance of LLMs varied by scenario, with participants highlighting\nboth potential benefits, such as proactive engagement and personalized\nfollow-up care, and concerns, including limitations in training data and\nemotional support. These insights inform how AI technology should be designed\nand implemented to effectively support and enhance students' mental well-being,\nparticularly in scenarios where LLMs can complement traditional methods, while\nmaintaining empathy and respecting individual preferences.\n","authors":["Owen Xingjian Zhang","Shuyao Zhou","Jiayi Geng","Yuhan Liu","Sunny Xun Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17572v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2409.17568v1","updated":"2024-09-26T06:31:31Z","published":"2024-09-26T06:31:31Z","title":"Showing Many Labels in Multi-label Classification Models: An Empirical\n Study of Adversarial Examples","summary":" With the rapid development of Deep Neural Networks (DNNs), they have been\napplied in numerous fields. However, research indicates that DNNs are\nsusceptible to adversarial examples, and this is equally true in the\nmulti-label domain. To further investigate multi-label adversarial examples, we\nintroduce a novel type of attacks, termed \"Showing Many Labels\". The objective\nof this attack is to maximize the number of labels included in the classifier's\nprediction results. In our experiments, we select nine attack algorithms and\nevaluate their performance under \"Showing Many Labels\". Eight of the attack\nalgorithms were adapted from the multi-class environment to the multi-label\nenvironment, while the remaining one was specifically designed for the\nmulti-label environment. We choose ML-LIW and ML-GCN as target models and train\nthem on four popular multi-label datasets: VOC2007, VOC2012, NUS-WIDE, and\nCOCO. We record the success rate of each algorithm when it shows the expected\nnumber of labels in eight different scenarios. Experimental results indicate\nthat under the \"Showing Many Labels\", iterative attacks perform significantly\nbetter than one-step attacks. Moreover, it is possible to show all labels in\nthe dataset.\n","authors":["Yujiang Liu","Wenjian Luo","Zhijian Chen","Muhammad Luqman Naseem"],"pdf_url":"https://arxiv.org/pdf/2409.17568v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.12598v2","updated":"2024-09-26T06:31:25Z","published":"2024-08-22T17:59:01Z","title":"ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor\n Reconstruction","summary":" Neural implicit reconstruction via volume rendering has demonstrated its\neffectiveness in recovering dense 3D surfaces. However, it is non-trivial to\nsimultaneously recover meticulous geometry and preserve smoothness across\nregions with differing characteristics. To address this issue, previous methods\ntypically employ geometric priors, which are often constrained by the\nperformance of the prior models. In this paper, we propose ND-SDF, which learns\na Normal Deflection field to represent the angular deviation between the scene\nnormal and the prior normal. Unlike previous methods that uniformly apply\ngeometric priors on all samples, introducing significant bias in accuracy, our\nproposed normal deflection field dynamically learns and adapts the utilization\nof samples based on their specific characteristics, thereby improving both the\naccuracy and effectiveness of the model. Our method not only obtains smooth\nweakly textured regions such as walls and floors but also preserves the\ngeometric details of complex structures. In addition, we introduce a novel ray\nsampling strategy based on the deflection angle to facilitate the unbiased\nrendering process, which significantly improves the quality and accuracy of\nintricate surfaces, especially on thin structures. Consistent improvements on\nvarious challenging datasets demonstrate the superiority of our method.\n","authors":["Ziyu Tang","Weicai Ye","Yifan Wang","Di Huang","Hujun Bao","Tong He","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16997v2","updated":"2024-09-26T06:13:04Z","published":"2024-09-25T15:02:25Z","title":"INT-FlashAttention: Enabling Flash Attention for INT8 Quantization","summary":" As the foundation of large language models (LLMs), self-attention module\nfaces the challenge of quadratic time and memory complexity with respect to\nsequence length. FlashAttention accelerates attention computation and reduces\nits memory usage by leveraging the GPU memory hierarchy. A promising research\ndirection is to integrate FlashAttention with quantization methods. This paper\nintroduces INT-FlashAttention, the first INT8 quantization architecture\ncompatible with the forward workflow of FlashAttention, which significantly\nimproves the inference speed of FlashAttention on Ampere GPUs. We implement our\nINT-FlashAttention prototype with fully INT8 activations and general\nmatrix-multiplication (GEMM) kernels, making it the first attention operator\nwith fully INT8 input. As a general token-level post-training quantization\nframework, INT-FlashAttention is also compatible with other data formats like\nINT4, etc. Experimental results show INT-FlashAttention achieves 72% faster\ninference speed and 82% smaller quantization error compared to standard\nFlashAttention with FP16 and FP8 data format.\n","authors":["Shimao Chen","Zirui Liu","Zhiying Wu","Ce Zheng","Peizhuang Cong","Zihan Jiang","Yuhan Wu","Lei Su","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2409.16997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02569v2","updated":"2024-09-26T05:57:37Z","published":"2024-04-03T08:42:36Z","title":"SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing","summary":" Cooking robots can enhance the home experience by reducing the burden of\ndaily chores. However, these robots must perform their tasks dexterously and\nsafely in shared human environments, especially when handling dangerous tools\nsuch as kitchen knives. This study focuses on enabling a robot to autonomously\nand safely learn food-cutting tasks. More specifically, our goal is to enable a\ncollaborative robot or industrial robot arm to perform food-slicing tasks by\nadapting to varying material properties using compliance control. Our approach\ninvolves using Reinforcement Learning (RL) to train a robot to compliantly\nmanipulate a knife, by reducing the contact forces exerted by the food items\nand by the cutting board. However, training the robot in the real world can be\ninefficient, and dangerous, and result in a lot of food waste. Therefore, we\nproposed SliceIt!, a framework for safely and efficiently learning robot\nfood-slicing tasks in simulation. Following a real2sim2real approach, our\nframework consists of collecting a few real food slicing data, calibrating our\ndual simulation environment (a high-fidelity cutting simulator and a robotic\nsimulator), learning compliant control policies on the calibrated simulation\nenvironment, and finally, deploying the policies on the real robot.\n","authors":["Cristian C. Beltran-Hernandez","Nicolas Erbetti","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2404.02569v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2406.14990v2","updated":"2024-09-26T05:51:20Z","published":"2024-06-21T09:03:37Z","title":"Learning Variable Compliance Control From a Few Demonstrations for\n Bimanual Robot with Haptic Feedback Teleoperation System","summary":" Automating dexterous, contact-rich manipulation tasks using rigid robots is a\nsignificant challenge in robotics. Rigid robots, defined by their actuation\nthrough position commands, face issues of excessive contact forces due to their\ninability to adapt to contact with the environment, potentially causing damage.\nWhile compliance control schemes have been introduced to mitigate these issues\nby controlling forces via external sensors, they are hampered by the need for\nfine-tuning task-specific controller parameters. Learning from Demonstrations\n(LfD) offers an intuitive alternative, allowing robots to learn manipulations\nthrough observed actions. In this work, we introduce a novel system to enhance\nthe teaching of dexterous, contact-rich manipulations to rigid robots. Our\nsystem is twofold: firstly, it incorporates a teleoperation interface utilizing\nVirtual Reality (VR) controllers, designed to provide an intuitive and\ncost-effective method for task demonstration with haptic feedback. Secondly, we\npresent Comp-ACT (Compliance Control via Action Chunking with Transformers), a\nmethod that leverages the demonstrations to learn variable compliance control\nfrom a few demonstrations. Our methods have been validated across various\ncomplex contact-rich manipulation tasks using single-arm and bimanual robot\nsetups in simulated and real-world environments, demonstrating the\neffectiveness of our system in teaching robots dexterous manipulations with\nenhanced adaptability and safety. Code available at:\nhttps://github.com/omron-sinicx/CompACT\n","authors":["Tatsuya Kamijo","Cristian C. Beltran-Hernandez","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2406.14990v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2406.06911v3","updated":"2024-09-26T05:47:36Z","published":"2024-06-11T03:09:37Z","title":"AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising","summary":" Diffusion models have garnered significant interest from the community for\ntheir great generative ability across various applications. However, their\ntypical multi-step sequential-denoising nature gives rise to high cumulative\nlatency, thereby precluding the possibilities of parallel computation. To\naddress this, we introduce AsyncDiff, a universal and plug-and-play\nacceleration scheme that enables model parallelism across multiple devices. Our\napproach divides the cumbersome noise prediction model into multiple\ncomponents, assigning each to a different device. To break the dependency chain\nbetween these components, it transforms the conventional sequential denoising\ninto an asynchronous process by exploiting the high similarity between hidden\nstates in consecutive diffusion steps. Consequently, each component is\nfacilitated to compute in parallel on separate devices. The proposed strategy\nsignificantly reduces inference latency while minimally impacting the\ngenerative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff\nachieves a 2.7x speedup with negligible degradation and a 4.0x speedup with\nonly a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our\nexperiments also demonstrate that AsyncDiff can be readily applied to video\ndiffusion models with encouraging performances. The code is available at\nhttps://github.com/czg1225/AsyncDiff.\n","authors":["Zigeng Chen","Xinyin Ma","Gongfan Fang","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06911v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.15763v2","updated":"2024-09-26T05:43:08Z","published":"2024-09-24T05:39:53Z","title":"IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through\n Semantic Comprehension in Retrieval-Augmented Generation Scenarios","summary":" In Retrieval-Augmented Generation (RAG) tasks using Large Language Models\n(LLMs), the quality of retrieved information is critical to the final output.\nThis paper introduces the IRSC benchmark for evaluating the performance of\nembedding models in multilingual RAG tasks. The benchmark encompasses five\nretrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval,\nkeyword retrieval, and summary retrieval. Our research addresses the current\nlack of comprehensive testing and effective comparison methods for embedding\nmodels in RAG scenarios. We introduced new metrics: the Similarity of Semantic\nComprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI),\nand evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our\ncontributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and\n3) insights into the cross-lingual limitations of embedding models. The IRSC\nbenchmark aims to enhance the understanding and development of accurate\nretrieval systems in RAG tasks. All code and datasets are available at:\nhttps://github.com/Jasaxion/IRSC_Benchmark\n","authors":["Hai Lin","Shaoxiong Zhan","Junyou Su","Haitao Zheng","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17547v1","updated":"2024-09-26T05:33:30Z","published":"2024-09-26T05:33:30Z","title":"Triple Point Masking","summary":" Existing 3D mask learning methods encounter performance bottlenecks under\nlimited data, and our objective is to overcome this limitation. In this paper,\nwe introduce a triple point masking scheme, named TPM, which serves as a\nscalable framework for pre-training of masked autoencoders to achieve\nmulti-mask learning for 3D point clouds. Specifically, we augment the baselines\nwith two additional mask choices (i.e., medium mask and low mask) as our core\ninsight is that the recovery process of an object can manifest in diverse ways.\nPrevious high-masking schemes focus on capturing the global representation but\nlack the fine-grained recovery capability, so that the generated pre-trained\nweights tend to play a limited role in the fine-tuning process. With the\nsupport of the proposed TPM, available methods can exhibit more flexible and\naccurate completion capabilities, enabling the potential autoencoder in the\npre-training stage to consider multiple representations of a single 3D object.\nIn addition, an SVM-guided weight selection module is proposed to fill the\nencoder parameters for downstream networks with the optimal weight during the\nfine-tuning stage, maximizing linear accuracy and facilitating the acquisition\nof intricate representations for new objects. Extensive experiments show that\nthe four baselines equipped with the proposed TPM achieve comprehensive\nperformance improvements on various downstream tasks.\n","authors":["Jiaming Liu","Linghe Kong","Yue Wu","Maoguo Gong","Hao Li","Qiguang Miao","Wenping Ma","Can Qin"],"pdf_url":"https://arxiv.org/pdf/2409.17547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2402.11645v2","updated":"2024-09-26T05:15:00Z","published":"2024-02-18T16:55:54Z","title":"Image Denoising with Machine Learning: A Novel Approach to Improve\n Quantum Image Processing Quality and Reliability","summary":" Quantum Image Processing (QIP) is a field that aims to utilize the benefits\nof quantum computing for manipulating and analyzing images. However, QIP faces\ntwo challenges: the limitation of qubits and the presence of noise in a quantum\nmachine. In this research, we propose a novel approach to address the issue of\nnoise in QIP. By training and employing a machine learning model that\nidentifies and corrects the noise in quantum-processed images, we can\ncompensate for the noisiness caused by the machine and retrieve a processing\nresult similar to that performed by a classical computer with higher\nefficiency. The model is trained by learning a dataset consisting of both\nexisting processed images and quantum-processed images from open-access\ndatasets. This model will be capable of providing us with the confidence level\nfor each pixel and its potential original value. To assess the model's accuracy\nin compensating for loss and decoherence in QIP, we evaluate it using three\nmetrics: Peak Signal to Noise Ratio (PSNR), Structural Similarity Index (SSIM),\nand Mean Opinion Score (MOS). Additionally, we discuss the applicability of our\nmodel across domains well as its cost effectiveness compared to alternative\nmethods.\n","authors":["Yifan Zhou","Yan Shing Liang"],"pdf_url":"https://arxiv.org/pdf/2402.11645v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17534v1","updated":"2024-09-26T04:41:08Z","published":"2024-09-26T04:41:08Z","title":"Just say what you want: only-prompting self-rewarding online preference\n optimization","summary":" We address the challenge of online Reinforcement Learning from Human Feedback\n(RLHF) with a focus on self-rewarding alignment methods. In online RLHF,\nobtaining feedback requires interaction with the environment, which can be\ncostly when using additional reward models or the GPT-4 API. Current\nself-rewarding approaches rely heavily on the discriminator's judgment\ncapabilities, which are effective for large-scale models but challenging to\ntransfer to smaller ones. To address these limitations, we propose a novel,\nonly-prompting self-rewarding online algorithm that generates preference\ndatasets without relying on judgment capabilities. Additionally, we employ\nfine-grained arithmetic control over the optimality gap between positive and\nnegative examples, generating more hard negatives in the later stages of\ntraining to help the model better capture subtle human preferences. Finally, we\nconduct extensive experiments on two base models, Mistral-7B and\nMistral-Instruct-7B, which significantly bootstrap the performance of the\nreference model, achieving 34.5% in the Length-controlled Win Rates of\nAlpacaEval 2.0.\n","authors":["Ruijie Xu","Zhihan Liu","Yongfei Liu","Shipeng Yan","Zhaoran Wang","Zhi Zhang","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2409.17534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17531v1","updated":"2024-09-26T04:36:19Z","published":"2024-09-26T04:36:19Z","title":"SimVG: A Simple Framework for Visual Grounding with Decoupled\n Multi-modal Fusion","summary":" Visual grounding is a common vision task that involves grounding descriptive\nsentences to the corresponding regions of an image. Most existing methods use\nindependent image-text encoding and apply complex hand-crafted modules or\nencoder-decoder architectures for modal interaction and query reasoning.\nHowever, their performance significantly drops when dealing with complex\ntextual expressions. This is because the former paradigm only utilizes limited\ndownstream data to fit the multi-modal feature fusion. Therefore, it is only\neffective when the textual expressions are relatively simple. In contrast,\ngiven the wide diversity of textual expressions and the uniqueness of\ndownstream training data, the existing fusion module, which extracts multimodal\ncontent from a visual-linguistic context, has not been fully investigated. In\nthis paper, we present a simple yet robust transformer-based framework, SimVG,\nfor visual grounding. Specifically, we decouple visual-linguistic feature\nfusion from downstream tasks by leveraging existing multimodal pre-trained\nmodels and incorporating additional object tokens to facilitate deep\nintegration of downstream and pre-training tasks. Furthermore, we design a\ndynamic weight-balance distillation method in the multi-branch synchronous\nlearning process to enhance the representation capability of the simpler\nbranch. This branch only consists of a lightweight MLP, which simplifies the\nstructure and improves reasoning speed. Experiments on six widely used VG\ndatasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the\nsuperiority of SimVG. Finally, the proposed method not only achieves\nimprovements in efficiency and convergence speed but also attains new\nstate-of-the-art performance on these benchmarks. Codes and models will be\navailable at \\url{https://github.com/Dmmm1997/SimVG}.\n","authors":["Ming Dai","Lingfeng Yang","Yihao Xu","Zhenhua Feng","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17531v1.pdf","comment":"21pages, 11figures, NeurIPS2024"},{"id":"http://arxiv.org/abs/2310.04696v3","updated":"2024-09-26T04:33:35Z","published":"2023-10-07T06:01:35Z","title":"Serving Deep Learning Model in Relational Databases","summary":" Serving deep learning (DL) models on relational data has become a critical\nrequirement across diverse commercial and scientific domains, sparking growing\ninterest recently. In this visionary paper, we embark on a comprehensive\nexploration of representative architectures to address the requirement. We\nhighlight three pivotal paradigms: The state-of-the-art DL-centric architecture\noffloads DL computations to dedicated DL frameworks. The potential UDF-centric\narchitecture encapsulates one or more tensor computations into User Defined\nFunctions (UDFs) within the relational database management system (RDBMS). The\npotential relation-centric architecture aims to represent a large-scale tensor\ncomputation through relational operators. While each of these architectures\ndemonstrates promise in specific use scenarios, we identify urgent requirements\nfor seamless integration of these architectures and the middle ground\nin-between these architectures. We delve into the gaps that impede the\nintegration and explore innovative strategies to close them. We present a\npathway to establish a novel RDBMS for enabling a broad class of data-intensive\nDL inference applications.\n","authors":["Lixi Zhou","Qi Lin","Kanchan Chowdhury","Saif Masood","Alexandre Eichenberger","Hong Min","Alexander Sim","Jie Wang","Yida Wang","Kesheng Wu","Binhang Yuan","Jia Zou"],"pdf_url":"https://arxiv.org/pdf/2310.04696v3.pdf","comment":"* Authors are ordered alphabetically; Jia Zou is the corresponding\n author"},{"id":"http://arxiv.org/abs/2409.17526v1","updated":"2024-09-26T04:27:44Z","published":"2024-09-26T04:27:44Z","title":"Drone Stereo Vision for Radiata Pine Branch Detection and Distance\n Measurement: Integrating SGBM and Segmentation Models","summary":" Manual pruning of radiata pine trees presents significant safety risks due to\ntheir substantial height and the challenging terrains in which they thrive. To\naddress these risks, this research proposes the development of a drone-based\npruning system equipped with specialized pruning tools and a stereo vision\ncamera, enabling precise detection and trimming of branches. Deep learning\nalgorithms, including YOLO and Mask R-CNN, are employed to ensure accurate\nbranch detection, while the Semi-Global Matching algorithm is integrated to\nprovide reliable distance estimation. The synergy between these techniques\nfacilitates the precise identification of branch locations and enables\nefficient, targeted pruning. Experimental results demonstrate that the combined\nimplementation of YOLO and SGBM enables the drone to accurately detect branches\nand measure their distances from the drone. This research not only improves the\nsafety and efficiency of pruning operations but also makes a significant\ncontribution to the advancement of drone technology in the automation of\nagricultural and forestry practices, laying a foundational framework for\nfurther innovations in environmental management.\n","authors":["Yida Lin","Bing Xue","Mengjie Zhang","Sam Schofield","Richard Green"],"pdf_url":"https://arxiv.org/pdf/2409.17526v1.pdf","comment":null}]},"2024-09-27T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2409.18052v2","updated":"2024-09-27T02:09:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems -- which account for almost all current\nAI -- can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborate on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17618v2","updated":"2024-09-27T01:45:26Z","published":"2024-09-26T08:10:28Z","title":"Learning Occlusion-aware Decision-making from Agent Interaction via\n Active Perception","summary":" Occlusion-aware decision-making is essential in autonomous driving due to the\nhigh uncertainty of various occlusions. Recent occlusion-aware decision-making\nmethods encounter issues such as high computational complexity, scenario\nscalability challenges, or reliance on limited expert data. Benefiting from\nautomatically generating data by exploration randomization, we uncover that\nreinforcement learning (RL) may show promise in occlusion-aware\ndecision-making. However, previous occlusion-aware RL faces challenges in\nexpanding to various dynamic and static occlusion scenarios, low learning\nefficiency, and lack of predictive ability. To address these issues, we\nintroduce Pad-AI, a self-reinforcing framework to learn occlusion-aware\ndecision-making through active perception. Pad-AI utilizes vectorized\nrepresentation to represent occluded environments efficiently and learns over\nthe semantic motion primitives to focus on high-level active perception\nexploration. Furthermore, Pad-AI integrates prediction and RL within a unified\nframework to provide risk-aware learning and security guarantees. Our framework\nwas tested in challenging scenarios under both dynamic and static occlusions\nand demonstrated efficient and general perception-aware exploration performance\nto other strong baselines in closed-loop evaluations.\n","authors":["Jie Jia","Yiming Shu","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.17618v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.17993v2","updated":"2024-09-27T02:35:47Z","published":"2024-09-26T16:04:31Z","title":"InterNet: Unsupervised Cross-modal Homography Estimation Based on\n Interleaved Modality Transfer and Self-supervised Homography Prediction","summary":" We propose a novel unsupervised cross-modal homography estimation framework,\nbased on interleaved modality transfer and self-supervised homography\nprediction, named InterNet. InterNet integrates modality transfer and\nself-supervised homography estimation, introducing an innovative interleaved\noptimization framework to alternately promote both components. The modality\ntransfer gradually narrows the modality gaps, facilitating the self-supervised\nhomography estimation to fully leverage the synthetic intra-modal data. The\nself-supervised homography estimation progressively achieves reliable\npredictions, thereby providing robust cross-modal supervision for the modality\ntransfer. To further boost the estimation accuracy, we also formulate a\nfine-grained homography feature loss to improve the connection between two\ncomponents. Furthermore, we employ a simple yet effective distillation training\ntechnique to reduce model parameters and improve cross-domain generalization\nability while maintaining comparable performance. Experiments reveal that\nInterNet achieves the state-of-the-art (SOTA) performance among unsupervised\nmethods, and even outperforms many supervised methods such as MHN and\nLocalTrans.\n","authors":["Junchen Yu","Si-Yuan Cao","Runmin Zhang","Chenghao Zhang","Jianxin Hu","Zhu Yu","Beinan Yu","Hui-liang Shen"],"pdf_url":"https://arxiv.org/pdf/2409.17993v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17851v2","updated":"2024-09-27T15:59:45Z","published":"2024-09-26T13:57:05Z","title":"A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts","summary":" Monocular depth estimation is a critical task for autonomous driving and many\nother computer vision applications. While significant progress has been made in\nthis field, the effects of viewpoint shifts on depth estimation models remain\nlargely underexplored. This paper introduces a novel dataset and evaluation\nmethodology to quantify the impact of different camera positions and\norientations on monocular depth estimation performance. We propose a ground\ntruth strategy based on homography estimation and object detection, eliminating\nthe need for expensive lidar sensors. We collect a diverse dataset of road\nscenes from multiple viewpoints and use it to assess the robustness of a modern\ndepth estimation model to geometric shifts. After assessing the validity of our\nstrategy on a public dataset, we provide valuable insights into the limitations\nof current models and highlight the importance of considering viewpoint\nvariations in real-world applications.\n","authors":["Aurel Pjetri","Stefano Caprasecca","Leonardo Taccari","Matteo Simoncini","Henrique Piñeiro Monteagudo","Walter Wallace","Douglas Coimbra de Andrade","Francesco Sambo","Andrew David Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2409.17851v2.pdf","comment":"17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on\n Vision-Centric Autonomous Driving (VCAD)"},{"id":"http://arxiv.org/abs/2409.17763v2","updated":"2024-09-27T06:50:21Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50% of papers do not\nassess performance variability at all. Moreover, only one (0.5%) paper reported\nconfidence intervals (CIs) for model performance. (2) To address the reporting\nbottleneck, we show that the unreported standard deviation (SD) in segmentation\npapers can be approximated by a second-order polynomial function of the mean\nDice similarity coefficient (DSC). Based on external validation data from 56\nprevious MICCAI challenges, we demonstrate that this approximation can\naccurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v2.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17671v2","updated":"2024-09-27T10:02:53Z","published":"2024-09-26T09:30:37Z","title":"Leveraging Anthropometric Measurements to Improve Human Mesh Estimation\n and Ensure Consistent Body Shapes","summary":" The basic body shape of a person does not change within a single video.\nHowever, most SOTA human mesh estimation (HME) models output a slightly\ndifferent body shape for each video frame, which results in inconsistent body\nshapes for the same person. In contrast, we leverage anthropometric\nmeasurements like tailors are already obtaining from humans for centuries. We\ncreate a model called A2B that converts such anthropometric measurements to\nbody shape parameters of human mesh models. Moreover, we find that finetuned\nSOTA 3D human pose estimation (HPE) models outperform HME models regarding the\nprecision of the estimated keypoints. We show that applying inverse kinematics\n(IK) to the results of such a 3D HPE model and combining the resulting body\npose with the A2B body shape leads to superior and consistent human meshes for\nchallenging datasets like ASPset or fit3D, where we can lower the MPJPE by over\n30 mm compared to SOTA HME models. Further, replacing HME models estimates of\nthe body shape parameters with A2B model results not only increases the\nperformance of these HME models, but also leads to consistent body shapes.\n","authors":["Katja Ludwig","Julian Lorenz","Daniel Kienzle","Tuan Bui","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2409.17671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16225v3","updated":"2024-09-27T14:04:22Z","published":"2024-09-24T16:38:41Z","title":"VideoPatchCore: An Effective Method to Memorize Normality for Video\n Anomaly Detection","summary":" Video anomaly detection (VAD) is a crucial task in video analysis and\nsurveillance within computer vision. Currently, VAD is gaining attention with\nmemory techniques that store the features of normal frames. The stored features\nare utilized for frame reconstruction, identifying an abnormality when a\nsignificant difference exists between the reconstructed and input frames.\nHowever, this approach faces several challenges due to the simultaneous\noptimization required for both the memory and encoder-decoder model. These\nchallenges include increased optimization difficulty, complexity of\nimplementation, and performance variability depending on the memory size. To\naddress these challenges,we propose an effective memory method for VAD, called\nVideoPatchCore. Inspired by PatchCore, our approach introduces a structure that\nprioritizes memory optimization and configures three types of memory tailored\nto the characteristics of video data. This method effectively addresses the\nlimitations of existing memory-based methods, achieving good performance\ncomparable to state-of-the-art methods. Furthermore, our method requires no\ntraining and is straightforward to implement, making VAD tasks more accessible.\nOur code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore.\n","authors":["Sunghyun Ahn","Youngwan Jo","Kijung Lee","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2409.16225v3.pdf","comment":"Accepted to ACCV 2024"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2409.15816v2","updated":"2024-09-27T08:07:55Z","published":"2024-09-24T07:27:00Z","title":"Diffusion Models for Intelligent Transportation Systems: A Survey","summary":" Intelligent Transportation Systems (ITS) are vital in modern traffic\nmanagement and optimization, significantly enhancing traffic efficiency and\nsafety. Recently, diffusion models have emerged as transformative tools for\naddressing complex challenges within ITS. In this paper, we present a\ncomprehensive survey of diffusion models for ITS, covering both theoretical and\npractical aspects. First, we introduce the theoretical foundations of diffusion\nmodels and their key variants, including conditional diffusion models and\nlatent diffusion models, highlighting their suitability for modeling complex,\nmulti-modal traffic data and enabling controllable generation. Second, we\noutline the primary challenges in ITS and the corresponding advantages of\ndiffusion models, providing readers with a deeper understanding of the\nintersection between ITS and diffusion models. Third, we offer a\nmulti-perspective investigation of current applications of diffusion models in\nITS domains, including autonomous driving, traffic simulation, trajectory\nprediction, and traffic safety. Finally, we discuss state-of-the-art diffusion\nmodel techniques and highlight key ITS research directions that warrant further\ninvestigation. Through this structured overview, we aim to provide researchers\nwith a comprehensive understanding of diffusion models for ITS, thereby\nadvancing their future applications in the transportation domain.\n","authors":["Mingxing Peng","Kehua Chen","Xusen Guo","Qiming Zhang","Hongliang Lu","Hui Zhong","Di Chen","Meixin Zhu","Hai Yang"],"pdf_url":"https://arxiv.org/pdf/2409.15816v2.pdf","comment":"7 figures"},{"id":"http://arxiv.org/abs/2409.14738v2","updated":"2024-09-27T07:13:22Z","published":"2024-09-23T06:34:06Z","title":"Enabling On-Chip High-Frequency Adaptive Linear Optimal Control via\n Linearized Gaussian Process","summary":" Unpredictable and complex aerodynamic effects pose significant challenges to\nachieving precise flight control, such as the downwash effect from upper\nvehicles to lower ones. Conventional methods often struggle to accurately model\nthese interactions, leading to controllers that require large safety margins\nbetween vehicles. Moreover, the controller on real drones usually requires\nhigh-frequency and has limited on-chip computation, making the adaptive control\ndesign more difficult to implement. To address these challenges, we incorporate\nGaussian process (GP) to model the adaptive external aerodynamics with linear\nmodel predictive control. The GP is linearized to enable real-time\nhigh-frequency solutions. Moreover, to handle the error caused by\nlinearization, we integrate end-to-end Bayesian optimization during sample\ncollection stages to improve the control performance. Experimental results on\nboth simulations and real quadrotors show that we can achieve real-time\nsolvable computation speed with acceptable tracking errors.\n","authors":["Yuan Gao","Yinyi Lai","Jun Wang","Yini Fang"],"pdf_url":"https://arxiv.org/pdf/2409.14738v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.17985v2","updated":"2024-09-27T02:08:56Z","published":"2024-09-26T15:55:59Z","title":"Hypergame Theory for Decentralized Resource Allocation in Multi-user\n Semantic Communications","summary":" Semantic communications (SC) is an emerging communication paradigm in which\nwireless devices can send only relevant information from a source of data while\nrelying on computing resources to regenerate missing data points. However, the\ndesign of a multi-user SC system becomes more challenging because of the\ncomputing and communication overhead required for coordination. Existing\nsolutions for learning the semantic language and performing resource allocation\noften fail to capture the computing and communication tradeoffs involved in\nmultiuser SC. To address this gap, a novel framework for decentralized\ncomputing and communication resource allocation in multiuser SC systems is\nproposed. The challenge of efficiently allocating communication and computing\nresources (for reasoning) in a decentralized manner to maximize the quality of\ntask experience for the end users is addressed through the application of\nStackelberg hyper game theory. Leveraging the concept of second-level hyper\ngames, novel analytical formulations are developed to model misperceptions of\nthe users about each other's communication and control strategies. Further,\nequilibrium analysis of the learned resource allocation protocols examines the\nconvergence of the computing and communication strategies to a local\nStackelberg equilibria, considering misperceptions. Simulation results show\nthat the proposed Stackelberg hyper game results in efficient usage of\ncommunication and computing resources while maintaining a high quality of\nexperience for the users compared to state-of-the-art that does not account for\nthe misperceptions.\n","authors":["Christo Kurisummoottil Thomas","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2409.17985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17763v2","updated":"2024-09-27T06:50:21Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50% of papers do not\nassess performance variability at all. Moreover, only one (0.5%) paper reported\nconfidence intervals (CIs) for model performance. (2) To address the reporting\nbottleneck, we show that the unreported standard deviation (SD) in segmentation\npapers can be approximated by a second-order polynomial function of the mean\nDice similarity coefficient (DSC). Based on external validation data from 56\nprevious MICCAI challenges, we demonstrate that this approximation can\naccurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v2.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17745v2","updated":"2024-09-27T08:19:29Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v2.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17699v2","updated":"2024-09-27T10:16:37Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17545v2","updated":"2024-09-27T06:48:08Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keep the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v2.pdf","comment":"8pages, submitted to AAAI 2025"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2409.18052v2","updated":"2024-09-27T02:09:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems -- which account for almost all current\nAI -- can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborate on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17994v2","updated":"2024-09-27T04:03:19Z","published":"2024-09-26T16:06:38Z","title":"CRoP: Context-wise Robust Static Human-Sensing Personalization","summary":" The advancement in deep learning and internet-of-things have led to diverse\nhuman sensing applications. However, distinct patterns in human sensing,\ninfluenced by various factors or contexts, challenge generic neural network\nmodel's performance due to natural distribution shifts. To address this,\npersonalization tailors models to individual users. Yet most personalization\nstudies overlook intra-user heterogeneity across contexts in sensory data,\nlimiting intra-user generalizability. This limitation is especially critical in\nclinical applications, where limited data availability hampers both\ngeneralizability and personalization. Notably, intra-user sensing attributes\nare expected to change due to external factors such as treatment progression,\nfurther complicating the challenges. This work introduces CRoP, a novel static\npersonalization approach using an off-the-shelf pre-trained model and pruning\nto optimize personalization and generalization. CRoP shows superior\npersonalization effectiveness and intra-user robustness across four\nhuman-sensing datasets, including two from real-world health domains,\nhighlighting its practical and social impact. Additionally, to support CRoP's\ngeneralization ability and design choices, we provide empirical justification\nthrough gradient inner product analysis, ablation studies, and comparisons\nagainst state-of-the-art baselines.\n","authors":["Sawinder Kaur","Avery Gump","Jingyu Xin","Yi Xiao","Harshit Sharma","Nina R Benway","Jonathan L Preston","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2409.17994v2.pdf","comment":"31 pages, 10 figues and 13 tables"},{"id":"http://arxiv.org/abs/2409.17763v2","updated":"2024-09-27T06:50:21Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50% of papers do not\nassess performance variability at all. Moreover, only one (0.5%) paper reported\nconfidence intervals (CIs) for model performance. (2) To address the reporting\nbottleneck, we show that the unreported standard deviation (SD) in segmentation\npapers can be approximated by a second-order polynomial function of the mean\nDice similarity coefficient (DSC). Based on external validation data from 56\nprevious MICCAI challenges, we demonstrate that this approximation can\naccurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v2.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17699v2","updated":"2024-09-27T10:16:37Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06802v3","updated":"2024-09-27T06:13:06Z","published":"2024-05-10T20:29:25Z","title":"Summarizing Radiology Reports Findings into Impressions","summary":" Patient hand-off and triage are two fundamental problems in health care.\nOften doctors must painstakingly summarize complex findings to efficiently\ncommunicate with specialists and quickly make decisions on which patients have\nthe most urgent cases. In pursuit of these challenges, we present (1) a model\nwith state-of-art radiology report summarization performance using (2) a novel\nmethod for augmenting medical data, and (3) an analysis of the model\nlimitations and radiology knowledge gain. We also provide a data processing\npipeline for future models developed on the the MIMIC CXR dataset. Our best\nperforming model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100\nROUGE-L F1, which outperformed specialized checkpoints with more sophisticated\nattention mechanisms. We investigate these aspects in this work.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v3.pdf","comment":"This version reverts to the original preprint, following the advice\n from the Artificial Intelligence in Health editorial office. The published\n version is peer-reviewed and available in the journal (see external DOI). The\n preprint remains unchanged to maintain version transparency, as noted in the\n further disclosure section of the published article"},{"id":"http://arxiv.org/abs/2409.17545v2","updated":"2024-09-27T06:48:08Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keep the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v2.pdf","comment":"8pages, submitted to AAAI 2025"}],"Computation and Language":[{"id":"http://arxiv.org/abs/2409.17827v2","updated":"2024-09-27T16:07:54Z","published":"2024-09-26T13:26:46Z","title":"BeanCounter: A low-toxicity, large-scale, and open dataset of\n business-oriented text","summary":" Many of the recent breakthroughs in language modeling have resulted from\nscaling effectively the same model architecture to larger datasets. In this\nvein, recent work has highlighted performance gains from increasing training\ndataset size and quality, suggesting a need for novel sources of large-scale\ndatasets. In this work, we introduce BeanCounter, a public dataset consisting\nof more than 159B tokens extracted from businesses' disclosures. We show that\nthis data is indeed novel: less than 0.1% of BeanCounter appears in Common\nCrawl-based datasets and it is an order of magnitude larger than datasets\nrelying on similar sources. Given the data's provenance, we hypothesize that\nBeanCounter is comparatively more factual and less toxic than web-based\ndatasets. Exploring this hypothesis, we find that many demographic identities\noccur with similar prevalence in BeanCounter but with significantly less toxic\ncontext relative to other datasets. To demonstrate the utility of BeanCounter,\nwe evaluate and compare two LLMs continually pre-trained on BeanCounter with\ntheir base models. We find an 18-33% reduction in toxic generation and improved\nperformance within the finance domain for the continually pretrained models.\nCollectively, our work suggests that BeanCounter is a novel source of\nlow-toxicity and high-quality domain-specific data with sufficient scale to\ntrain multi-billion parameter LLMs.\n","authors":["Siyan Wang","Bradford Levy"],"pdf_url":"https://arxiv.org/pdf/2409.17827v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17745v2","updated":"2024-09-27T08:19:29Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v2.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2403.01432v4","updated":"2024-09-27T13:47:33Z","published":"2024-03-03T08:07:55Z","title":"Fine Tuning vs. Retrieval Augmented Generation for Less Popular\n Knowledge","summary":" Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting\nstrong performance across diverse tasks and domains. However, it has been\nobserved that the performance diminishes when dealing with less-popular or\nlow-frequency concepts and entities, for example in domain specific\napplications. The two prominent approaches to enhance the performance of LMs on\nlow-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning\n(FT) over synthetic data. This paper explores and evaluates the impact of RAG\nand FT on customizing LMs in handling low-frequency entities on question\nanswering tasks. We conduct extensive experiments on twelve LMs of varying size\nand type and different fine tuning, data augmentation, and retrieval models.\nOur findings indicate that while FT boosts the performance across entities of\nvarying popularity, RAG surpasses FT by a large margin particularly for least\npopular factual knowledge. Additionally, the success of both RAG and FT\napproaches is amplified by improving retrieval and data augmentation\ntechniques. Fine tuning, while beneficial for small LMs, requires extensive\nresources. To address this issue, we propose the new Stimulus RAG approach that\nsurpasses the effectiveness of fine tuning based approaches, thereby\neliminating the need for the costly data augmentation and fine tuning step for\nenriching LMs with less popular factual knowledge.\n","authors":["Heydar Soudani","Evangelos Kanoulas","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2403.01432v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06802v3","updated":"2024-09-27T06:13:06Z","published":"2024-05-10T20:29:25Z","title":"Summarizing Radiology Reports Findings into Impressions","summary":" Patient hand-off and triage are two fundamental problems in health care.\nOften doctors must painstakingly summarize complex findings to efficiently\ncommunicate with specialists and quickly make decisions on which patients have\nthe most urgent cases. In pursuit of these challenges, we present (1) a model\nwith state-of-art radiology report summarization performance using (2) a novel\nmethod for augmenting medical data, and (3) an analysis of the model\nlimitations and radiology knowledge gain. We also provide a data processing\npipeline for future models developed on the the MIMIC CXR dataset. Our best\nperforming model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100\nROUGE-L F1, which outperformed specialized checkpoints with more sophisticated\nattention mechanisms. We investigate these aspects in this work.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v3.pdf","comment":"This version reverts to the original preprint, following the advice\n from the Artificial Intelligence in Health editorial office. The published\n version is peer-reviewed and available in the journal (see external DOI). The\n preprint remains unchanged to maintain version transparency, as noted in the\n further disclosure section of the published article"},{"id":"http://arxiv.org/abs/2409.17545v2","updated":"2024-09-27T06:48:08Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keep the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v2.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17391v2","updated":"2024-09-27T02:18:22Z","published":"2024-09-25T22:08:31Z","title":"Scaling Behavior for Large Language Models regarding Numeral Systems: An\n Example using Pythia","summary":" Though Large Language Models (LLMs) have shown remarkable abilities in\nmathematics reasoning, they are still struggling with performing numeric\noperations accurately, such as addition and multiplication. Numbers can be\ntokenized into tokens in various ways by different LLMs and affect the numeric\noperations performance. Currently, there are two representatives: 1) Tokenize\ninto $1$-digit, and 2) Tokenize into $1\\sim 3$ digit. The difference is roughly\nequivalent to using different numeral systems (namely base $10$ or base\n$10^{3}$). In light of this, we study the scaling behavior of different numeral\nsystems in the context of transformer-based large language models. We\nempirically show that a base $10$ system is consistently more data-efficient\nthan a base $10^{2}$ or $10^{3}$ system across training data scale, model sizes\nunder from-scratch training settings, while different number systems have very\nsimilar fine-tuning performances. We attribute this to higher token frequencies\nof a base $10$ system. Additionally, we reveal extrapolation behavior patterns\non addition and multiplication. We identify that base $100$ and base $1000$\nsystems struggle on token-level discernment and token-level operations. We also\nsheds light on the mechanism learnt by the models.\n","authors":["Zhejian Zhou","Jiayu Wang","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17391v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17353v2","updated":"2024-09-27T01:42:54Z","published":"2024-09-25T20:59:12Z","title":"Internalizing ASR with Implicit Chain of Thought for Efficient\n Speech-to-Speech Conversational LLM","summary":" Current speech-based LLMs are predominantly trained on extensive ASR and TTS\ndatasets, excelling in tasks related to these domains. However, their ability\nto handle direct speech-to-speech conversations remains notably constrained.\nThese models often rely on an ASR-to-TTS chain-of-thought pipeline, converting\nspeech into text for processing before generating audio responses, which\nintroduces latency and loses audio features. We propose a method that\nimplicitly internalizes ASR chain of thought into a speech LLM, enhancing its\nnative speech understanding capabilities. Our approach reduces latency and\nimproves the model's native understanding of speech, paving the way for more\nefficient and natural real-time audio interactions. We also release a\nlarge-scale synthetic conversational dataset to facilitate further research.\n","authors":["Robin Shing-Hei Yuen","Timothy Tin-Long Tse","Jian Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.17353v2.pdf","comment":"Corrected style from final to preprint"},{"id":"http://arxiv.org/abs/2409.17213v2","updated":"2024-09-27T12:12:44Z","published":"2024-09-25T17:38:39Z","title":"Plurals: A System for Guiding LLMs Via Simulated Social Ensembles","summary":" Recent debates raised concerns that language models may favor certain\nviewpoints. But what if the solution is not to aim for a 'view from nowhere'\nbut rather to leverage different viewpoints? We introduce Plurals, a system and\nPython library for pluralistic AI deliberation. Plurals consists of Agents\n(LLMs, optionally with personas) which deliberate within customizable\nStructures, with Moderators overseeing deliberation. Plurals is a generator of\nsimulated social ensembles. Plurals integrates with government datasets to\ncreate nationally representative personas, includes deliberation templates\ninspired by democratic deliberation theory, and allows users to customize both\ninformation-sharing structures and deliberation behavior within Structures. Six\ncase studies demonstrate fidelity to theoretical constructs and efficacy. Three\nrandomized experiments show simulated focus groups produced output resonant\nwith an online sample of the relevant audiences (chosen over zero-shot\ngeneration in 75% of trials). Plurals is both a paradigm and a concrete system\nfor pluralistic AI. The Plurals library is available at\nhttps://github.com/josh-ashkinaze/plurals and will be continually updated.\n","authors":["Joshua Ashkinaze","Emily Fry","Narendra Edara","Eric Gilbert","Ceren Budak"],"pdf_url":"https://arxiv.org/pdf/2409.17213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16937v2","updated":"2024-09-27T11:16:35Z","published":"2024-09-25T13:51:19Z","title":"Semi-Supervised Cognitive State Classification from Speech with\n Multi-View Pseudo-Labeling","summary":" The lack of labeled data is a common challenge in speech classification\ntasks, particularly those requiring extensive subjective assessment, such as\ncognitive state classification. In this work, we propose a Semi-Supervised\nLearning (SSL) framework, introducing a novel multi-view pseudo-labeling method\nthat leverages both acoustic and linguistic characteristics to select the most\nconfident data for training the classification model. Acoustically, unlabeled\ndata are compared to labeled data using the Frechet audio distance, calculated\nfrom embeddings generated by multiple audio encoders. Linguistically, large\nlanguage models are prompted to revise automatic speech recognition\ntranscriptions and predict labels based on our proposed task-specific\nknowledge. High-confidence data are identified when pseudo-labels from both\nsources align, while mismatches are treated as low-confidence data. A bimodal\nclassifier is then trained to iteratively label the low-confidence data until a\npredefined criterion is met. We evaluate our SSL framework on emotion\nrecognition and dementia detection tasks. Experimental results demonstrate that\nour method achieves competitive performance compared to fully supervised\nlearning using only 30% of the labeled data and significantly outperforms two\nselected baselines.\n","authors":["Yuanchao Li","Zixing Zhang","Jing Han","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2409.16937v2.pdf","comment":null}],"General Literature":[{"id":"http://arxiv.org/abs/2301.09771v6","updated":"2024-09-27T06:57:04Z","published":"2023-01-24T00:57:37Z","title":"Automation and AI Technology in Surface Mining With a Brief Introduction\n to Open-Pit Operations in the Pilbara","summary":" This survey article provides a synopsis on some of the engineering problems,\ntechnological innovations, robotic development and automation efforts\nencountered in the mining industry -- particularly in the Pilbara iron-ore\nregion of Western Australia. The goal is to paint the technology landscape and\nhighlight issues relevant to an engineering audience to raise awareness of AI\nand automation trends in mining. It assumes the reader has no prior knowledge\nof mining and builds context gradually through focused discussion and short\nsummaries of common open-pit mining operations. The principal activities that\ntake place may be categorized in terms of resource development, mine-, rail-\nand port operations. From mineral exploration to ore shipment, there are\nroughly nine steps in between. These include: geological assessment, mine\nplanning and development, production drilling and assaying, blasting and\nexcavation, transportation of ore and waste, crush and screen, stockpile and\nload-out, rail network distribution, and ore-car dumping. The objective is to\ndescribe these processes and provide insights on some of the\nchallenges/opportunities from the perspective of a decade-long\nindustry-university R&D partnership.\n","authors":["Raymond Leung","Andrew J Hill","Arman Melkumyan"],"pdf_url":"https://arxiv.org/pdf/2301.09771v6.pdf","comment":"Accepted manuscript. Paper provides insights on state-of-the-art\n technologies and future trends. Keywords: Mining automation, robotics,\n intelligent systems, machine learning, remote sensing, geostatistics,\n planning, scheduling, optimization, modelling, geology, complex systems.\n Document: 21 pages, 6 figures, 2 tables. 2024 Update: Added ICRA conference\n poster + slides as ancilliary files"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Robotics 2 + +
+
+
+ + ♻ ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems -- which account for almost all current +AI -- can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborate on a search task assigned by a human. + +
+
+
+
+
+ + ♻ Learning Occlusion-aware Decision-making from Agent Interaction via + Active Perception + + +
+ Occlusion-aware decision-making is essential in autonomous driving due to the +high uncertainty of various occlusions. Recent occlusion-aware decision-making +methods encounter issues such as high computational complexity, scenario +scalability challenges, or reliance on limited expert data. Benefiting from +automatically generating data by exploration randomization, we uncover that +reinforcement learning (RL) may show promise in occlusion-aware +decision-making. However, previous occlusion-aware RL faces challenges in +expanding to various dynamic and static occlusion scenarios, low learning +efficiency, and lack of predictive ability. To address these issues, we +introduce Pad-AI, a self-reinforcing framework to learn occlusion-aware +decision-making through active perception. Pad-AI utilizes vectorized +representation to represent occluded environments efficiently and learns over +the semantic motion primitives to focus on high-level active perception +exploration. Furthermore, Pad-AI integrates prediction and RL within a unified +framework to provide risk-aware learning and security guarantees. Our framework +was tested in challenging scenarios under both dynamic and static occlusions +and demonstrated efficient and general perception-aware exploration performance +to other strong baselines in closed-loop evaluations. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 5 + +
+
+
+ + ♻ ☆ InterNet: Unsupervised Cross-modal Homography Estimation Based on + Interleaved Modality Transfer and Self-supervised Homography Prediction + + +
+ We propose a novel unsupervised cross-modal homography estimation framework, +based on interleaved modality transfer and self-supervised homography +prediction, named InterNet. InterNet integrates modality transfer and +self-supervised homography estimation, introducing an innovative interleaved +optimization framework to alternately promote both components. The modality +transfer gradually narrows the modality gaps, facilitating the self-supervised +homography estimation to fully leverage the synthetic intra-modal data. The +self-supervised homography estimation progressively achieves reliable +predictions, thereby providing robust cross-modal supervision for the modality +transfer. To further boost the estimation accuracy, we also formulate a +fine-grained homography feature loss to improve the connection between two +components. Furthermore, we employ a simple yet effective distillation training +technique to reduce model parameters and improve cross-domain generalization +ability while maintaining comparable performance. Experiments reveal that +InterNet achieves the state-of-the-art (SOTA) performance among unsupervised +methods, and even outperforms many supervised methods such as MHN and +LocalTrans. + +
+
+
+
+
+ + ♻ ☆ A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts ECCV 2024 + + +
+ Monocular depth estimation is a critical task for autonomous driving and many +other computer vision applications. While significant progress has been made in +this field, the effects of viewpoint shifts on depth estimation models remain +largely underexplored. This paper introduces a novel dataset and evaluation +methodology to quantify the impact of different camera positions and +orientations on monocular depth estimation performance. We propose a ground +truth strategy based on homography estimation and object detection, eliminating +the need for expensive lidar sensors. We collect a diverse dataset of road +scenes from multiple viewpoints and use it to assess the robustness of a modern +depth estimation model to geometric shifts. After assessing the validity of our +strategy on a public dataset, we provide valuable insights into the limitations +of current models and highlight the importance of considering viewpoint +variations in real-world applications. + +
+
+ comment: 17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on + Vision-Centric Autonomous Driving (VCAD) +
+
+
+
+
+ + ♻ ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50% of papers do not +assess performance variability at all. Moreover, only one (0.5%) paper reported +confidence intervals (CIs) for model performance. (2) To address the reporting +bottleneck, we show that the unreported standard deviation (SD) in segmentation +papers can be approximated by a second-order polynomial function of the mean +Dice similarity coefficient (DSC). Based on external validation data from 56 +previous MICCAI challenges, we demonstrate that this approximation can +accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ Leveraging Anthropometric Measurements to Improve Human Mesh Estimation + and Ensure Consistent Body Shapes + + +
+ The basic body shape of a person does not change within a single video. +However, most SOTA human mesh estimation (HME) models output a slightly +different body shape for each video frame, which results in inconsistent body +shapes for the same person. In contrast, we leverage anthropometric +measurements like tailors are already obtaining from humans for centuries. We +create a model called A2B that converts such anthropometric measurements to +body shape parameters of human mesh models. Moreover, we find that finetuned +SOTA 3D human pose estimation (HPE) models outperform HME models regarding the +precision of the estimated keypoints. We show that applying inverse kinematics +(IK) to the results of such a 3D HPE model and combining the resulting body +pose with the A2B body shape leads to superior and consistent human meshes for +challenging datasets like ASPset or fit3D, where we can lower the MPJPE by over +30 mm compared to SOTA HME models. Further, replacing HME models estimates of +the body shape parameters with A2B model results not only increases the +performance of these HME models, but also leads to consistent body shapes. + +
+
+
+
+
+ + ♻ ☆ VideoPatchCore: An Effective Method to Memorize Normality for Video + Anomaly Detection ACCV 2024 + + +
+ Video anomaly detection (VAD) is a crucial task in video analysis and +surveillance within computer vision. Currently, VAD is gaining attention with +memory techniques that store the features of normal frames. The stored features +are utilized for frame reconstruction, identifying an abnormality when a +significant difference exists between the reconstructed and input frames. +However, this approach faces several challenges due to the simultaneous +optimization required for both the memory and encoder-decoder model. These +challenges include increased optimization difficulty, complexity of +implementation, and performance variability depending on the memory size. To +address these challenges,we propose an effective memory method for VAD, called +VideoPatchCore. Inspired by PatchCore, our approach introduces a structure that +prioritizes memory optimization and configures three types of memory tailored +to the characteristics of video data. This method effectively addresses the +limitations of existing memory-based methods, achieving good performance +comparable to state-of-the-art methods. Furthermore, our method requires no +training and is straightforward to implement, making VAD tasks more accessible. +Our code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore. + +
+
+ comment: Accepted to ACCV 2024 +
+
+
+
+
+
+
+
+ + Systems and Control 2 + +
+
+
+ + ♻ ☆ Diffusion Models for Intelligent Transportation Systems: A Survey + + +
+ Intelligent Transportation Systems (ITS) are vital in modern traffic +management and optimization, significantly enhancing traffic efficiency and +safety. Recently, diffusion models have emerged as transformative tools for +addressing complex challenges within ITS. In this paper, we present a +comprehensive survey of diffusion models for ITS, covering both theoretical and +practical aspects. First, we introduce the theoretical foundations of diffusion +models and their key variants, including conditional diffusion models and +latent diffusion models, highlighting their suitability for modeling complex, +multi-modal traffic data and enabling controllable generation. Second, we +outline the primary challenges in ITS and the corresponding advantages of +diffusion models, providing readers with a deeper understanding of the +intersection between ITS and diffusion models. Third, we offer a +multi-perspective investigation of current applications of diffusion models in +ITS domains, including autonomous driving, traffic simulation, trajectory +prediction, and traffic safety. Finally, we discuss state-of-the-art diffusion +model techniques and highlight key ITS research directions that warrant further +investigation. Through this structured overview, we aim to provide researchers +with a comprehensive understanding of diffusion models for ITS, thereby +advancing their future applications in the transportation domain. + +
+
+ comment: 7 figures +
+
+
+
+
+ + ♻ ☆ Enabling On-Chip High-Frequency Adaptive Linear Optimal Control via + Linearized Gaussian Process + + +
+ Unpredictable and complex aerodynamic effects pose significant challenges to +achieving precise flight control, such as the downwash effect from upper +vehicles to lower ones. Conventional methods often struggle to accurately model +these interactions, leading to controllers that require large safety margins +between vehicles. Moreover, the controller on real drones usually requires +high-frequency and has limited on-chip computation, making the adaptive control +design more difficult to implement. To address these challenges, we incorporate +Gaussian process (GP) to model the adaptive external aerodynamics with linear +model predictive control. The GP is linearized to enable real-time +high-frequency solutions. Moreover, to handle the error caused by +linearization, we integrate end-to-end Bayesian optimization during sample +collection stages to improve the control performance. Experimental results on +both simulations and real quadrotors show that we can achieve real-time +solvable computation speed with acceptable tracking errors. + +
+
+
+
+
+
+
+
+ + Machine Learning 5 + +
+
+
+ + ♻ ☆ Hypergame Theory for Decentralized Resource Allocation in Multi-user + Semantic Communications + + +
+ Semantic communications (SC) is an emerging communication paradigm in which +wireless devices can send only relevant information from a source of data while +relying on computing resources to regenerate missing data points. However, the +design of a multi-user SC system becomes more challenging because of the +computing and communication overhead required for coordination. Existing +solutions for learning the semantic language and performing resource allocation +often fail to capture the computing and communication tradeoffs involved in +multiuser SC. To address this gap, a novel framework for decentralized +computing and communication resource allocation in multiuser SC systems is +proposed. The challenge of efficiently allocating communication and computing +resources (for reasoning) in a decentralized manner to maximize the quality of +task experience for the end users is addressed through the application of +Stackelberg hyper game theory. Leveraging the concept of second-level hyper +games, novel analytical formulations are developed to model misperceptions of +the users about each other's communication and control strategies. Further, +equilibrium analysis of the learned resource allocation protocols examines the +convergence of the computing and communication strategies to a local +Stackelberg equilibria, considering misperceptions. Simulation results show +that the proposed Stackelberg hyper game results in efficient usage of +communication and computing resources while maintaining a high quality of +experience for the users compared to state-of-the-art that does not account for +the misperceptions. + +
+
+
+
+
+ + ♻ ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50% of papers do not +assess performance variability at all. Moreover, only one (0.5%) paper reported +confidence intervals (CIs) for model performance. (2) To address the reporting +bottleneck, we show that the unreported standard deviation (SD) in segmentation +papers can be approximated by a second-order polynomial function of the mean +Dice similarity coefficient (DSC). Based on external validation data from 56 +previous MICCAI challenges, we demonstrate that this approximation can +accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ♻ ☆ Modulated Intervention Preference Optimization (MIPO): Keep the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+
+
+
+ + Artificial Intelligence 6 + +
+
+
+ + ♻ ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems -- which account for almost all current +AI -- can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborate on a search task assigned by a human. + +
+
+
+
+
+ + ♻ ☆ CRoP: Context-wise Robust Static Human-Sensing Personalization + + +
+ The advancement in deep learning and internet-of-things have led to diverse +human sensing applications. However, distinct patterns in human sensing, +influenced by various factors or contexts, challenge generic neural network +model's performance due to natural distribution shifts. To address this, +personalization tailors models to individual users. Yet most personalization +studies overlook intra-user heterogeneity across contexts in sensory data, +limiting intra-user generalizability. This limitation is especially critical in +clinical applications, where limited data availability hampers both +generalizability and personalization. Notably, intra-user sensing attributes +are expected to change due to external factors such as treatment progression, +further complicating the challenges. This work introduces CRoP, a novel static +personalization approach using an off-the-shelf pre-trained model and pruning +to optimize personalization and generalization. CRoP shows superior +personalization effectiveness and intra-user robustness across four +human-sensing datasets, including two from real-world health domains, +highlighting its practical and social impact. Additionally, to support CRoP's +generalization ability and design choices, we provide empirical justification +through gradient inner product analysis, ablation studies, and comparisons +against state-of-the-art baselines. + +
+
+ comment: 31 pages, 10 figues and 13 tables +
+
+
+
+
+ + ♻ ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50% of papers do not +assess performance variability at all. Moreover, only one (0.5%) paper reported +confidence intervals (CIs) for model performance. (2) To address the reporting +bottleneck, we show that the unreported standard deviation (SD) in segmentation +papers can be approximated by a second-order polynomial function of the mean +Dice similarity coefficient (DSC). Based on external validation data from 56 +previous MICCAI challenges, we demonstrate that this approximation can +accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ♻ ☆ Summarizing Radiology Reports Findings into Impressions + + +
+ Patient hand-off and triage are two fundamental problems in health care. +Often doctors must painstakingly summarize complex findings to efficiently +communicate with specialists and quickly make decisions on which patients have +the most urgent cases. In pursuit of these challenges, we present (1) a model +with state-of-art radiology report summarization performance using (2) a novel +method for augmenting medical data, and (3) an analysis of the model +limitations and radiology knowledge gain. We also provide a data processing +pipeline for future models developed on the the MIMIC CXR dataset. Our best +performing model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100 +ROUGE-L F1, which outperformed specialized checkpoints with more sophisticated +attention mechanisms. We investigate these aspects in this work. + +
+
+ comment: This version reverts to the original preprint, following the advice + from the Artificial Intelligence in Health editorial office. The published + version is peer-reviewed and available in the journal (see external DOI). The + preprint remains unchanged to maintain version transparency, as noted in the + further disclosure section of the published article +
+
+
+
+
+ + ♻ ☆ Modulated Intervention Preference Optimization (MIPO): Keep the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+
+
+
+ + Computation and Language 9 + +
+
+
+ + ♻ ☆ BeanCounter: A low-toxicity, large-scale, and open dataset of + business-oriented text + + +
+ Many of the recent breakthroughs in language modeling have resulted from +scaling effectively the same model architecture to larger datasets. In this +vein, recent work has highlighted performance gains from increasing training +dataset size and quality, suggesting a need for novel sources of large-scale +datasets. In this work, we introduce BeanCounter, a public dataset consisting +of more than 159B tokens extracted from businesses' disclosures. We show that +this data is indeed novel: less than 0.1% of BeanCounter appears in Common +Crawl-based datasets and it is an order of magnitude larger than datasets +relying on similar sources. Given the data's provenance, we hypothesize that +BeanCounter is comparatively more factual and less toxic than web-based +datasets. Exploring this hypothesis, we find that many demographic identities +occur with similar prevalence in BeanCounter but with significantly less toxic +context relative to other datasets. To demonstrate the utility of BeanCounter, +we evaluate and compare two LLMs continually pre-trained on BeanCounter with +their base models. We find an 18-33% reduction in toxic generation and improved +performance within the finance domain for the continually pretrained models. +Collectively, our work suggests that BeanCounter is a novel source of +low-toxicity and high-quality domain-specific data with sufficient scale to +train multi-billion parameter LLMs. + +
+
+
+
+
+ + ♻ ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Fine Tuning vs. Retrieval Augmented Generation for Less Popular + Knowledge + + +
+ Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting +strong performance across diverse tasks and domains. However, it has been +observed that the performance diminishes when dealing with less-popular or +low-frequency concepts and entities, for example in domain specific +applications. The two prominent approaches to enhance the performance of LMs on +low-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning +(FT) over synthetic data. This paper explores and evaluates the impact of RAG +and FT on customizing LMs in handling low-frequency entities on question +answering tasks. We conduct extensive experiments on twelve LMs of varying size +and type and different fine tuning, data augmentation, and retrieval models. +Our findings indicate that while FT boosts the performance across entities of +varying popularity, RAG surpasses FT by a large margin particularly for least +popular factual knowledge. Additionally, the success of both RAG and FT +approaches is amplified by improving retrieval and data augmentation +techniques. Fine tuning, while beneficial for small LMs, requires extensive +resources. To address this issue, we propose the new Stimulus RAG approach that +surpasses the effectiveness of fine tuning based approaches, thereby +eliminating the need for the costly data augmentation and fine tuning step for +enriching LMs with less popular factual knowledge. + +
+
+
+
+
+ + ♻ ☆ Summarizing Radiology Reports Findings into Impressions + + +
+ Patient hand-off and triage are two fundamental problems in health care. +Often doctors must painstakingly summarize complex findings to efficiently +communicate with specialists and quickly make decisions on which patients have +the most urgent cases. In pursuit of these challenges, we present (1) a model +with state-of-art radiology report summarization performance using (2) a novel +method for augmenting medical data, and (3) an analysis of the model +limitations and radiology knowledge gain. We also provide a data processing +pipeline for future models developed on the the MIMIC CXR dataset. Our best +performing model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100 +ROUGE-L F1, which outperformed specialized checkpoints with more sophisticated +attention mechanisms. We investigate these aspects in this work. + +
+
+ comment: This version reverts to the original preprint, following the advice + from the Artificial Intelligence in Health editorial office. The published + version is peer-reviewed and available in the journal (see external DOI). The + preprint remains unchanged to maintain version transparency, as noted in the + further disclosure section of the published article +
+
+
+
+
+ + ♻ ☆ Modulated Intervention Preference Optimization (MIPO): Keep the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Scaling Behavior for Large Language Models regarding Numeral Systems: An + Example using Pythia EMNLP 2024 + + +
+ Though Large Language Models (LLMs) have shown remarkable abilities in +mathematics reasoning, they are still struggling with performing numeric +operations accurately, such as addition and multiplication. Numbers can be +tokenized into tokens in various ways by different LLMs and affect the numeric +operations performance. Currently, there are two representatives: 1) Tokenize +into $1$-digit, and 2) Tokenize into $1\sim 3$ digit. The difference is roughly +equivalent to using different numeral systems (namely base $10$ or base +$10^{3}$). In light of this, we study the scaling behavior of different numeral +systems in the context of transformer-based large language models. We +empirically show that a base $10$ system is consistently more data-efficient +than a base $10^{2}$ or $10^{3}$ system across training data scale, model sizes +under from-scratch training settings, while different number systems have very +similar fine-tuning performances. We attribute this to higher token frequencies +of a base $10$ system. Additionally, we reveal extrapolation behavior patterns +on addition and multiplication. We identify that base $100$ and base $1000$ +systems struggle on token-level discernment and token-level operations. We also +sheds light on the mechanism learnt by the models. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Internalizing ASR with Implicit Chain of Thought for Efficient + Speech-to-Speech Conversational LLM + + +
+ Current speech-based LLMs are predominantly trained on extensive ASR and TTS +datasets, excelling in tasks related to these domains. However, their ability +to handle direct speech-to-speech conversations remains notably constrained. +These models often rely on an ASR-to-TTS chain-of-thought pipeline, converting +speech into text for processing before generating audio responses, which +introduces latency and loses audio features. We propose a method that +implicitly internalizes ASR chain of thought into a speech LLM, enhancing its +native speech understanding capabilities. Our approach reduces latency and +improves the model's native understanding of speech, paving the way for more +efficient and natural real-time audio interactions. We also release a +large-scale synthetic conversational dataset to facilitate further research. + +
+
+ comment: Corrected style from final to preprint +
+
+
+
+
+ + ♻ ☆ Plurals: A System for Guiding LLMs Via Simulated Social Ensembles + + +
+ Recent debates raised concerns that language models may favor certain +viewpoints. But what if the solution is not to aim for a 'view from nowhere' +but rather to leverage different viewpoints? We introduce Plurals, a system and +Python library for pluralistic AI deliberation. Plurals consists of Agents +(LLMs, optionally with personas) which deliberate within customizable +Structures, with Moderators overseeing deliberation. Plurals is a generator of +simulated social ensembles. Plurals integrates with government datasets to +create nationally representative personas, includes deliberation templates +inspired by democratic deliberation theory, and allows users to customize both +information-sharing structures and deliberation behavior within Structures. Six +case studies demonstrate fidelity to theoretical constructs and efficacy. Three +randomized experiments show simulated focus groups produced output resonant +with an online sample of the relevant audiences (chosen over zero-shot +generation in 75% of trials). Plurals is both a paradigm and a concrete system +for pluralistic AI. The Plurals library is available at +https://github.com/josh-ashkinaze/plurals and will be continually updated. + +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Cognitive State Classification from Speech with + Multi-View Pseudo-Labeling + + +
+ The lack of labeled data is a common challenge in speech classification +tasks, particularly those requiring extensive subjective assessment, such as +cognitive state classification. In this work, we propose a Semi-Supervised +Learning (SSL) framework, introducing a novel multi-view pseudo-labeling method +that leverages both acoustic and linguistic characteristics to select the most +confident data for training the classification model. Acoustically, unlabeled +data are compared to labeled data using the Frechet audio distance, calculated +from embeddings generated by multiple audio encoders. Linguistically, large +language models are prompted to revise automatic speech recognition +transcriptions and predict labels based on our proposed task-specific +knowledge. High-confidence data are identified when pseudo-labels from both +sources align, while mismatches are treated as low-confidence data. A bimodal +classifier is then trained to iteratively label the low-confidence data until a +predefined criterion is met. We evaluate our SSL framework on emotion +recognition and dementia detection tasks. Experimental results demonstrate that +our method achieves competitive performance compared to fully supervised +learning using only 30% of the labeled data and significantly outperforms two +selected baselines. + +
+
+
+
+
+
+
+
+ + General Literature 1 + +
+
+
+ + ♻ ☆ Automation and AI Technology in Surface Mining With a Brief Introduction + to Open-Pit Operations in the Pilbara ICRA + + +
+ This survey article provides a synopsis on some of the engineering problems, +technological innovations, robotic development and automation efforts +encountered in the mining industry -- particularly in the Pilbara iron-ore +region of Western Australia. The goal is to paint the technology landscape and +highlight issues relevant to an engineering audience to raise awareness of AI +and automation trends in mining. It assumes the reader has no prior knowledge +of mining and builds context gradually through focused discussion and short +summaries of common open-pit mining operations. The principal activities that +take place may be categorized in terms of resource development, mine-, rail- +and port operations. From mineral exploration to ore shipment, there are +roughly nine steps in between. These include: geological assessment, mine +planning and development, production drilling and assaying, blasting and +excavation, transportation of ore and waste, crush and screen, stockpile and +load-out, rail network distribution, and ore-car dumping. The objective is to +describe these processes and provide insights on some of the +challenges/opportunities from the perspective of a decade-long +industry-university R&D partnership. + +
+
+ comment: Accepted manuscript. Paper provides insights on state-of-the-art + technologies and future trends. Keywords: Mining automation, robotics, + intelligent systems, machine learning, remote sensing, geostatistics, + planning, scheduling, optimization, modelling, geology, complex systems. + Document: 21 pages, 6 figures, 2 tables. 2024 Update: Added ICRA conference + poster + slides as ancilliary files +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 96 + +
+
+
+ + ☆ Open-World Evaluation for Retrieving Diverse Perspectives + + +
+ We study retrieving a set of documents that covers various perspectives on a +complex and contentious question (e.g., will ChatGPT do more harm than good?). +We curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS), +where each example consists of a question and diverse perspectives associated +with the question, sourced from survey questions and debate websites. On this +data, retrievers paired with a corpus are evaluated to surface a document set +that contains diverse perspectives. Our framing diverges from most retrieval +tasks in that document relevancy cannot be decided by simple string matches to +references. Instead, we build a language model based automatic evaluator that +decides whether each retrieved document contains a perspective. This allows us +to evaluate the performance of three different types of corpus (Wikipedia, web +snapshot, and corpus constructed on the fly with retrieved pages from the +search engine) paired with retrievers. Retrieving diverse documents remains +challenging, with the outputs from existing retrievers covering all +perspectives on only 33.74% of the examples. We further study the impact of +query expansion and diversity-focused reranking approaches and analyze +retriever sycophancy. Together, we lay the foundation for future studies in +retrieval diversity handling complex queries. + +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Unveiling the Role of Pretraining in Direct Speech Translation EMNLP 2024 + + +
+ Direct speech-to-text translation systems encounter an important drawback in +data scarcity. A common solution consists on pretraining the encoder on +automatic speech recognition, hence losing efficiency in the training process. +In this study, we compare the training dynamics of a system using a pretrained +encoder, the conventional approach, and one trained from scratch. We observe +that, throughout the training, the randomly initialized model struggles to +incorporate information from the speech inputs for its predictions. Hence, we +hypothesize that this issue stems from the difficulty of effectively training +an encoder for direct speech translation. While a model trained from scratch +needs to learn acoustic and semantic modeling simultaneously, a pretrained one +can just focus on the latter. Based on these findings, we propose a subtle +change in the decoder cross-attention to integrate source information from +earlier steps in training. We show that with this change, the model trained +from scratch can achieve comparable performance to the pretrained one, while +reducing the training time. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ EMOVA: Empowering Language Models to See, Hear and Speak with Vivid + Emotions + + +
+ GPT-4o, an omni-modal model that enables vocal conversations with diverse +emotions and tones, marks a milestone for omni-modal foundation models. +However, empowering Large Language Models to perceive and generate images, +texts, and speeches end-to-end with publicly available data remains challenging +in the open-source community. Existing vision-language models rely on external +tools for the speech processing, while speech-language models still suffer from +limited or even without vision-understanding abilities. To address this gap, we +propose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large +Language Models with end-to-end speech capabilities while maintaining the +leading vision-language performance. With a semantic-acoustic disentangled +speech tokenizer, we notice surprisingly that omni-modal alignment can further +enhance vision-language and speech abilities compared with the corresponding +bi-modal aligned counterparts. Moreover, a lightweight style module is proposed +for flexible speech style controls (e.g., emotions and pitches). For the first +time, EMOVA achieves state-of-the-art performance on both the vision-language +and speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue +with vivid emotions. + +
+
+ comment: Project Page: https://emova-ollm.github.io/ +
+
+
+
+
+ + ☆ Automated Detection and Analysis of Power Words in Persuasive Text Using + Natural Language Processing + + +
+ Power words are terms that evoke strong emotional responses and significantly +influence readers' behavior, playing a crucial role in fields like marketing, +politics, and motivational writing. This study proposes a methodology for the +automated detection and analysis of power words in persuasive text using a +custom lexicon and the TextBlob library in Python. By identifying the presence +and frequency of power words within a given text, we aim to classify and +analyze their impact on sentiment and reader engagement. This research examines +diverse datasets across various domains to provide insights into the +effectiveness of power words, offering practical applications for content +creators, advertisers, and policymakers. + +
+
+
+
+
+ + ☆ Compositional Hardness of Code in Large Language Models -- A + Probabilistic Perspective + + +
+ A common practice in large language model (LLM) usage for complex analytical +tasks such as code generation, is to sample a solution for the entire task +within the model's context window. Previous works have shown that subtask +decomposition within the model's context (chain of thought), is beneficial for +solving such tasks. In this work, we point a limitation of LLMs' ability to +perform several sub-tasks within the same context window - an in-context +hardness of composition, pointing to an advantage for distributing a decomposed +problem in a multi-agent system of LLMs. The hardness of composition is +quantified by a generation complexity metric, i.e., the number of LLM +generations required to sample at least one correct solution. We find a gap +between the generation complexity of solving a compositional problem within the +same context relative to distributing it among multiple agents, that increases +exponentially with the solution's length. We prove our results theoretically +and demonstrate them empirically. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ DARE: Diverse Visual Question Answering with Robustness Evaluation + + +
+ Vision Language Models (VLMs) extend remarkable capabilities of text-only +large language models and vision-only models, and are able to learn from and +process multi-modal vision-text input. While modern VLMs perform well on a +number of standard image classification and image-text matching tasks, they +still struggle with a number of crucial vision-language (VL) reasoning +abilities such as counting and spatial reasoning. Moreover, while they might be +very brittle to small variations in instructions and/or evaluation protocols, +existing benchmarks fail to evaluate their robustness (or rather the lack of +it). In order to couple challenging VL scenarios with comprehensive robustness +evaluation, we introduce DARE, Diverse Visual Question Answering with +Robustness Evaluation, a carefully created and curated multiple-choice VQA +benchmark. DARE evaluates VLM performance on five diverse categories and +includes four robustness-oriented evaluations based on the variations of: +prompts, the subsets of answer options, the output format and the number of +correct answers. Among a spectrum of other findings, we report that +state-of-the-art VLMs still struggle with questions in most categories and are +unable to consistently deliver their peak performance across the tested +robustness evaluations. The worst case performance across the subsets of +options is up to 34% below the performance in the standard case. The robustness +of the open-source VLMs such as LLaVA 1.6 and Idefics2 cannot match the +closed-source models such as GPT-4 and Gemini, but even the latter remain very +brittle to different variations. + +
+
+
+
+
+ + ☆ Multilingual Evaluation of Long Context Retrieval and Reasoning + + +
+ Recent large language models (LLMs) demonstrate impressive capabilities in +handling long contexts, some exhibiting near-perfect recall on synthetic +retrieval tasks. However, these evaluations have mainly focused on English text +and involved a single target sentence within lengthy contexts. Our work +investigates how LLM performance generalizes to multilingual settings with +multiple hidden target sentences. We comprehensively evaluate several +long-context LLMs on retrieval and reasoning tasks across five languages: +English, Vietnamese, Indonesian, Swahili, and Somali. These languages share the +Latin script but belong to distinct language families and resource levels. Our +analysis reveals a significant performance gap between languages. The +best-performing models such as Gemini-1.5 and GPT-4o, achieve around 96% +accuracy in English to around 36% in Somali with a single target sentence. +However, this accuracy drops to 40% in English and 0% in Somali when dealing +with three target sentences. Our findings highlight the challenges long-context +LLMs face when processing longer contexts, an increase in the number of target +sentences, or languages of lower resource levels. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Extracting Affect Aggregates from Longitudinal Social Media Data with + Temporal Adapters for Large Language Models + + +
+ This paper proposes temporally aligned Large Language Models (LLMs) as a tool +for longitudinal analysis of social media data. We fine-tune Temporal Adapters +for Llama 3 8B on full timelines from a panel of British Twitter users, and +extract longitudinal aggregates of emotions and attitudes with established +questionnaires. We validate our estimates against representative British survey +data and find strong positive, significant correlations for several collective +emotions. The obtained estimates are robust across multiple training seeds and +prompt formulations, and in line with collective emotions extracted using a +traditional classification model trained on labeled data. To the best of our +knowledge, this is the first work to extend the analysis of affect in LLMs to a +longitudinal setting through Temporal Adapters. Our work enables new approaches +towards the longitudinal analysis of social media data. + +
+
+ comment: Code available at https://github.com/dess-mannheim/temporal-adapters +
+
+
+
+
+ + ☆ BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and + Adaptive Disambiguate based Efficient Tree Search + + +
+ Large Language Models (LLMs) have exhibited exceptional performance across a +broad range of tasks and domains. However, they still encounter difficulties in +solving mathematical problems due to the rigorous and logical nature of +mathematics. Previous studies have employed techniques such as supervised +fine-tuning (SFT), prompt engineering, and search-based methods to improve the +mathematical problem-solving abilities of LLMs. Despite these efforts, their +performance remains suboptimal and demands substantial computational resources. +To address this issue, we propose a novel approach, BEATS, to enhance +mathematical problem-solving abilities. Our method leverages newly designed +prompts that guide the model to iteratively rewrite, advance by one step, and +generate answers based on previous steps. Additionally, we introduce a new +back-verification technique that uses LLMs to validate the correctness of the +generated answers. Furthermore, we employ a pruning tree search to optimize +search time while achieving strong performance. Notably, our method improves +Qwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the +MATH benchmark. + +
+
+
+
+
+ + ☆ The Hard Positive Truth about Vision-Language Compositionality ECCV 2024 + + +
+ Several benchmarks have concluded that our best vision-language models (e.g., +CLIP) are lacking in compositionality. Given an image, these benchmarks probe a +model's ability to identify its associated caption amongst a set of +compositional distractors. In response, a surge of recent proposals show +improvements by finetuning CLIP with distractors as hard negatives. Our +investigations reveal that these improvements have, in fact, been significantly +overstated -- because existing benchmarks do not probe whether finetuned +vision-language models remain invariant to hard positives. By curating an +evaluation dataset with 112,382 hard negatives and hard positives, we uncover +that including hard positives decreases CLIP's performance by 12.9%, while +humans perform effortlessly at 99%. CLIP finetuned with hard negatives results +in an even larger decrease, up to 38.7%. With this finding, we then produce a +1,775,259 image-text training set with both hard negative and hard positive +captions. By training with both, we see improvements on existing benchmarks +while simultaneously improving performance on hard positives, indicating a more +robust improvement in compositionality. Our work suggests the need for future +research to rigorously test and improve CLIP's understanding of semantic +relationships between related "positive" concepts. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge + Distillation + + +
+ Despite being widely applied due to their exceptional capabilities, Large +Language Models (LLMs) have been proven to be vulnerable to backdoor attacks. +These attacks introduce targeted vulnerabilities into LLMs by poisoning +training samples and full-parameter fine-tuning. However, this kind of backdoor +attack is limited since they require significant computational resources, +especially as the size of LLMs increases. Besides, parameter-efficient +fine-tuning (PEFT) offers an alternative but the restricted parameter updating +may impede the alignment of triggers with target labels. In this study, we +first verify that backdoor attacks with PEFT may encounter challenges in +achieving feasible performance. To address these issues and improve the +effectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack +algorithm from weak to strong based on contrastive knowledge distillation +(W2SAttack). Specifically, we poison small-scale language models through +full-parameter fine-tuning to serve as the teacher model. The teacher model +then covertly transfers the backdoor to the large-scale student model through +contrastive knowledge distillation, which employs PEFT. Theoretical analysis +reveals that W2SAttack has the potential to augment the effectiveness of +backdoor attacks. We demonstrate the superior performance of W2SAttack on +classification tasks across four language models, four backdoor attack +algorithms, and two different architectures of teacher models. Experimental +results indicate success rates close to 100% for backdoor attacks targeting +PEFT. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ The Lou Dataset -- Exploring the Impact of Gender-Fair Language in + German Text Classification + + +
+ Gender-fair language, an evolving German linguistic variation, fosters +inclusion by addressing all genders or using neutral forms. Nevertheless, there +is a significant lack of resources to assess the impact of this linguistic +shift on classification using language models (LMs), which are probably not +trained on such variations. To address this gap, we present Lou, the first +dataset featuring high-quality reformulations for German text classification +covering seven tasks, like stance detection and toxicity classification. +Evaluating 16 mono- and multi-lingual LMs on Lou shows that gender-fair +language substantially impacts predictions by flipping labels, reducing +certainty, and altering attention patterns. However, existing evaluations +remain valid, as LM rankings of original and reformulated instances do not +significantly differ. While we offer initial insights on the effect on German +text classification, the findings likely apply to other languages, as +consistent patterns were observed in multi-lingual and English LMs. + +
+
+
+
+
+ + ☆ Pioneering Reliable Assessment in Text-to-Image Knowledge Editing: + Leveraging a Fine-Grained Dataset and an Innovative Criterion EMNLP24 + + +
+ During pre-training, the Text-to-Image (T2I) diffusion models encode factual +knowledge into their parameters. These parameterized facts enable realistic +image generation, but they may become obsolete over time, thereby +misrepresenting the current state of the world. Knowledge editing techniques +aim to update model knowledge in a targeted way. However, facing the dual +challenges posed by inadequate editing datasets and unreliable evaluation +criterion, the development of T2I knowledge editing encounter difficulties in +effectively generalizing injected knowledge. In this work, we design a T2I +knowledge editing framework by comprehensively spanning on three phases: First, +we curate a dataset \textbf{CAKE}, comprising paraphrase and multi-object test, +to enable more fine-grained assessment on knowledge generalization. Second, we +propose a novel criterion, \textbf{adaptive CLIP threshold}, to effectively +filter out false successful images under the current criterion and achieve +reliable editing evaluation. Finally, we introduce \textbf{MPE}, a simple but +effective approach for T2I knowledge editing. Instead of tuning parameters, MPE +precisely recognizes and edits the outdated part of the conditioning +text-prompt to accommodate the up-to-date knowledge. A straightforward +implementation of MPE (Based on in-context learning) exhibits better overall +performance than previous model editors. We hope these efforts can further +promote faithful evaluation of T2I knowledge editing methods. + +
+
+ comment: EMNLP24 Findings +
+
+
+
+
+ + ☆ Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan + Arabic Dialect + + +
+ We introduce Atlas-Chat, the first-ever collection of large language models +specifically developed for dialectal Arabic. Focusing on Moroccan Arabic, also +known as Darija, we construct our instruction dataset by consolidating existing +Darija language resources, creating novel datasets both manually and +synthetically, and translating English instructions with stringent quality +control. Atlas-Chat-9B and 2B models, fine-tuned on the dataset, exhibit +superior ability in following Darija instructions and performing standard NLP +tasks. Notably, our models outperform both state-of-the-art and +Arabic-specialized LLMs like LLaMa, Jais, and AceGPT, e.g., achieving a 13% +performance boost over a larger 13B model on DarijaMMLU, in our newly +introduced evaluation suite for Darija covering both discriminative and +generative tasks. Furthermore, we perform an experimental analysis of various +fine-tuning strategies and base model choices to determine optimal +configurations. All our resources are publicly accessible, and we believe our +work offers comprehensive design methodologies of instruction-tuning for +low-resource language variants, which are often neglected in favor of data-rich +languages by contemporary LLMs. + +
+
+
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ EMMA-500: Enhancing Massively Multilingual Adaptation of Large Language + Models + + +
+ In this work, we introduce EMMA-500, a large-scale multilingual language +model continue-trained on texts across 546 languages designed for enhanced +multilingual performance, focusing on improving language coverage for +low-resource languages. To facilitate continual pre-training, we compile the +MaLA corpus, a comprehensive multilingual dataset enriched with curated +datasets across diverse domains. Leveraging this corpus, we conduct extensive +continual pre-training of the Llama 2 7B model, resulting in EMMA-500, which +demonstrates robust performance across a wide collection of benchmarks, +including a comprehensive set of multilingual tasks and PolyWrite, an +open-ended generation benchmark developed in this study. Our results highlight +the effectiveness of continual pre-training in expanding large language models' +language capacity, particularly for underrepresented languages, demonstrating +significant gains in cross-lingual transfer, task generalization, and language +adaptability. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ PEDRO: Parameter-Efficient Fine-tuning with Prompt DEpenDent + Representation MOdification + + +
+ Due to their substantial sizes, large language models (LLMs) are typically +deployed within a single-backbone multi-tenant framework. In this setup, a +single instance of an LLM backbone must cater to multiple users or tasks +through the application of various parameter-efficient fine-tuning (PEFT) +models. Despite the availability of numerous effective PEFT techniques such as +LoRA, there remains a need for a PEFT approach that achieves both high +efficiency during inference and competitive performance on downstream tasks. In +this research, we introduce a new and straightforward PEFT methodology named +\underline{P}rompt D\underline{E}pen\underline{D}ent \underline{R}epresentation +M\underline{O}dification (PEDRO). The proposed method involves integrating a +lightweight vector generator into each Transformer layer, which generates +vectors contingent upon the input prompts. These vectors then modify the hidden +representations created by the LLM through a dot product operation, thereby +influencing the semantic output and generated content of the model. Extensive +experimentation across a variety of tasks indicates that: (a) PEDRO surpasses +recent PEFT benchmarks when using a similar number of tunable parameters. (b) +Under the single-backbone multi-tenant deployment model, PEDRO exhibits +superior efficiency compared to LoRA, indicating significant industrial +potential. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2405.18203 +
+
+
+
+
+ + ☆ BeanCounter: A low-toxicity, large-scale, and open dataset of + business-oriented text + + +
+ Many of the recent breakthroughs in language modeling have resulted from +scaling effectively the same model architecture to larger datasets. In this +vein, recent work has highlighted performance gains from increasing training +dataset size and quality, suggesting a need for novel sources of large-scale +datasets. In this work, we introduce BeanCounter, a public dataset consisting +of more than 159B tokens extracted from businesses' disclosures. We show that +this data is indeed novel: less than 0.1% of BeanCounter appears in Common +Crawl-based datasets and it is an order of magnitude larger than datasets +relying on similar sources. Given the data's provenance, we hypothesize that +BeanCounter is comparatively more factual and less toxic than web-based +datasets. Exploring this hypothesis, we find that many demographic identities +occur with similar prevalence in BeanCounter but with significantly less toxic +context relative to other datasets. To demonstrate the utility of BeanCounter, +we evaluate and compare two LLMs continually pre-trained on BeanCounter with +their base models. We find an 18-33% reduction in toxic generation and improved +performance within the finance domain for the continually pretrained models. +Collectively, our work suggests that BeanCounter is a novel source of +low-toxicity and high-quality domain-specific data with sufficient scale to +train multi-billion parameter LLMs. + +
+
+
+
+
+ + ☆ Inference-Time Language Model Alignment via Integrated Value Guidance EMNLP 2024 + + +
+ Large language models are typically fine-tuned to align with human +preferences, but tuning large models is computationally intensive and complex. +In this work, we introduce $\textit{Integrated Value Guidance}$ (IVG), a method +that uses implicit and explicit value functions to guide language model +decoding at token and chunk-level respectively, efficiently aligning large +language models purely at inference time. This approach circumvents the +complexities of direct fine-tuning and outperforms traditional methods. +Empirically, we demonstrate the versatility of IVG across various tasks. In +controlled sentiment generation and summarization tasks, our method +significantly improves the alignment of large models using inference-time +guidance from $\texttt{gpt2}$-based value functions. Moreover, in a more +challenging instruction-following benchmark AlpacaEval 2.0, we show that both +specifically tuned and off-the-shelf value functions greatly improve the +length-controlled win rates of large models against $\texttt{gpt-4-turbo}$ +(e.g., $19.51\% \rightarrow 26.51\%$ for $\texttt{Mistral-7B-Instruct-v0.2}$ +and $25.58\% \rightarrow 33.75\%$ for $\texttt{Mixtral-8x7B-Instruct-v0.1}$ +with Tulu guidance). + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Self-supervised Preference Optimization: Enhance Your Language Model + with Preference Degree Awareness EMNLP 2024 + + +
+ Recently, there has been significant interest in replacing the reward model +in Reinforcement Learning with Human Feedback (RLHF) methods for Large Language +Models (LLMs), such as Direct Preference Optimization (DPO) and its variants. +These approaches commonly use a binary cross-entropy mechanism on pairwise +samples, i.e., minimizing and maximizing the loss based on preferred or +dis-preferred responses, respectively. However, while this training strategy +omits the reward model, it also overlooks the varying preference degrees within +different responses. We hypothesize that this is a key factor hindering LLMs +from sufficiently understanding human preferences. To address this problem, we +propose a novel Self-supervised Preference Optimization (SPO) framework, which +constructs a self-supervised preference degree loss combined with the alignment +loss, thereby helping LLMs improve their ability to understand the degree of +preference. Extensive experiments are conducted on two widely used datasets of +different tasks. The results demonstrate that SPO can be seamlessly integrated +with existing preference optimization methods and significantly boost their +performance to achieve state-of-the-art performance. We also conduct detailed +analyses to offer comprehensive insights into SPO, which verifies its +effectiveness. The code is available at https://github.com/lijian16/SPO. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Faithfulness and the Notion of Adversarial Sensitivity in NLP + Explanations EMNLP 2024 + + +
+ Faithfulness is arguably the most critical metric to assess the reliability +of explainable AI. In NLP, current methods for faithfulness evaluation are +fraught with discrepancies and biases, often failing to capture the true +reasoning of models. We introduce Adversarial Sensitivity as a novel approach +to faithfulness evaluation, focusing on the explainer's response when the model +is under adversarial attack. Our method accounts for the faithfulness of +explainers by capturing sensitivity to adversarial input changes. This work +addresses significant limitations in existing evaluation techniques, and +furthermore, quantifies faithfulness from a crucial yet underexplored paradigm. + +
+
+ comment: Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP +
+
+
+
+
+ + ☆ Integrating Hierarchical Semantic into Iterative Generation Model for + Entailment Tree Explanation + + +
+ Manifestly and logically displaying the line of reasoning from evidence to +answer is significant to explainable question answering (QA). The entailment +tree exhibits the lines structurally, which is different from the +self-explanation principle in large-scale language models. Existing methods +rarely consider the semantic association of sentences between and within +hierarchies within the tree structure, which is prone to apparent mistakes in +combinations. In this work, we propose an architecture of integrating the +Hierarchical Semantics of sentences under the framework of Controller-Generator +(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between +hypotheses and facts, discriminates the facts involved in tree constructions, +and optimizes single-step entailments. To the best of our knowledge, We are the +first to notice hierarchical semantics of sentences between the same layer and +adjacent layers to yield improvements. The proposed method achieves comparable +performance on all three settings of the EntailmentBank dataset. The +generalization results on two out-of-domain datasets also demonstrate the +effectiveness of our method. + +
+
+
+
+
+ + ☆ SECURE: Semantics-aware Embodied Conversation under Unawareness for + Lifelong Robot Learning + + +
+ This paper addresses a challenging interactive task learning scenario we call +rearrangement under unawareness: to manipulate a rigid-body environment in a +context where the robot is unaware of a concept that's key to solving the +instructed task. We propose SECURE, an interactive task learning framework +designed to solve such problems by fixing a deficient domain model using +embodied conversation. Through dialogue, the robot discovers and then learns to +exploit unforeseen possibilities. Using SECURE, the robot not only learns from +the user's corrective feedback when it makes a mistake, but it also learns to +make strategic dialogue decisions for revealing useful evidence about novel +concepts for solving the instructed task. Together, these abilities allow the +robot to generalise to subsequent tasks using newly acquired knowledge. We +demonstrate that a robot that is semantics-aware -- that is, it exploits the +logical consequences of both sentence and discourse semantics in the learning +and inference process -- learns to solve rearrangement under unawareness more +effectively than a robot that lacks such capabilities. + +
+
+ comment: 10 pages,4 figures, 2 tables +
+
+
+
+
+ + ☆ Are Transformers in Pre-trained LM A Good ASR Encoder? An Empirical + Study + + +
+ In this study, we delve into the efficacy of transformers within pre-trained +language models (PLMs) when repurposed as encoders for Automatic Speech +Recognition (ASR). Our underlying hypothesis posits that, despite being +initially trained on text-based corpora, these transformers possess a +remarkable capacity to extract effective features from the input sequence. This +inherent capability, we argue, is transferrable to speech data, thereby +augmenting the acoustic modeling ability of ASR. Through rigorous empirical +analysis, our findings reveal a notable improvement in Character Error Rate +(CER) and Word Error Rate (WER) across diverse ASR tasks when transformers from +pre-trained LMs are incorporated. Particularly, they serve as an advantageous +starting point for initializing ASR encoders. Furthermore, we uncover that +these transformers, when integrated into a well-established ASR encoder, can +significantly boost performance, especially in scenarios where profound +semantic comprehension is pivotal. This underscores the potential of leveraging +the semantic prowess embedded within pre-trained transformers to advance ASR +systems' capabilities. + +
+
+ comment: 8pages +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Zero- and Few-shot Named Entity Recognition and Text Expansion in + Medication Prescriptions using ChatGPT + + +
+ Introduction: Medication prescriptions are often in free text and include a +mix of two languages, local brand names, and a wide range of idiosyncratic +formats and abbreviations. Large language models (LLMs) have shown promising +ability to generate text in response to input prompts. We use ChatGPT 3.5 to +automatically structure and expand medication statements in discharge summaries +and thus make them easier to interpret for people and machines. Methods: +Named-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and +few-shot setting with different prompt strategies. 100 medication statements +were manually annotated and curated. NER performance was measured by using +strict and partial matching. For the task EX, two experts interpreted the +results by assessing semantic equivalence between original and expanded +statements. The model performance was measured by precision, recall, and F1 +score. Results: For NER, the best-performing prompt reached an average F1 score +of 0.94 in the test set. For EX, the few-shot prompt showed superior +performance among other prompts, with an average F1 score of 0.87. Conclusion: +Our study demonstrates good performance for NER and EX tasks in free-text +medication statements using ChatGPT. Compared to a zero-shot baseline, a +few-shot approach prevented the system from hallucinating, which would be +unacceptable when processing safety-relevant medication data. + +
+
+
+
+
+ + ☆ Cross-lingual Human-Preference Alignment for Neural Machine Translation + with Direct Quality Optimization + + +
+ Reinforcement Learning from Human Feedback (RLHF) and derivative techniques +like Direct Preference Optimization (DPO) are task-alignment algorithms used to +repurpose general, foundational models for specific tasks. We show that +applying task-alignment to neural machine translation (NMT) addresses an +existing task--data mismatch in NMT, leading to improvements across all +languages of a multilingual model, even when task-alignment is only applied to +a subset of those languages. We do so by introducing Direct Quality +Optimization (DQO), a variant of DPO leveraging a pre-trained translation +quality estimation model as a proxy for human preferences, and verify the +improvements with both automatic metrics and human evaluation. + +
+
+ comment: 17 pages, 1 figure +
+
+
+
+
+ + ☆ Digital Twin Ecosystem for Oncology Clinical Operations + + +
+ Artificial Intelligence (AI) and Large Language Models (LLMs) hold +significant promise in revolutionizing healthcare, especially in clinical +applications. Simultaneously, Digital Twin technology, which models and +simulates complex systems, has gained traction in enhancing patient care. +However, despite the advances in experimental clinical settings, the potential +of AI and digital twins to streamline clinical operations remains largely +untapped. This paper introduces a novel digital twin framework specifically +designed to enhance oncology clinical operations. We propose the integration of +multiple specialized digital twins, such as the Medical Necessity Twin, Care +Navigator Twin, and Clinical History Twin, to enhance workflow efficiency and +personalize care for each patient based on their unique data. Furthermore, by +synthesizing multiple data sources and aligning them with the National +Comprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care +Path, a continuously evolving knowledge base that enables these digital twins +to provide precise, tailored clinical recommendations. + +
+
+ comment: Pre Print +
+
+
+
+
+ + ☆ Efficient In-Domain Question Answering for Resource-Constrained + Environments + + +
+ Retrieval Augmented Generation (RAG) is a common method for integrating +external knowledge into pretrained Large Language Models (LLMs) to enhance +accuracy and relevancy in question answering (QA) tasks. However, prompt +engineering and resource efficiency remain significant bottlenecks in +developing optimal and robust RAG solutions for real-world QA applications. +Recent studies have shown success in using fine tuning to address these +problems; in particular, Retrieval Augmented Fine Tuning (RAFT) applied to +smaller 7B models has demonstrated superior performance compared to RAG setups +with much larger models such as GPT-3.5. The combination of RAFT with +parameter-efficient fine tuning (PEFT) techniques, such as Low-Rank Adaptation +(LoRA), promises an even more efficient solution, yet remains an unexplored +area. In this work, we combine RAFT with LoRA to reduce fine tuning and storage +requirements and gain faster inference times while maintaining comparable RAG +performance. This results in a more compute-efficient RAFT, or CRAFT, which is +particularly useful for knowledge-intensive QA tasks in resource-constrained +environments where internet access may be restricted and hardware resources +limited. + +
+
+ comment: 6 pages, 2 tables +
+
+
+
+
+ + ☆ T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training + on an Assistant Task for a Target Task + + +
+ Long text summarization, gradually being essential for efficiently processing +large volumes of information, stays challenging for Large Language Models +(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced +training datasets and the high requirement of contextual details dealing. To +address the issue, we design a novel zero-shot transfer learning framework, +abbreviated as T3, to iteratively training a baseline LLM on an assistant task +for the target task, where the former should own richer data resources and +share structural or semantic similarity with the latter. In practice, T3 is +approached to deal with the long text summarization task by utilizing question +answering as the assistant task, and further validated its effectiveness on the +BBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14% +improvement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore +compared to three baseline LLMs, demonstrating its potential for more +assistant-target task combinations. + +
+
+
+
+
+ + ☆ ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context + Information in Multi-Turn Multimodal Medical Dialogue + + +
+ The rocketing prosperity of large language models (LLMs) in recent years has +boosted the prevalence of vision-language models (VLMs) in the medical sector. +In our online medical consultation scenario, a doctor responds to the texts and +images provided by a patient in multiple rounds to diagnose her/his health +condition, forming a multi-turn multimodal medical dialogue format. Unlike +high-quality images captured by professional equipment in traditional medical +visual question answering (Med-VQA), the images in our case are taken by +patients' mobile phones. These images have poor quality control, with issues +such as excessive background elements and the lesion area being significantly +off-center, leading to degradation of vision-language alignment in the model +training phase. In this paper, we propose ZALM3, a Zero-shot strategy to +improve vision-language ALignment in Multi-turn Multimodal Medical dialogue. +Since we observe that the preceding text conversations before an image can +infer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to +summarize the keywords from the preceding context and a visual grounding model +to extract the RoIs. The updated images eliminate unnecessary background noise +and provide more effective vision-language alignment. To better evaluate our +proposed method, we design a new subjective assessment metric for multi-turn +unimodal/multimodal medical dialogue to provide a fine-grained performance +comparison. Our experiments across three different clinical departments +remarkably demonstrate the efficacy of ZALM3 with statistical significance. + +
+
+
+
+
+ + ☆ Deep CLAS: Deep Contextual Listen, Attend and Spell + + +
+ Contextual-LAS (CLAS) has been shown effective in improving Automatic Speech +Recognition (ASR) of rare words. It relies on phrase-level contextual modeling +and attention-based relevance scoring without explicit contextual constraint +which lead to insufficient use of contextual information. In this work, we +propose deep CLAS to use contextual information better. We introduce bias loss +forcing model to focus on contextual information. The query of bias attention +is also enriched to improve the accuracy of the bias attention score. To get +fine-grained contextual information, we replace phrase-level encoding with +character-level encoding and encode contextual information with conformer +rather than LSTM. Moreover, we directly use the bias attention score to correct +the output probability distribution of the model. Experiments using the public +AISHELL-1 and AISHELL-NER. On AISHELL-1, compared to CLAS baselines, deep CLAS +obtains a 65.78% relative recall and a 53.49% relative F1-score increase in the +named entity recognition scene. + +
+
+ comment: Accepted by NCMMSC 2022 +
+
+
+
+
+ + ☆ DualCoTs: Dual Chain-of-Thoughts Prompting for Sentiment Lexicon + Expansion of Idioms + + +
+ Idioms represent a ubiquitous vehicle for conveying sentiments in the realm +of everyday discourse, rendering the nuanced analysis of idiom sentiment +crucial for a comprehensive understanding of emotional expression within +real-world texts. Nevertheless, the existing corpora dedicated to idiom +sentiment analysis considerably limit research in text sentiment analysis. In +this paper, we propose an innovative approach to automatically expand the +sentiment lexicon for idioms, leveraging the capabilities of large language +models through the application of Chain-of-Thought prompting. To demonstrate +the effectiveness of this approach, we integrate multiple existing resources +and construct an emotional idiom lexicon expansion dataset (called EmoIdiomE), +which encompasses a comprehensive repository of Chinese and English idioms. +Then we designed the Dual Chain-of-Thoughts (DualCoTs) method, which combines +insights from linguistics and psycholinguistics, to demonstrate the +effectiveness of using large models to automatically expand the sentiment +lexicon for idioms. Experiments show that DualCoTs is effective in idioms +sentiment lexicon expansion in both Chinese and English. For reproducibility, +we will release the data and code upon acceptance. + +
+
+
+
+
+ + ☆ Leveraging Annotator Disagreement for Text Classification + + +
+ It is common practice in text classification to only use one majority label +for model training even if a dataset has been annotated by multiple annotators. +Doing so can remove valuable nuances and diverse perspectives inherent in the +annotators' assessments. This paper proposes and compares three different +strategies to leverage annotator disagreement for text classification: a +probability-based multi-label method, an ensemble system, and instruction +tuning. All three approaches are evaluated on the tasks of hate speech and +abusive conversation detection, which inherently entail a high degree of +subjectivity. Moreover, to evaluate the effectiveness of embracing annotation +disagreements for model training, we conduct an online survey that compares the +performance of the multi-label model against a baseline model, which is trained +with the majority label. + The results show that in hate speech detection, the multi-label method +outperforms the other two approaches, while in abusive conversation detection, +instruction tuning achieves the best performance. The results of the survey +also show that the outputs from the multi-label models are considered a better +representation of the texts than the single-label model. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Logic-of-Thought: Injecting Logic into Contexts for Full Reasoning in + Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities across +various tasks but their performance in complex logical reasoning tasks remains +unsatisfactory. Although some prompting methods, such as Chain-of-Thought, can +improve the reasoning ability of LLMs to some extent, they suffer from an +unfaithful issue where derived conclusions may not align with the generated +reasoning chain. To address this issue, some studies employ the approach of +propositional logic to further enhance logical reasoning abilities of LLMs. +However, the potential omissions in the extraction of logical expressions in +these methods can cause information loss in the logical reasoning process, +thereby generating incorrect results. To this end, we propose Logic-of-Thought +(LoT) prompting which employs propositional logic to generate expanded logical +information from input context, and utilizes the generated logical information +as an additional augmentation to the input prompts, thereby enhancing the +capability of logical reasoning. The LoT is orthogonal to existing prompting +methods and can be seamlessly integrated with them. Extensive experiments +demonstrate that LoT boosts the performance of various prompting methods with a +striking margin across five logical reasoning tasks. In particular, the LoT +enhances Chain-of-Thought's performance on the ReClor dataset by +4.35%; +moreover, it improves Chain-of-Thought with Self-Consistency's performance on +LogiQA by +5%; additionally, it boosts performance of Tree-of-Thoughts on +ProofWriter dataset by +8%. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ MUSE: Integrating Multi-Knowledge for Knowledge Graph Completion + + +
+ Knowledge Graph Completion (KGC) aims to predict the missing [relation] part +of (head entity)--[relation]->(tail entity) triplet. Most existing KGC methods +focus on single features (e.g., relation types) or sub-graph aggregation. +However, they do not fully explore the Knowledge Graph (KG) features and +neglect the guidance of external semantic knowledge. To address these +shortcomings, we propose a knowledge-aware reasoning model (MUSE), which +designs a novel multi-knowledge representation learning mechanism for missing +relation prediction. Our model develops a tailored embedding space through +three parallel components: 1) Prior Knowledge Learning for enhancing the +triplets' semantic representation by fine-tuning BERT; 2) Context Message +Passing for enhancing the context messages of KG; 3) Relational Path +Aggregation for enhancing the path representation from the head entity to the +tail entity. The experimental results show that MUSE significantly outperforms +other baselines on four public datasets, achieving over 5.50% H@1 improvement +and 4.20% MRR improvement on the NELL995 dataset. The code and datasets will be +released via https://github.com/SUSTech-TP/ADMA2024-MUSE.git. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2408.05283 +
+
+
+
+
+ + ☆ Data Proportion Detection for Optimized Data Management for Large + Language Models + + +
+ Large language models (LLMs) have demonstrated exceptional performance across +a wide range of tasks and domains, with data preparation playing a critical +role in achieving these results. Pre-training data typically combines +information from multiple domains. To maximize performance when integrating +data from various domains, determining the optimal data proportion is +essential. However, state-of-the-art (SOTA) LLMs rarely disclose details about +their pre-training data, making it difficult for researchers to identify ideal +data proportions. In this paper, we introduce a new topic, \textit{data +proportion detection}, which enables the automatic estimation of pre-training +data proportions by analyzing the generated outputs of LLMs. We provide +rigorous theoretical proofs, practical algorithms, and preliminary experimental +results for data proportion detection. Based on these findings, we offer +valuable insights into the challenges and future directions for effective data +proportion detection and data management. + +
+
+
+
+
+ + ☆ When A Man Says He Is Pregnant: ERP Evidence for A Rational Account of + Speaker-contextualized Language Comprehension + + +
+ Spoken language is often, if not always, understood in a context that +includes the identities of speakers. For instance, we can easily make sense of +an utterance such as "I'm going to have a manicure this weekend" or "The first +time I got pregnant I had a hard time" when the utterance is spoken by a woman, +but it would be harder to understand when it is spoken by a man. Previous +event-related potential (ERP) studies have shown mixed results regarding the +neurophysiological responses to such speaker-mismatched utterances, with some +reporting an N400 effect and others a P600 effect. In an experiment involving +64 participants, we showed that these different ERP effects reflect distinct +cognitive processes employed to resolve the speaker-message mismatch. When +possible, the message is integrated with the speaker context to arrive at an +interpretation, as in the case of violations of social stereotypes (e.g., men +getting a manicure), resulting in an N400 effect. However, when such +integration is impossible due to violations of biological knowledge (e.g., men +getting pregnant), listeners engage in an error correction process to revise +either the perceived utterance or the speaker context, resulting in a P600 +effect. Additionally, we found that the social N400 effect decreased as a +function of the listener's personality trait of openness, while the biological +P600 effect remained robust. Our findings help to reconcile the empirical +inconsistencies in the literature and provide a rational account of +speaker-contextualized language comprehension. + +
+
+
+
+
+ + ☆ Comparing Unidirectional, Bidirectional, and Word2vec Models for + Discovering Vulnerabilities in Compiled Lifted Code + + +
+ Ransomware and other forms of malware cause significant financial and +operational damage to organizations by exploiting long-standing and often +difficult-to-detect software vulnerabilities. To detect vulnerabilities such as +buffer overflows in compiled code, this research investigates the application +of unidirectional transformer-based embeddings, specifically GPT-2. Using a +dataset of LLVM functions, we trained a GPT-2 model to generate embeddings, +which were subsequently used to build LSTM neural networks to differentiate +between vulnerable and non-vulnerable code. Our study reveals that embeddings +from the GPT-2 model significantly outperform those from bidirectional models +of BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%. +LSTM neural networks were developed with both frozen and unfrozen embedding +model layers. The model with the highest performance was achieved when the +embedding layers were unfrozen. Further, the research finds that, in exploring +the impact of different optimizers within this domain, the SGD optimizer +demonstrates superior performance over Adam. Overall, these findings reveal +important insights into the potential of unidirectional transformer-based +approaches in enhancing cybersecurity defenses. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ HaloScope: Harnessing Unlabeled LLM Generations for Hallucination + Detection NeurIPS 2024 + + +
+ The surge in applications of large language models (LLMs) has prompted +concerns about the generation of misleading or fabricated information, known as +hallucinations. Therefore, detecting hallucinations has become critical to +maintaining trust in LLM-generated content. A primary challenge in learning a +truthfulness classifier is the lack of a large amount of labeled truthful and +hallucinated data. To address the challenge, we introduce HaloScope, a novel +learning framework that leverages the unlabeled LLM generations in the wild for +hallucination detection. Such unlabeled data arises freely upon deploying LLMs +in the open world, and consists of both truthful and hallucinated information. +To harness the unlabeled data, we present an automated membership estimation +score for distinguishing between truthful and untruthful generations within +unlabeled mixture data, thereby enabling the training of a binary truthfulness +classifier on top. Importantly, our framework does not require extra data +collection and human annotations, offering strong flexibility and practicality +for real-world applications. Extensive experiments show that HaloScope can +achieve superior hallucination detection performance, outperforming the +competitive rivals by a significant margin. Code is available at +https://github.com/deeplearningwisc/haloscope. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ MaskLLM: Learnable Semi-Structured Sparsity for Large Language Models NeurIPS 2024 + + +
+ Large Language Models (LLMs) are distinguished by their massive parameter +counts, which typically result in significant redundancy. This work introduces +MaskLLM, a learnable pruning method that establishes Semi-structured (or +``N:M'') Sparsity in LLMs, aimed at reducing computational overhead during +inference. Instead of developing a new importance criterion, MaskLLM explicitly +models N:M patterns as a learnable distribution through Gumbel Softmax +sampling. This approach facilitates end-to-end training on large-scale datasets +and offers two notable advantages: 1) High-quality Masks - our method +effectively scales to large datasets and learns accurate masks; 2) +Transferability - the probabilistic modeling of mask distribution enables the +transfer learning of sparsity across domains or tasks. We assessed MaskLLM +using 2:4 sparsity on various LLMs, including LLaMA-2, Nemotron-4, and GPT-3, +with sizes ranging from 843M to 15B parameters, and our empirical results show +substantial improvements over state-of-the-art methods. For instance, leading +approaches achieve a perplexity (PPL) of 10 or greater on Wikitext compared to +the dense model's 5.12 PPL, but MaskLLM achieves a significantly lower 6.72 PPL +solely by learning the masks with frozen weights. Furthermore, MaskLLM's +learnable nature allows customized masks for lossless application of 2:4 +sparsity to downstream tasks or domains. Code is available at +\url{https://github.com/NVlabs/MaskLLM}. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Reducing and Exploiting Data Augmentation Noise through Meta Reweighting + Contrastive Learning for Text Classification + + +
+ Data augmentation has shown its effectiveness in resolving the data-hungry +problem and improving model's generalization ability. However, the quality of +augmented data can be varied, especially compared with the raw/original data. +To boost deep learning models' performance given augmented data/samples in text +classification tasks, we propose a novel framework, which leverages both meta +learning and contrastive learning techniques as parts of our design for +reweighting the augmented samples and refining their feature representations +based on their quality. As part of the framework, we propose novel +weight-dependent enqueue and dequeue algorithms to utilize augmented samples' +weight/quality information effectively. Through experiments, we show that our +framework can reasonably cooperate with existing deep learning models (e.g., +RoBERTa-base and Text-CNN) and augmentation techniques (e.g., Wordnet and +Easydata) for specific supervised learning tasks. Experiment results show that +our framework achieves an average of 1.6%, up to 4.3% absolute improvement on +Text-CNN encoders and an average of 1.4%, up to 4.4% absolute improvement on +RoBERTa-base encoders on seven GLUE benchmark datasets compared with the best +baseline. We present an indepth analysis of our framework design, revealing the +non-trivial contributions of our network components. Our code is publicly +available for better reproducibility. + +
+
+ comment: IEEE BigData 2021 +
+
+
+
+
+ + ☆ Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with + Scoring-aware Multiple Rewards EMNLP 2024 + + +
+ Recent advances in automated essay scoring (AES) have shifted towards +evaluating multiple traits to provide enriched feedback. Like typical AES +systems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure +agreement with human raters, aligning closely with the rating schema; however, +its non-differentiable nature prevents its direct use in neural network +training. In this paper, we propose Scoring-aware Multi-reward Reinforcement +Learning (SaMRL), which integrates actual evaluation schemes into the training +process by designing QWK-based rewards with a mean-squared error penalty for +multi-trait AES. Existing reinforcement learning (RL) applications in AES are +limited to classification models despite associated performance degradation, as +RL requires probability distributions; instead, we adopt an autoregressive +score generation framework to leverage token generation probabilities for +robust multi-trait score predictions. Empirical analyses demonstrate that SaMRL +facilitates model training, notably enhancing scoring of previously inferior +prompts. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ What is the social benefit of hate speech detection research? A + Systematic Review + + +
+ While NLP research into hate speech detection has grown exponentially in the +last three decades, there has been minimal uptake or engagement from policy +makers and non-profit organisations. We argue the absence of ethical frameworks +have contributed to this rift between current practice and best practice. By +adopting appropriate ethical frameworks, NLP researchers may enable the social +impact potential of hate speech research. This position paper is informed by +reviewing forty-eight hate speech detection systems associated with +thirty-seven publications from different venues. + +
+
+ comment: Accepted to the 3rd Workshop on NLP for Positive Impact +
+
+
+
+
+ + ☆ RED QUEEN: Safeguarding Large Language Models against Concealed + Multi-Turn Jailbreaking + + +
+ The rapid progress of Large Language Models (LLMs) has opened up new +opportunities across various domains and applications; yet it also presents +challenges related to potential misuse. To mitigate such risks, red teaming has +been employed as a proactive security measure to probe language models for +harmful outputs via jailbreak attacks. However, current jailbreak attack +approaches are single-turn with explicit malicious queries that do not fully +capture the complexity of real-world interactions. In reality, users can engage +in multi-turn interactions with LLM-based chat assistants, allowing them to +conceal their true intentions in a more covert manner. To bridge this gap, we, +first, propose a new jailbreak approach, RED QUEEN ATTACK. This method +constructs a multi-turn scenario, concealing the malicious intent under the +guise of preventing harm. We craft 40 scenarios that vary in turns and select +14 harmful categories to generate 56k multi-turn attack data points. We conduct +comprehensive experiments on the RED QUEEN ATTACK with four representative LLM +families of different sizes. Our experiments reveal that all LLMs are +vulnerable to RED QUEEN ATTACK, reaching 87.62% attack success rate on GPT-4o +and 75.4% on Llama3-70B. Further analysis reveals that larger models are more +susceptible to the RED QUEEN ATTACK, with multi-turn structures and concealment +strategies contributing to its success. To prioritize safety, we introduce a +straightforward mitigation strategy called RED QUEEN GUARD, which aligns LLMs +to effectively counter adversarial attacks. This approach reduces the attack +success rate to below 1% while maintaining the model's performance across +standard benchmarks. Full implementation and dataset are publicly accessible at +https://github.com/kriti-hippo/red_queen. + +
+
+
+
+
+ + ☆ Navigating the Shortcut Maze: A Comprehensive Analysis of Shortcut + Learning in Text Classification by Language Models + + +
+ Language models (LMs), despite their advances, often depend on spurious +correlations, undermining their accuracy and generalizability. This study +addresses the overlooked impact of subtler, more complex shortcuts that +compromise model reliability beyond oversimplified shortcuts. We introduce a +comprehensive benchmark that categorizes shortcuts into occurrence, style, and +concept, aiming to explore the nuanced ways in which these shortcuts influence +the performance of LMs. Through extensive experiments across traditional LMs, +large language models, and state-of-the-art robust models, our research +systematically investigates models' resilience and susceptibilities to +sophisticated shortcuts. Our benchmark and code can be found at: +https://github.com/yuqing-zhou/shortcut-learning-in-text-classification. + +
+
+
+
+
+ + ☆ Description-based Controllable Text-to-Speech with Cross-Lingual Voice + Control ICASSP 2025 + + +
+ We propose a novel description-based controllable text-to-speech (TTS) method +with cross-lingual control capability. To address the lack of audio-description +paired data in the target language, we combine a TTS model trained on the +target language with a description control model trained on another language, +which maps input text descriptions to the conditional features of the TTS +model. These two models share disentangled timbre and style representations +based on self-supervised learning (SSL), allowing for disentangled voice +control, such as controlling speaking styles while retaining the original +timbre. Furthermore, because the SSL-based timbre and style representations are +language-agnostic, combining the TTS and description control models while +sharing the same embedding space effectively enables cross-lingual control of +voice characteristics. Experiments on English and Japanese TTS demonstrate that +our method achieves high naturalness and controllability for both languages, +even though no Japanese audio-description pairs are used. + +
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Enhancing Financial Sentiment Analysis with Expert-Designed Hint + + +
+ This paper investigates the role of expert-designed hint in enhancing +sentiment analysis on financial social media posts. We explore the capability +of large language models (LLMs) to empathize with writer perspectives and +analyze sentiments. Our findings reveal that expert-designed hint, i.e., +pointing out the importance of numbers, significantly improve performances +across various LLMs, particularly in cases requiring perspective-taking skills. +Further analysis on tweets containing different types of numerical data +demonstrates that the inclusion of expert-designed hint leads to notable +improvements in sentiment analysis performance, especially for tweets with +monetary-related numbers. Our findings contribute to the ongoing discussion on +the applicability of Theory of Mind in NLP and open new avenues for improving +sentiment analysis in financial domains through the strategic use of expert +knowledge. + +
+
+
+
+
+ + ♻ ☆ Is It Good Data for Multilingual Instruction Tuning or Just Bad + Multilingual Evaluation for Large Language Models? EMNLP 2024 + + +
+ Multilingual large language models are designed, claimed, and expected to +cater to speakers of varied languages. We hypothesise that the current +practices of fine-tuning and evaluating these models may not perfectly align +with this objective owing to a heavy reliance on translation, which cannot +cover language-specific knowledge but can introduce translation defects. It +remains unknown whether the nature of the instruction data has an impact on the +model output; conversely, it is questionable whether translated test sets can +capture such nuances. Due to the often coupled practices of using translated +data in both stages, such imperfections could have been overlooked. This work +investigates these issues using controlled native or translated data during the +instruction tuning and evaluation stages. We show that native or generation +benchmarks reveal a notable difference between native and translated +instruction data especially when model performance is high, whereas other types +of test sets cannot. The comparison between round-trip and single-pass +translations reflects the importance of knowledge from language-native +resources. Finally, we demonstrate that regularization is beneficial to +bridging this gap on structured but not generative tasks. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ KAG: Boosting LLMs in Professional Domains via Knowledge Augmented + Generation + + +
+ The recently developed retrieval-augmented generation (RAG) technology has +enabled the efficient construction of domain-specific applications. However, it +also has limitations, including the gap between vector similarity and the +relevance of knowledge reasoning, as well as insensitivity to knowledge logic, +such as numerical values, temporal relations, expert rules, and others, which +hinder the effectiveness of professional knowledge services. In this work, we +introduce a professional domain knowledge service framework called Knowledge +Augmented Generation (KAG). KAG is designed to address the aforementioned +challenges with the motivation of making full use of the advantages of +knowledge graph(KG) and vector retrieval, and to improve generation and +reasoning performance by bidirectionally enhancing large language models (LLMs) +and KGs through five key aspects: (1) LLM-friendly knowledge representation, +(2) mutual-indexing between knowledge graphs and original chunks, (3) +logical-form-guided hybrid reasoning engine, (4) knowledge alignment with +semantic reasoning, and (5) model capability enhancement for KAG. We compared +KAG with existing RAG methods in multihop question answering and found that it +significantly outperforms state-of-theart methods, achieving a relative +improvement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We +have successfully applied KAG to two professional knowledge Q&A tasks of Ant +Group, including E-Government Q&A and E-Health Q&A, achieving significant +improvement in professionalism compared to RAG methods. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ Recent Trends in Unsupervised Summarization + + +
+ Unsupervised summarization is a powerful technique that enables training +summarizing models without requiring labeled datasets. This survey covers +different recent techniques and models used for unsupervised summarization. We +cover extractive, abstractive, and hybrid models and strategies used to achieve +unsupervised summarization. While the main focus of this survey is on recent +research, we also cover some of the important previous research. We +additionally introduce a taxonomy, classifying different research based on +their approach to unsupervised training. Finally, we discuss the current +approaches and mention some datasets and evaluation methods. + +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ Granularity is crucial when applying differential privacy to text: An + investigation for neural machine translation EMNLP + + +
+ Applying differential privacy (DP) by means of the DP-SGD algorithm to +protect individual data points during training is becoming increasingly popular +in NLP. However, the choice of granularity at which DP is applied is often +neglected. For example, neural machine translation (NMT) typically operates on +the sentence-level granularity. From the perspective of DP, this setup assumes +that each sentence belongs to a single person and any two sentences in the +training dataset are independent. This assumption is however violated in many +real-world NMT datasets, e.g., those including dialogues. For proper +application of DP we thus must shift from sentences to entire documents. In +this paper, we investigate NMT at both the sentence and document levels, +analyzing the privacy/utility trade-off for both scenarios, and evaluating the +risks of not using the appropriate privacy granularity in terms of leaking +personally identifiable information (PII). Our findings indicate that the +document-level NMT system is more resistant to membership inference attacks, +emphasizing the significance of using the appropriate granularity when working +with DP. + +
+
+ comment: Accepted at EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Transformers, Contextualism, and Polysemy + + +
+ The transformer architecture, introduced by Vaswani et al. (2017), is at the +heart of the remarkable recent progress in the development of language models, +including widely-used chatbots such as Chat-GPT and Claude. In this paper, I +argue that we can extract from the way the transformer architecture works a +theory of the relationship between context and meaning. I call this the +transformer theory, and I argue that it is novel with regard to two related +philosophical debates: the contextualism debate regarding the extent of +context-sensitivity across natural language, and the polysemy debate regarding +how polysemy should be captured within an account of word meaning. + +
+
+
+
+
+ + ♻ ☆ Investigating OCR-Sensitive Neurons to Improve Entity Recognition in + Historical Documents + + +
+ This paper investigates the presence of OCR-sensitive neurons within the +Transformer architecture and their influence on named entity recognition (NER) +performance on historical documents. By analysing neuron activation patterns in +response to clean and noisy text inputs, we identify and then neutralise +OCR-sensitive neurons to improve model performance. Based on two open access +large language models (Llama2 and Mistral), experiments demonstrate the +existence of OCR-sensitive regions and show improvements in NER performance on +historical newspapers and classical commentaries, highlighting the potential of +targeted neuron modulation to improve models' performance on noisy text. + +
+
+
+
+
+ + ♻ ☆ AC4: Algebraic Computation Checker for Circuit Constraints in ZKPs + + +
+ Zero-knowledge proof (ZKP) systems have surged attention and held a +fundamental role in contemporary cryptography. Zero-knowledge succinct +non-interactive argument of knowledge (zk-SNARK) protocols dominate the ZKP +usage, implemented through arithmetic circuit programming paradigm. However, +underconstrained or overconstrained circuits may lead to bugs. The former +refers to circuits that lack the necessary constraints, resulting in unexpected +solutions and causing the verifier to accept a bogus witness, and the latter +refers to circuits that are constrained excessively, resulting in lacking +necessary solutions and causing the verifier to accept no witness. This paper +introduces a novel approach for pinpointing two distinct types of bugs in ZKP +circuits. The method involves encoding the arithmetic circuit constraints to +polynomial equation systems and solving them over finite fields by the computer +algebra system. The classification of verification results is refined, greatly +enhancing the expressive power of the system. A tool, AC4, is proposed to +represent the implementation of the method. Experiments show that AC4 +demonstrates a increase in the checked ratio, showing a 29% improvement over +Picus, a checker for Circom circuits, and a 10% improvement over +halo2-analyzer, a checker for halo2 circuits. Within a solvable range, the +checking time has also exhibited noticeable improvement, demonstrating a +magnitude increase compared to previous efforts. + +
+
+ comment: 24 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ GTSinger: A Global Multi-Technique Singing Corpus with Realistic Music + Scores for All Singing Tasks NeurIPS 2024 + + +
+ The scarcity of high-quality and multi-task singing datasets significantly +hinders the development of diverse controllable and personalized singing tasks, +as existing singing datasets suffer from low quality, limited diversity of +languages and singers, absence of multi-technique information and realistic +music scores, and poor task suitability. To tackle these problems, we present +GTSinger, a large global, multi-technique, free-to-use, high-quality singing +corpus with realistic music scores, designed for all singing tasks, along with +its benchmarks. Particularly, (1) we collect 80.59 hours of high-quality +singing voices, forming the largest recorded singing dataset; (2) 20 +professional singers across nine widely spoken languages offer diverse timbres +and styles; (3) we provide controlled comparison and phoneme-level annotations +of six commonly used singing techniques, helping technique modeling and +control; (4) GTSinger offers realistic music scores, assisting real-world +musical composition; (5) singing voices are accompanied by manual +phoneme-to-audio alignments, global style labels, and 16.16 hours of paired +speech for various singing tasks. Moreover, to facilitate the use of GTSinger, +we conduct four benchmark experiments: technique-controllable singing voice +synthesis, technique recognition, style transfer, and speech-to-singing +conversion. The corpus and demos can be found at http://gtsinger.github.io. We +provide the dataset and the code for processing data and conducting benchmarks +at https://huggingface.co/datasets/GTSinger/GTSinger and +https://github.com/GTSinger/GTSinger. + +
+
+ comment: Accepted by NeurIPS 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ EfficientRAG: Efficient Retriever for Multi-Hop Question Answering + + +
+ Retrieval-augmented generation (RAG) methods encounter difficulties when +addressing complex questions like multi-hop queries. While iterative retrieval +methods improve performance by gathering additional information, current +approaches often rely on multiple calls of large language models (LLMs). In +this paper, we introduce EfficientRAG, an efficient retriever for multi-hop +question answering. EfficientRAG iteratively generates new queries without the +need for LLM calls at each iteration and filters out irrelevant information. +Experimental results demonstrate that EfficientRAG surpasses existing RAG +methods on three open-domain multi-hop question-answering datasets. + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ ICON: Improving Inter-Report Consistency in Radiology Report Generation + via Lesion-aware Mixup Augmentation + + +
+ Previous research on radiology report generation has made significant +progress in terms of increasing the clinical accuracy of generated reports. In +this paper, we emphasize another crucial quality that it should possess, i.e., +inter-report consistency, which refers to the capability of generating +consistent reports for semantically equivalent radiographs. This quality is +even of greater significance than the overall report accuracy in terms of +ensuring the system's credibility, as a system prone to providing conflicting +results would severely erode users' trust. Regrettably, existing approaches +struggle to maintain inter-report consistency, exhibiting biases towards common +patterns and susceptibility to lesion variants. To address this issue, we +propose ICON, which improves the inter-report consistency of radiology report +generation. Aiming to enhance the system's ability to capture similarities in +semantically equivalent lesions, our approach first involves extracting lesions +from input images and examining their characteristics. Then, we introduce a +lesion-aware mixup technique to ensure that the representations of the +semantically equivalent lesions align with the same attributes, achieved +through a linear combination during the training phase. Extensive experiments +on three publicly available chest X-ray datasets verify the effectiveness of +our approach, both in terms of improving the consistency and accuracy of the +generated reports. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient + Language Model Inference EMNLP 2024 + + +
+ The development of state-of-the-art generative large language models (LLMs) +disproportionately relies on English-centric tokenizers, vocabulary and +pre-training data. Despite the fact that some LLMs have multilingual +capabilities, recent studies have shown that their inference efficiency +deteriorates when generating text in languages other than English. This results +in increased inference time and costs. Cross-lingual vocabulary adaptation +(CVA) methods have been proposed for adapting models to a target language +aiming to improve downstream performance. However, the effectiveness of these +methods on increasing inference efficiency of generative LLMs has yet to be +explored. In this paper, we perform an empirical study of five CVA methods on +four generative LLMs (including monolingual and multilingual models) across +four typologically-diverse languages and four natural language understanding +tasks. We find that CVA substantially contributes to LLM inference speedups of +up to 271.5\%. We also show that adapting LLMs that have been pre-trained on +more balanced multilingual data results in downstream performance comparable to +the original models. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Abstraction-of-Thought Makes Language Models Better Reasoners EMNLP 2024 + + +
+ Abstract reasoning, the ability to reason from the abstract essence of a +problem, serves as a key to generalization in human reasoning. However, +eliciting language models to perform reasoning with abstraction remains +unexplored. This paper seeks to bridge this gap by introducing a novel +structured reasoning format called Abstraction-of-Thought (AoT). The uniqueness +of AoT lies in its explicit requirement for varying levels of abstraction +within the reasoning process. This approach could elicit language models to +first contemplate on the abstract level before incorporating concrete details, +which is overlooked by the prevailing step-by-step Chain-of-Thought (CoT) +method. To align models with the AoT format, we present AoT Collection, a +generic finetuning dataset consisting of 348k high-quality samples with AoT +reasoning processes, collected via an automated and scalable pipeline. We +finetune a wide range of language models with AoT Collection and conduct +extensive evaluations on 23 unseen tasks from the challenging benchmark +Big-Bench Hard. Experimental results indicate that models aligned to AoT +reasoning format substantially outperform those aligned to CoT in many +reasoning tasks. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ DAPE: Data-Adaptive Positional Encoding for Length Extrapolation NeurIPS 2024 + + +
+ Positional encoding plays a crucial role in transformers, significantly +impacting model performance and length generalization. Prior research has +introduced absolute positional encoding (APE) and relative positional encoding +(RPE) to distinguish token positions in given sequences. However, both APE and +RPE remain fixed after model training regardless of input data, limiting their +adaptability and flexibility. Hence, we expect that the desired positional +encoding should be data-adaptive and can be dynamically adjusted with the given +attention. In this paper, we propose a Data-Adaptive Positional Encoding (DAPE) +method, which dynamically and semantically adjusts based on input context and +learned fixed priors. Experimental validation on real-world datasets (Arxiv, +Books3, and CHE) demonstrates that DAPE enhances model performances in terms of +trained length and length generalization, where the improvements are +statistically significant. The model visualization suggests that our model can +keep both local and anti-local information. Finally, we successfully train the +model on sequence length 128 and achieve better performance at evaluation +sequence length 8192, compared with other static positional encoding methods, +revealing the benefit of the adaptive positional encoding method. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ Fine Tuning vs. Retrieval Augmented Generation for Less Popular + Knowledge + + +
+ Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting +strong performance across diverse tasks and domains. However, it has been +observed that the performance diminishes when dealing with less-popular or +low-frequency concepts and entities, for example in domain specific +applications. The two prominent approaches to enhance the performance of LMs on +low-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning +(FT) over synthetic data. This paper explores and evaluates the impact of RAG +and FT on customizing LMs in handling low-frequency entities on question +answering tasks. We conduct extensive experiments on twelve LMs of varying size +and type and different fine tuning, data augmentation, and retrieval models. +Our findings indicate that while FT boosts the performance across entities of +varying popularity, RAG surpasses FT by a large margin particularly for least +popular factual knowledge. Additionally, the success of both RAG and FT +approaches is amplified by improving retrieval and data augmentation +techniques. Fine tuning, while beneficial for small LMs, requires extensive +resources. To address this issue, we propose the new Stimulus RAG approach that +surpasses the effectiveness of fine tuning based approaches, thereby +eliminating the need for the costly data augmentation and fine tuning step for +enriching LMs with less popular factual knowledge. + +
+
+
+
+
+ + ♻ ☆ J2N -- Nominal Adjective Identification and its Application + + +
+ This paper explores the challenges posed by nominal adjectives (NAs) in +natural language processing (NLP) tasks, particularly in part-of-speech (POS) +tagging. We propose treating NAs as a distinct POS tag, "JN," and investigate +its impact on POS tagging, BIO chunking, and coreference resolution. Our study +shows that reclassifying NAs can improve the accuracy of syntactic analysis and +structural understanding in NLP. We present experimental results using Hidden +Markov Models (HMMs), Maximum Entropy (MaxEnt) models, and Spacy, demonstrating +the feasibility and potential benefits of this approach. Additionally we +trained a bert model to identify the NA in untagged text. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ NumeroLogic: Number Encoding for Enhanced LLMs' Numerical Reasoning + + +
+ Language models struggle with handling numerical data and performing +arithmetic operations. We hypothesize that this limitation can be partially +attributed to non-intuitive textual numbers representation. When a digit is +read or generated by a causal language model it does not know its place value +(e.g. thousands vs. hundreds) until the entire number is processed. To address +this issue, we propose a simple adjustment to how numbers are represented by +including the count of digits before each number. For instance, instead of +"42", we suggest using "{2:42}" as the new format. This approach, which we term +NumeroLogic, offers an added advantage in number generation by serving as a +Chain of Thought (CoT). By requiring the model to consider the number of digits +first, it enhances the reasoning process before generating the actual number. +We use arithmetic tasks to demonstrate the effectiveness of the NumeroLogic +formatting. We further demonstrate NumeroLogic applicability to general natural +language modeling, improving language understanding performance in the MMLU +benchmark. + +
+
+
+
+
+ + ♻ ☆ Leveraging summary of radiology reports with transformers + + +
+ Two fundamental problems in health-care stem from patient handoff and triage. +Doctors are often required to perform complex findings summarization to +facilitate efficient communication with specialists and decision making on the +urgency of each case. To address these challenges, we present a state of the +art radiology report summarization model utilizing adjusted bidirectional +encoder representation from transformers BERTtoBERT encoder and decoder +architecture. We also provide a data processing pipeline for future models +developed on the the MIMIC CXR dataset. Our approach includes a novel method +for augmenting medical data and a comprehensive performance analysis. Our best +performing model achieved a recall oriented understudy for gisting evaluation L +F1 score of 58.75/100, outperforming specialized checkpoints with more +sophisticated attention mechanisms. We also provide a data processing pipeline +for future models developed on the MIMIC chest X-ray dataset. The model +introduced in this paper demonstrates significantly improved capacity in +radiology report summarization, highlighting the potential for ensuring better +clinical workflows and enhanced patient care. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ QRMeM: Unleash the Length Limitation through Question then Reflection + Memory Mechanism EMNLP 2024 + + +
+ While large language models (LLMs) have made notable advancements in natural +language processing, they continue to struggle with processing extensive text. +Memory mechanism offers a flexible solution for managing long contexts, +utilizing techniques such as compression, summarization, and structuring to +facilitate nuanced and efficient handling of large volumes of text. However, +existing techniques face challenges with static knowledge integration, leading +to insufficient adaptation to task-specific needs and missing +multi-segmentation relationships, which hinders the dynamic reorganization and +logical combination of relevant segments during the response process. To +address these issues, we introduce a novel strategy, Question then Reflection +Memory Mechanism (QRMeM), incorporating a dual-structured memory pool. This +pool synergizes static textual content with structured graph guidance, +fostering a reflective trial-and-error approach for navigating and identifying +relevant segments. Our evaluation across multiple-choice questions (MCQ) and +multi-document question answering (Multi-doc QA) benchmarks showcases QRMeM +enhanced performance compared to existing approaches. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ MMCode: Benchmarking Multimodal Large Language Models for Code + Generation with Visually Rich Programming Problems EMNLP 2024 + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/likaixin2000/MMCode. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Explanation Regularisation through the Lens of Attributions + + +
+ Explanation regularisation (ER) has been introduced as a way to guide text +classifiers to form their predictions relying on input tokens that humans +consider plausible. This is achieved by introducing an auxiliary explanation +loss that measures how well the output of an input attribution technique for +the model agrees with human-annotated rationales. The guidance appears to +benefit performance in out-of-domain (OOD) settings, presumably due to an +increased reliance on "plausible" tokens. However, previous work has +under-explored the impact of guidance on that reliance, particularly when +reliance is measured using attribution techniques different from those used to +guide the model. In this work, we seek to close this gap, and also explore the +relationship between reliance on plausible features and OOD performance. We +find that the connection between ER and the ability of a classifier to rely on +plausible features has been overstated and that a stronger reliance on +plausible tokens does not seem to be the cause for OOD improvements. + +
+
+ comment: 22 pages, 14 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ AutoScraper: A Progressive Understanding Web Agent for Web Scraper + Generation EMNLP 2024 + + +
+ Web scraping is a powerful technique that extracts data from websites, +enabling automated data collection, enhancing data analysis capabilities, and +minimizing manual data entry efforts. Existing methods, wrappers-based methods +suffer from limited adaptability and scalability when faced with a new website, +while language agents, empowered by large language models (LLMs), exhibit poor +reusability in diverse web environments. In this work, we introduce the +paradigm of generating web scrapers with LLMs and propose AutoScraper, a +two-stage framework that can handle diverse and changing web environments more +efficiently. AutoScraper leverages the hierarchical structure of HTML and +similarity across different web pages for generating web scrapers. Besides, we +propose a new executability metric for better measuring the performance of web +scraper generation tasks. We conduct comprehensive experiments with multiple +LLMs and demonstrate the effectiveness of our framework. Resources of this +paper can be found at \url{https://github.com/EZ-hwh/AutoScraper} + +
+
+ comment: 19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Can Large Language Models Faithfully Express Their Intrinsic Uncertainty + in Words? EMNLP 2024 + + +
+ We posit that large language models (LLMs) should be capable of expressing +their intrinsic uncertainty in natural language. For example, if the LLM is +equally likely to output two contradicting answers to the same question, then +its generated response should reflect this uncertainty by hedging its answer +(e.g., "I'm not sure, but I think..."). We formalize faithful response +uncertainty based on the gap between the model's intrinsic confidence in the +assertions it makes and the decisiveness by which they are conveyed. This +example-level metric reliably indicates whether the model reflects its +uncertainty, as it penalizes both excessive and insufficient hedging. We +evaluate a variety of aligned LLMs at faithfully communicating uncertainty on +several knowledge-intensive question answering tasks. Our results provide +strong evidence that modern LLMs are poor at faithfully conveying their +uncertainty, and that better alignment is necessary to improve their +trustworthiness. + +
+
+ comment: To appear in EMNLP 2024 (main conference) +
+
+
+
+
+ + ♻ ☆ Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs EMNLP2024 + + +
+ Although human evaluation remains the gold standard for open-domain dialogue +evaluation, the growing popularity of automated evaluation using Large Language +Models (LLMs) has also extended to dialogue. However, most frameworks leverage +benchmarks that assess older chatbots on aspects such as fluency and relevance, +which are not reflective of the challenges associated with contemporary models. +In fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset, +suggests that current chatbots may exhibit several recurring issues related to +coherence and commonsense knowledge, but generally produce highly fluent and +relevant responses. + Noting the aforementioned limitations, this paper introduces Soda-Eval, an +annotated dataset based on Soda that covers over 120K turn-level assessments +across 10K dialogues, where the annotations were generated by GPT-4. Using +Soda-Eval as a benchmark, we then study the performance of several open-access +instruction-tuned LLMs, finding that dialogue evaluation remains challenging. +Fine-tuning these models improves performance over few-shot inferences, both in +terms of correlation and explanation. + +
+
+ comment: Accepted to EMNLP2024 (findings) +
+
+
+
+
+ + ♻ ☆ Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs + + +
+ Two lines of approaches are adopted for complex reasoning with LLMs. One line +of work prompts LLMs with various reasoning structures, while the structural +outputs can be naturally regarded as intermediate reasoning steps. Another line +of work adopt LLM-free declarative solvers to do the reasoning task, rendering +higher reasoning accuracy but lacking interpretability due to the black-box +nature of the solvers. Aiming to resolve the trade-off between answer accuracy +and interpretability, we present a simple extension to the latter line of work. +Specifically, we showcase that the intermediate search logs generated by Prolog +interpreters can be accessed and interpreted into human-readable reasoning +proofs. As long as LLMs correctly translate problem descriptions into Prolog +representations, the corresponding reasoning proofs are ensured to be causal +and reliable. On two logical reasoning and one arithmetic reasoning datasets, +our framework obtains significant improvements in terms of both answer accuracy +and reasoning proof accuracy. Our code is released at +https://github.com/DAMO-NLP-SG/CaRing + +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs + + +
+ Training large language models (LLMs) for external tool usage is a rapidly +expanding field, with recent research focusing on generating synthetic data to +address the shortage of available data. However, the absence of systematic data +quality checks poses complications for properly training and testing models. To +that end, we propose two approaches for assessing the reliability of data for +training LLMs to use external tools. The first approach uses intuitive, +human-defined correctness criteria. The second approach uses a model-driven +assessment with in-context evaluation. We conduct a thorough evaluation of data +quality on two popular benchmarks, followed by an extrinsic evaluation that +showcases the impact of data quality on model performance. Our results +demonstrate that models trained on high-quality data outperform those trained +on unvalidated data, even when trained with a smaller quantity of data. These +findings empirically support the significance of assessing and ensuring the +reliability of training data for tool-using LLMs. + +
+
+
+
+
+ + ♻ ☆ Unused information in token probability distribution of generative LLM: + improving LLM reading comprehension through calculation of expected values + + +
+ LLM text decoding is key component for perceived LLM quality. We demonstrate +two experiments showing that decoding methods could be improved by manipulation +of token probabilities. First, we test few LLM on SummEval summary scoring +dataset, to measure reading comprehension. We compare scores from greedy +decoding to expected values over the next token distribution. We scale logits +by large temperature to increase the entropy of scores. This allows strong +improvement of performance on SummEval (in terms of correlations to human +judgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from +20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part +of the gain seems related to positional bias. Secondly, we use +probability-based tree sampling algorithm, to examine all most probable +generations for given prompt. + +
+
+ comment: 7 pages, 1 figure, presented at FEDCSIS 2024 conference, +
+
+
+
+
+ + ♻ ☆ How does Architecture Influence the Base Capabilities of Pre-trained + Language Models? A Case Study Based on FFN-Wider and MoE Transformers + + +
+ Pre-trained language models have been proven to possess strong base +capabilities, which not only excel in in-distribution language modeling but +also show powerful abilities in out-of-distribution language modeling, transfer +learning and few-shot learning. Unlike existing work focusing on the influence +of scale on base capabilities, our work examines the influence of architecture +on those. Specifically, our concern is: How does architecture influence the +base capabilities of pre-trained language models? In this work, we attempt to +explain and reverse the decline in base capabilities caused by the architecture +of FFN-Wider Transformers, seeking to provide some insights. Through analysis, +we found the contribution ratio of Multi-Head Attention (a combination +function) to pre-trained language modeling is a key factor affecting base +capabilities. FFN-Wider Transformers reduce the contribution ratio of this +combination function, leading to a decline in base capabilities. We confirmed +this by experiments and proposed Combination Enhanced Architecture (CEA) to +address the decline in base capabilities of such models. Significantly, we +extended our explanation and CEA to Mixture of Experts (MoE) Transformers. We +successfully achieved significant improvements in base capabilities on a 14B +parameter MoE model, demonstrating the practical application value of our work. +This also indicates that our analysis has a certain guiding significance for +architecture analysis, architecture improvement and architecture design. + +
+
+
+
+
+ + ♻ ☆ CHIQ: Contextual History Enhancement for Improving Query Rewriting in + Conversational Search EMNLP 2024 + + +
+ In this paper, we study how open-source large language models (LLMs) can be +effectively deployed for improving query rewriting in conversational search, +especially for ambiguous queries. We introduce CHIQ, a two-step method that +leverages the capabilities of LLMs to resolve ambiguities in the conversation +history before query rewriting. This approach contrasts with prior studies that +predominantly use closed-source LLMs to directly generate search queries from +conversation history. We demonstrate on five well-established benchmarks that +CHIQ leads to state-of-the-art results across most settings, showing highly +competitive performances with systems leveraging closed-source LLMs. Our study +provides a first step towards leveraging open-source LLMs in conversational +search, as a competitive alternative to the prevailing reliance on commercial +LLMs. Data, models, and source code will be publicly available upon acceptance +at https://github.com/fengranMark/CHIQ. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ MPCODER: Multi-user Personalized Code Generator with Explicit and + Implicit Style Representation Learning ACL 2024 + + +
+ Large Language Models (LLMs) have demonstrated great potential for assisting +developers in their daily development. However, most research focuses on +generating correct code, how to use LLMs to generate personalized code has +seldom been investigated. To bridge this gap, we proposed MPCoder (Multi-user +Personalized Code Generator) to generate personalized code for multiple users. +To better learn coding style features, we utilize explicit coding style +residual learning to capture the syntax code style standards and implicit style +learning to capture the semantic code style conventions. We train a multi-user +style adapter to better differentiate the implicit feature representations of +different users through contrastive learning, ultimately enabling personalized +code generation for multiple users. We further propose a novel evaluation +metric for estimating similarities between codes of different coding styles. +The experimental results show the effectiveness of our approach for this novel +task. + +
+
+ comment: Accepted by ACL 2024, Main Conference +
+
+
+
+
+ + ♻ ☆ TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and + Multi-Level Style Control EMNLP 2024 + + +
+ Zero-shot singing voice synthesis (SVS) with style transfer and style control +aims to generate high-quality singing voices with unseen timbres and styles +(including singing method, emotion, rhythm, technique, and pronunciation) from +audio and text prompts. However, the multifaceted nature of singing styles +poses a significant challenge for effective modeling, transfer, and control. +Furthermore, current SVS models often fail to generate singing voices rich in +stylistic nuances for unseen singers. To address these challenges, we introduce +TCSinger, the first zero-shot SVS model for style transfer across cross-lingual +speech and singing styles, along with multi-level style control. Specifically, +TCSinger proposes three primary modules: 1) the clustering style encoder +employs a clustering vector quantization model to stably condense style +information into a compact latent space; 2) the Style and Duration Language +Model (S\&D-LM) concurrently predicts style information and phoneme duration, +which benefits both; 3) the style adaptive decoder uses a novel mel-style +adaptive normalization method to generate singing voices with enhanced details. +Experimental results show that TCSinger outperforms all baseline models in +synthesis quality, singer similarity, and style controllability across various +tasks, including zero-shot style transfer, multi-level style control, +cross-lingual style transfer, and speech-to-singing style transfer. Singing +voice samples can be accessed at https://tcsinger.github.io/. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Humans or LLMs as the Judge? A Study on Judgement Biases EMNLP2024 + + +
+ Adopting human and large language models (LLM) as judges (a.k.a human- and +LLM-as-a-judge) for evaluating the performance of LLMs has recently gained +attention. Nonetheless, this approach concurrently introduces potential biases +from human and LLMs, questioning the reliability of the evaluation results. In +this paper, we propose a novel framework that is free from referencing +groundtruth annotations for investigating Misinformation Oversight Bias, Gender +Bias, Authority Bias and Beauty Bias on LLM and human judges. We curate a +dataset referring to the revised Bloom's Taxonomy and conduct thousands of +evaluations. Results show that human and LLM judges are vulnerable to +perturbations to various degrees, and that even the cutting-edge judges possess +considerable biases. We further exploit these biases to conduct attacks on LLM +judges. We hope that our work can notify the community of the bias and +vulnerability of human- and LLM-as-a-judge, as well as the urgency of +developing robust evaluation systems. + +
+
+ comment: EMNLP2024 +
+
+
+
+
+ + ♻ ☆ Can AI writing be salvaged? Mitigating Idiosyncrasies and Improving + Human-AI Alignment in the Writing Process through Edits + + +
+ LLM-based applications are helping people write, and LLM-generated text is +making its way into social media, journalism, and our classrooms. However, the +differences between LLM-generated and human-written text remain unclear. To +explore this, we hired professional writers to edit paragraphs in several +creative domains. We first found these writers agree on undesirable +idiosyncrasies in LLM-generated text, formalizing it into a seven-category +taxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP +corpus: 1,057 LLM-generated paragraphs edited by professional writers according +to our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our +study (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms +of writing quality, revealing common limitations across model families. Third, +we explored automatic editing methods to improve LLM-generated text. A +large-scale preference annotation confirms that although experts largely prefer +text edited by other experts, automatic editing methods show promise in +improving alignment between LLM-generated and human-written text. + +
+
+ comment: NLP+HCI, Behavioral Science +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Emojis in Texts via Self-supervised Graph + Pre-Training EMNLP 2024 + + +
+ Emojis have gained immense popularity on social platforms, serving as a +common means to supplement or replace text. However, existing data mining +approaches generally either completely ignore or simply treat emojis as +ordinary Unicode characters, which may limit the model's ability to grasp the +rich semantic information in emojis and the interaction between emojis and +texts. Thus, it is necessary to release the emoji's power in social media data +mining. To this end, we first construct a heterogeneous graph consisting of +three types of nodes, i.e. post, word and emoji nodes to improve the +representation of different elements in posts. The edges are also well-defined +to model how these three elements interact with each other. To facilitate the +sharing of information among post, word and emoji nodes, we propose a graph +pre-train framework for text and emoji co-modeling, which contains two graph +pre-training tasks: node-level graph contrastive learning and edge-level link +reconstruction learning. Extensive experiments on the Xiaohongshu and Twitter +datasets with two types of downstream tasks demonstrate that our approach +proves significant improvement over previous strong baseline methods. + +
+
+ comment: Accepted by EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ LAViTeR: Learning Aligned Visual and Textual Representations Assisted by + Image and Caption Generation + + +
+ Pre-training visual and textual representations from large-scale image-text +pairs is becoming a standard approach for many downstream vision-language +tasks. The transformer-based models learn inter and intra-modal attention +through a list of self-supervised learning tasks. This paper proposes LAViTeR, +a novel architecture for visual and textual representation learning. The main +module, Visual Textual Alignment (VTA) will be assisted by two auxiliary tasks, +GAN-based image synthesis and Image Captioning. We also propose a new +evaluation metric measuring the similarity between the learnt visual and +textual embedding. The experimental results on two public datasets, CUB and +MS-COCO, demonstrate superior visual and textual representation alignment in +the joint feature embedding space + +
+
+ comment: 15 pages, 10 Figures, 5 Tables. Oral Presentation at Irish Machine + Vision and Image Processing Conference Proceedings, 2024 +
+
+
+
+
+ + ♻ ☆ View From Above: A Framework for Evaluating Distribution Shifts in Model + Behavior + + +
+ When large language models (LLMs) are asked to perform certain tasks, how can +we be sure that their learned representations align with reality? We propose a +domain-agnostic framework for systematically evaluating distribution shifts in +LLMs decision-making processes, where they are given control of mechanisms +governed by pre-defined rules. While individual LLM actions may appear +consistent with expected behavior, across a large number of trials, +statistically significant distribution shifts can emerge. To test this, we +construct a well-defined environment with known outcome logic: blackjack. In +more than 1,000 trials, we uncover statistically significant evidence +suggesting behavioral misalignment in the learned representations of LLM. + +
+
+
+
+
+ + ♻ ☆ Enhancing Post-Hoc Attributions in Long Document Comprehension via + Coarse Grained Answer Decomposition + + +
+ Accurately attributing answer text to its source document is crucial for +developing a reliable question-answering system. However, attribution for long +documents remains largely unexplored. Post-hoc attribution systems are designed +to map answer text back to the source document, yet the granularity of this +mapping has not been addressed. Furthermore, a critical question arises: What +exactly should be attributed? This involves identifying the specific +information units within an answer that require grounding. In this paper, we +propose and investigate a novel approach to the factual decomposition of +generated answers for attribution, employing template-based in-context +learning. To accomplish this, we utilize the question and integrate negative +sampling during few-shot in-context learning for decomposition. This approach +enhances the semantic understanding of both abstractive and extractive answers. +We examine the impact of answer decomposition by providing a thorough +examination of various attribution approaches, ranging from retrieval-based +techniques to LLM-based attributors. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ FlowTurbo: Towards Real-time Flow-Based Image Generation with Velocity + Refiner NeurIPS 2024 + + +
+ Building on the success of diffusion models in visual generation, flow-based +models reemerge as another prominent family of generative models that have +achieved competitive or better performance in terms of both visual quality and +inference speed. By learning the velocity field through flow-matching, +flow-based models tend to produce a straighter sampling trajectory, which is +advantageous during the sampling process. However, unlike diffusion models for +which fast samplers are well-developed, efficient sampling of flow-based +generative models has been rarely explored. In this paper, we propose a +framework called FlowTurbo to accelerate the sampling of flow-based models +while still enhancing the sampling quality. Our primary observation is that the +velocity predictor's outputs in the flow-based models will become stable during +the sampling, enabling the estimation of velocity via a lightweight velocity +refiner. Additionally, we introduce several techniques including a pseudo +corrector and sample-aware compilation to further reduce inference time. Since +FlowTurbo does not change the multi-step sampling paradigm, it can be +effectively applied for various tasks such as image editing, inpainting, etc. +By integrating FlowTurbo into different flow-based models, we obtain an +acceleration ratio of 53.1%$\sim$58.3% on class-conditional generation and +29.8%$\sim$38.5% on text-to-image generation. Notably, FlowTurbo reaches an FID +of 2.12 on ImageNet with 100 (ms / img) and FID of 3.93 with 38 (ms / img), +achieving the real-time image generation and establishing the new +state-of-the-art. Code is available at https://github.com/shiml20/FlowTurbo. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ EgoLM: Multi-Modal Language Model of Egocentric Motions + + +
+ As the prevalence of wearable devices, learning egocentric motions becomes +essential to develop contextual AI. In this work, we present EgoLM, a versatile +framework that tracks and understands egocentric motions from multi-modal +inputs, e.g., egocentric videos and motion sensors. EgoLM exploits rich +contexts for the disambiguation of egomotion tracking and understanding, which +are ill-posed under single modality conditions. To facilitate the versatile and +multi-modal framework, our key insight is to model the joint distribution of +egocentric motions and natural languages using large language models (LLM). +Multi-modal sensor inputs are encoded and projected to the joint latent space +of language models, and used to prompt motion generation or text generation for +egomotion tracking or understanding, respectively. Extensive experiments on +large-scale multi-modal human motion dataset validate the effectiveness of +EgoLM as a generalist model for universal egocentric learning. + +
+
+ comment: Project Page: https://hongfz16.github.io/projects/EgoLM +
+
+
+
+
+ + ☆ LLaVA-3D: A Simple yet Effective Pathway to Empowering LMMs with + 3D-awareness + + +
+ Recent advancements in Large Multimodal Models (LMMs) have greatly enhanced +their proficiency in 2D visual understanding tasks, enabling them to +effectively process and understand images and videos. However, the development +of LMMs with 3D-awareness for 3D scene understanding has been hindered by the +lack of large-scale 3D vision-language datasets and powerful 3D encoders. In +this paper, we introduce a simple yet effective framework called LLaVA-3D. +Leveraging the strong 2D understanding priors from LLaVA, our LLaVA-3D +efficiently adapts LLaVA for 3D scene understanding without compromising 2D +understanding capabilities. To achieve this, we employ a simple yet effective +representation, 3D Patch, which connects 2D CLIP patch features with their +corresponding positions in 3D space. By integrating the 3D Patches into 2D LMMs +and employing joint 2D and 3D vision-language instruction tuning, we establish +a unified architecture for both 2D image understanding and 3D scene +understanding. Experimental results show that LLaVA-3D converges 3.5x faster +than existing 3D LMMs when trained on 3D vision-language datasets. Moreover, +LLaVA-3D not only achieves state-of-the-art performance across various 3D tasks +but also maintains comparable 2D image understanding and vision-language +conversation capabilities with LLaVA. + +
+
+ comment: Project page: https://zcmax.github.io/projects/LLaVA-3D/ +
+
+
+
+
+ + ☆ Lotus: Diffusion-based Visual Foundation Model for High-quality Dense + Prediction + + +
+ Leveraging the visual priors of pre-trained text-to-image diffusion models +offers a promising solution to enhance zero-shot generalization in dense +prediction tasks. However, existing methods often uncritically use the original +diffusion formulation, which may not be optimal due to the fundamental +differences between dense prediction and image generation. In this paper, we +provide a systemic analysis of the diffusion formulation for the dense +prediction, focusing on both quality and efficiency. And we find that the +original parameterization type for image generation, which learns to predict +noise, is harmful for dense prediction; the multi-step noising/denoising +diffusion process is also unnecessary and challenging to optimize. Based on +these insights, we introduce Lotus, a diffusion-based visual foundation model +with a simple yet effective adaptation protocol for dense prediction. +Specifically, Lotus is trained to directly predict annotations instead of +noise, thereby avoiding harmful variance. We also reformulate the diffusion +process into a single-step procedure, simplifying optimization and +significantly boosting inference speed. Additionally, we introduce a novel +tuning strategy called detail preserver, which achieves more accurate and +fine-grained predictions. Without scaling up the training data or model +capacity, Lotus achieves SoTA performance in zero-shot depth and normal +estimation across various datasets. It also significantly enhances efficiency, +being hundreds of times faster than most existing diffusion-based methods. + +
+
+ comment: Project page: https://lotus3d.github.io/ +
+
+
+
+
+ + ☆ Robot See Robot Do: Imitating Articulated Object Manipulation with + Monocular 4D Reconstruction CoRL 2024 + + +
+ Humans can learn to manipulate new objects by simply watching others; +providing robots with the ability to learn from such demonstrations would +enable a natural interface specifying new behaviors. This work develops Robot +See Robot Do (RSRD), a method for imitating articulated object manipulation +from a single monocular RGB human demonstration given a single static +multi-view object scan. We first propose 4D Differentiable Part Models +(4D-DPM), a method for recovering 3D part motion from a monocular video with +differentiable rendering. This analysis-by-synthesis approach uses part-centric +feature fields in an iterative optimization which enables the use of geometric +regularizers to recover 3D motions from only a single video. Given this 4D +reconstruction, the robot replicates object trajectories by planning bimanual +arm motions that induce the demonstrated object part motion. By representing +demonstrations as part-centric trajectories, RSRD focuses on replicating the +demonstration's intended behavior while considering the robot's own +morphological limits, rather than attempting to reproduce the hand's motion. We +evaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part +trajectories and RSRD's physical execution performance on 9 objects across 10 +trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of +87% success rate, for a total end-to-end success rate of 60% across 90 trials. +Notably, this is accomplished using only feature fields distilled from large +pretrained vision models -- without any task-specific training, fine-tuning, +dataset collection, or annotation. Project page: +https://robot-see-robot-do.github.io + +
+
+ comment: CoRL 2024, Project page: https://robot-see-robot-do.github.io +
+
+
+
+
+ + EvMAPPER: High Altitude Orthomapping with Event Cameras + + +
+ Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to +collect images about the world below. One of the most successful applications +of UAVs is to generate orthomosaics or orthomaps, in which a series of images +are integrated together to develop a larger map. However, the use of CMOS-based +cameras with global or rolling shutters mean that orthomaps are vulnerable to +challenging light conditions, motion blur, and high-speed motion of +independently moving objects under the camera. Event cameras are less sensitive +to these issues, as their pixels are able to trigger asynchronously on +brightness changes. This work introduces the first orthomosaic approach using +event cameras. In contrast to existing methods relying only on CMOS cameras, +our approach enables map generation even in challenging light conditions, +including direct sunlight and after sunset. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ EdgeRunner: Auto-regressive Auto-encoder for Artistic Mesh Generation + + +
+ Current auto-regressive mesh generation methods suffer from issues such as +incompleteness, insufficient detail, and poor generalization. In this paper, we +propose an Auto-regressive Auto-encoder (ArAE) model capable of generating +high-quality 3D meshes with up to 4,000 faces at a spatial resolution of +$512^3$. We introduce a novel mesh tokenization algorithm that efficiently +compresses triangular meshes into 1D token sequences, significantly enhancing +training efficiency. Furthermore, our model compresses variable-length +triangular meshes into a fixed-length latent space, enabling training latent +diffusion models for better generalization. Extensive experiments demonstrate +the superior quality, diversity, and generalization capabilities of our model +in both point cloud and image-conditioned mesh generation tasks. + +
+
+ comment: Project Page: https://research.nvidia.com/labs/dir/edgerunner/ +
+
+
+
+
+ + ☆ E.T. Bench: Towards Open-Ended Event-Level Video-Language Understanding NeurIPS 2024 + + +
+ Recent advances in Video Large Language Models (Video-LLMs) have demonstrated +their great potential in general-purpose video understanding. To verify the +significance of these models, a number of benchmarks have been proposed to +diagnose their capabilities in different scenarios. However, existing +benchmarks merely evaluate models through video-level question-answering, +lacking fine-grained event-level assessment and task diversity. To fill this +gap, we introduce E.T. Bench (Event-Level & Time-Sensitive Video Understanding +Benchmark), a large-scale and high-quality benchmark for open-ended event-level +video understanding. Categorized within a 3-level task taxonomy, E.T. Bench +encompasses 7.3K samples under 12 tasks with 7K videos (251.4h total length) +under 8 domains, providing comprehensive evaluations. We extensively evaluated +8 Image-LLMs and 12 Video-LLMs on our benchmark, and the results reveal that +state-of-the-art models for coarse-level (video-level) understanding struggle +to solve our fine-grained tasks, e.g., grounding event-of-interests within +videos, largely due to the short video context length, improper time +representations, and lack of multi-event training data. Focusing on these +issues, we further propose a strong baseline model, E.T. Chat, together with an +instruction-tuning dataset E.T. Instruct 164K tailored for fine-grained +event-level understanding. Our simple but effective solution demonstrates +superior performance in multiple scenarios. + +
+
+ comment: Accepted to NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ MALPOLON: A Framework for Deep Species Distribution Modeling + + +
+ This paper describes a deep-SDM framework, MALPOLON. Written in Python and +built upon the PyTorch library, this framework aims to facilitate training and +inferences of deep species distribution models (deep-SDM) and sharing for users +with only general Python language skills (e.g., modeling ecologists) who are +interested in testing deep learning approaches to build new SDMs. More advanced +users can also benefit from the framework's modularity to run more specific +experiments by overriding existing classes while taking advantage of +press-button examples to train neural networks on multiple classification tasks +using custom or provided raw and pre-processed datasets. The framework is +open-sourced on GitHub and PyPi along with extensive documentation and examples +of use in various scenarios. MALPOLON offers straightforward installation, +YAML-based configuration, parallel computing, multi-GPU utilization, baseline +and foundational models for benchmarking, and extensive +tutorials/documentation, aiming to enhance accessibility and performance +scalability for ecologists and researchers. + +
+
+
+
+
+ + ☆ AI-Powered Augmented Reality for Satellite Assembly, Integration and + Test + + +
+ The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is +set to transform satellite Assembly, Integration, and Testing (AIT) processes +by enhancing precision, minimizing human error, and improving operational +efficiency in cleanroom environments. This paper presents a technical +description of the European Space Agency's (ESA) project "AI for AR in +Satellite AIT," which combines real-time computer vision and AR systems to +assist technicians during satellite assembly. Leveraging Microsoft HoloLens 2 +as the AR interface, the system delivers context-aware instructions and +real-time feedback, tackling the complexities of object recognition and 6D pose +estimation in AIT workflows. All AI models demonstrated over 70% accuracy, with +the detection model exceeding 95% accuracy, indicating a high level of +performance and reliability. A key contribution of this work lies in the +effective use of synthetic data for training AI models in AR applications, +addressing the significant challenges of obtaining real-world datasets in +highly dynamic satellite environments, as well as the creation of the Segmented +Anything Model for Automatic Labelling (SAMAL), which facilitates the automatic +annotation of real data, achieving speeds up to 20 times faster than manual +human annotation. The findings demonstrate the efficacy of AI-driven AR systems +in automating critical satellite assembly tasks, setting a foundation for +future innovations in the space industry. + +
+
+
+
+
+ + ☆ Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine + Segmentation MICCAI 2024 + + +
+ Self-supervised pretraining (SSP) has shown promising results in learning +from large unlabeled datasets and, thus, could be useful for automated +cardiovascular magnetic resonance (CMR) short-axis cine segmentation. However, +inconsistent reports of the benefits of SSP for segmentation have made it +difficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP +methods for CMR cine segmentation. + To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were +used for unlabeled pretraining with four SSP methods; SimCLR, positional +contrastive learning, DINO, and masked image modeling (MIM). Subsets of varying +numbers of subjects were used for supervised fine-tuning of 2D models for each +SSP method, as well as to train a 2D baseline model from scratch. The +fine-tuned models were compared to the baseline using the 3D Dice similarity +coefficient (DSC) in a test dataset of 140 subjects. + The SSP methods showed no performance gains with the largest supervised +fine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects +(231 2D slices) are available for supervised training, SSP using MIM (DSC = +0.86) improves over training from scratch (DSC = 0.82). + This study found that SSP is valuable for CMR cine segmentation when labeled +training data is scarce, but does not aid state-of-the-art deep learning +methods when ample labeled data is available. Moreover, the choice of SSP +method is important. The code is publicly available at: +https://github.com/q-cardIA/ssp-cmr-cine-segmentation + +
+
+ comment: Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at + MICCAI 2024 +
+
+
+
+
+ + ☆ EfficientCrackNet: A Lightweight Model for Crack Segmentation + + +
+ Crack detection, particularly from pavement images, presents a formidable +challenge in the domain of computer vision due to several inherent complexities +such as intensity inhomogeneity, intricate topologies, low contrast, and noisy +backgrounds. Automated crack detection is crucial for maintaining the +structural integrity of essential infrastructures, including buildings, +pavements, and bridges. Existing lightweight methods often face challenges +including computational inefficiency, complex crack patterns, and difficult +backgrounds, leading to inaccurate detection and impracticality for real-world +applications. To address these limitations, we propose EfficientCrackNet, a +lightweight hybrid model combining Convolutional Neural Networks (CNNs) and +transformers for precise crack segmentation. EfficientCrackNet integrates +depthwise separable convolutions (DSC) layers and MobileViT block to capture +both global and local features. The model employs an Edge Extraction Method +(EEM) and for efficient crack edge detection without pretraining, and +Ultra-Lightweight Subspace Attention Module (ULSAM) to enhance feature +extraction. Extensive experiments on three benchmark datasets Crack500, +DeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior +performance compared to existing lightweight models, while requiring only 0.26M +parameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance +between accuracy and computational efficiency, outperforming state-of-the-art +lightweight models, and providing a robust and adaptable solution for +real-world crack segmentation. + +
+
+
+
+
+ + ☆ DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion + Probabilistic Models + + +
+ Perception systems play a crucial role in autonomous driving, incorporating +multiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors +are widely used to capture sparse point clouds of the vehicle's surroundings. +However, such systems struggle to perceive occluded areas and gaps in the scene +due to the sparsity of these point clouds and their lack of semantics. To +address these challenges, Semantic Scene Completion (SSC) jointly predicts +unobserved geometry and semantics in the scene given raw LiDAR measurements, +aiming for a more complete scene representation. Building on promising results +of diffusion models in image generation and super-resolution tasks, we propose +their extension to SSC by implementing the noising and denoising diffusion +processes in the point and semantic spaces individually. To control the +generation, we employ semantic LiDAR point clouds as conditional input and +design local and global regularization losses to stabilize the denoising +process. We evaluate our approach on autonomous driving datasets and our +approach outperforms the state-of-the-art for SSC. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Stable Video Portraits ECCV 2024 + + +
+ Rapid advances in the field of generative AI and text-to-image methods in +particular have transformed the way we interact with and perceive +computer-generated imagery today. In parallel, much progress has been made in +3D face reconstruction, using 3D Morphable Models (3DMM). In this paper, we +present SVP, a novel hybrid 2D/3D generation method that outputs photorealistic +videos of talking faces leveraging a large pre-trained text-to-image prior +(2D), controlled via a 3DMM (3D). Specifically, we introduce a person-specific +fine-tuning of a general 2D stable diffusion model which we lift to a video +model by providing temporal 3DMM sequences as conditioning and by introducing a +temporal denoising procedure. As an output, this model generates temporally +smooth imagery of a person with 3DMM-based controls, i.e., a person-specific +avatar. The facial appearance of this person-specific avatar can be edited and +morphed to text-defined celebrities, without any fine-tuning at test time. The +method is analyzed quantitatively and qualitatively, and we show that our +method outperforms state-of-the-art monocular head avatar methods. + +
+
+ comment: Accepted at ECCV 2024, Project: https://svp.is.tue.mpg.de +
+
+
+
+
+ + ☆ SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language + Models for Robotic Garment Manipulation + + +
+ Automating garment manipulation poses a significant challenge for assistive +robotics due to the diverse and deformable nature of garments. Traditional +approaches typically require separate models for each garment type, which +limits scalability and adaptability. In contrast, this paper presents a unified +approach using vision-language models (VLMs) to improve keypoint prediction +across various garment categories. By interpreting both visual and semantic +information, our model enables robots to manage different garment states with a +single model. We created a large-scale synthetic dataset using advanced +simulation techniques, allowing scalable training without extensive real-world +data. Experimental results indicate that the VLM-based method significantly +enhances keypoint detection accuracy and task success rates, providing a more +flexible and general solution for robotic garment manipulation. In addition, +this research also underscores the potential of VLMs to unify various garment +manipulation tasks within a single framework, paving the way for broader +applications in home automation and assistive robotics for future. + +
+
+
+
+
+ + ☆ FreeEdit: Mask-free Reference-based Image Editing with Multi-modal + Instruction + + +
+ Introducing user-specified visual concepts in image editing is highly +practical as these concepts convey the user's intent more precisely than +text-based descriptions. We propose FreeEdit, a novel approach for achieving +such reference-based image editing, which can accurately reproduce the visual +concept from the reference image based on user-friendly language instructions. +Our approach leverages the multi-modal instruction encoder to encode language +instructions to guide the editing process. This implicit way of locating the +editing area eliminates the need for manual editing masks. To enhance the +reconstruction of reference details, we introduce the Decoupled Residual +ReferAttention (DRRA) module. This module is designed to integrate fine-grained +reference features extracted by a detail extractor into the image editing +process in a residual way without interfering with the original self-attention. +Given that existing datasets are unsuitable for reference-based image editing +tasks, particularly due to the difficulty in constructing image triplets that +include a reference image, we curate a high-quality dataset, FreeBench, using a +newly developed twice-repainting scheme. FreeBench comprises the images before +and after editing, detailed editing instructions, as well as a reference image +that maintains the identity of the edited object, encompassing tasks such as +object addition, replacement, and deletion. By conducting phased training on +FreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot +editing through convenient language instructions. We conduct extensive +experiments to evaluate the effectiveness of FreeEdit across multiple task +types, demonstrating its superiority over existing methods. The code will be +available at: https://freeedit.github.io/. + +
+
+ comment: 14 pages, 14 figures, project website: https://freeedit.github.io/ +
+
+
+
+
+ + ☆ LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field ECCV'24 + + +
+ Recent works have shown that neural radiance fields (NeRFs) on top of +parametric models have reached SOTA quality to build photorealistic head +avatars from a monocular video. However, one major limitation of the NeRF-based +avatars is the slow rendering speed due to the dense point sampling of NeRF, +preventing them from broader utility on resource-constrained devices. We +introduce LightAvatar, the first head avatar model based on neural light fields +(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose +via a single network forward pass, without using mesh or volume rendering. The +proposed approach, while being conceptually appealing, poses a significant +challenge towards real-time efficiency and training stability. To resolve them, +we introduce dedicated network designs to obtain proper representations for the +NeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a +distillation-based training strategy that uses a pretrained avatar model as +teacher to synthesize abundant pseudo data for training. A warping field +network is introduced to correct the fitting error in the real data so that the +model can learn better. Extensive experiments suggest that our method can +achieve new SOTA image quality quantitatively or qualitatively, while being +significantly faster than the counterparts, reporting 174.1 FPS (512x512 +resolution) on a consumer-grade GPU (RTX3090) with no customized optimization. + +
+
+ comment: Appear in ECCV'24 CADL Workshop. Code: + https://github.com/MingSun-Tse/LightAvatar-TensorFlow +
+
+
+
+
+ + ☆ Visual Data Diagnosis and Debiasing with Concept Graphs + + +
+ The widespread success of deep learning models today is owed to the curation +of extensive datasets significant in size and complexity. However, such models +frequently pick up inherent biases in the data during the training process, +leading to unreliable predictions. Diagnosing and debiasing datasets is thus a +necessity to ensure reliable model performance. In this paper, we present +CONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence +Biases in visual datasets. CONBIAS represents visual datasets as knowledge +graphs of concepts, enabling meticulous analysis of spurious concept +co-occurrences to uncover concept imbalances across the whole dataset. +Moreover, we show that by employing a novel clique-based concept balancing +strategy, we can mitigate these imbalances, leading to enhanced performance on +downstream tasks. Extensive experiments show that data augmentation based on a +balanced concept distribution augmented by CONBIAS improves generalization +performance across multiple datasets compared to state-of-the-art methods. We +will make our code and data publicly available. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ EMOVA: Empowering Language Models to See, Hear and Speak with Vivid + Emotions + + +
+ GPT-4o, an omni-modal model that enables vocal conversations with diverse +emotions and tones, marks a milestone for omni-modal foundation models. +However, empowering Large Language Models to perceive and generate images, +texts, and speeches end-to-end with publicly available data remains challenging +in the open-source community. Existing vision-language models rely on external +tools for the speech processing, while speech-language models still suffer from +limited or even without vision-understanding abilities. To address this gap, we +propose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large +Language Models with end-to-end speech capabilities while maintaining the +leading vision-language performance. With a semantic-acoustic disentangled +speech tokenizer, we notice surprisingly that omni-modal alignment can further +enhance vision-language and speech abilities compared with the corresponding +bi-modal aligned counterparts. Moreover, a lightweight style module is proposed +for flexible speech style controls (e.g., emotions and pitches). For the first +time, EMOVA achieves state-of-the-art performance on both the vision-language +and speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue +with vivid emotions. + +
+
+ comment: Project Page: https://emova-ollm.github.io/ +
+
+
+
+
+ + ☆ ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty + Learning + + +
+ Vision-centric semantic occupancy prediction plays a crucial role in +autonomous driving, which requires accurate and reliable predictions from +low-cost sensors. Although having notably narrowed the accuracy gap with LiDAR, +there is still few research effort to explore the reliability in predicting +semantic occupancy from camera. In this paper, we conduct a comprehensive +evaluation of existing semantic occupancy prediction models from a reliability +perspective for the first time. Despite the gradual alignment of camera-based +models with LiDAR in term of accuracy, a significant reliability gap persists. +To addresses this concern, we propose ReliOcc, a method designed to enhance the +reliability of camera-based occupancy networks. ReliOcc provides a +plug-and-play scheme for existing models, which integrates hybrid uncertainty +from individual voxels with sampling-based noise and relative voxels through +mix-up learning. Besides, an uncertainty-aware calibration strategy is devised +to further enhance model reliability in offline mode. Extensive experiments +under various settings demonstrate that ReliOcc significantly enhances model +reliability while maintaining the accuracy of both geometric and semantic +predictions. Importantly, our proposed approach exhibits robustness to sensor +failures and out of domain noises during inference. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ Transferring disentangled representations: bridging the gap between + synthetic and real images + + +
+ Developing meaningful and efficient representations that separate the +fundamental structure of the data generation mechanism is crucial in +representation learning. However, Disentangled Representation Learning has not +fully shown its potential on real images, because of correlated generative +factors, their resolution and limited access to ground truth labels. +Specifically on the latter, we investigate the possibility of leveraging +synthetic data to learn general-purpose disentangled representations applicable +to real data, discussing the effect of fine-tuning and what properties of +disentanglement are preserved after the transfer. We provide an extensive +empirical study to address these issues. In addition, we propose a new +interpretable intervention-based metric, to measure the quality of factors +encoding in the representation. Our results indicate that some level of +disentanglement, transferring a representation from synthetic to real data, is +possible and effective. + +
+
+
+
+
+ + ☆ PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless + Imaging NeurIPS 2024 + + +
+ Lensless cameras offer significant advantages in size, weight, and cost +compared to traditional lens-based systems. Without a focusing lens, lensless +cameras rely on computational algorithms to recover the scenes from multiplexed +measurements. However, current algorithms struggle with inaccurate forward +imaging models and insufficient priors to reconstruct high-quality images. To +overcome these limitations, we introduce a novel two-stage approach for +consistent and photorealistic lensless image reconstruction. The first stage of +our approach ensures data consistency by focusing on accurately reconstructing +the low-frequency content with a spatially varying deconvolution method that +adjusts to changes in the Point Spread Function (PSF) across the camera's field +of view. The second stage enhances photorealism by incorporating a generative +prior from pre-trained diffusion models. By conditioning on the low-frequency +content retrieved in the first stage, the diffusion model effectively +reconstructs the high-frequency details that are typically lost in the lensless +imaging process, while also maintaining image fidelity. Our method achieves a +superior balance between data fidelity and visual quality compared to existing +methods, as demonstrated with two popular lensless systems, PhlatCam and +DiffuserCam. Project website: https://phocolens.github.io/. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ InterNet: Unsupervised Cross-modal Homography Estimation Based on + Interleaved Modality Transfer and Self-supervised Homography Prediction + + +
+ We propose a novel unsupervised cross-modal homography estimation framework, +based on interleaved modality transfer and self-supervised homography +prediction, named InterNet. InterNet integrates modality transfer and +self-supervised homography estimation, introducing an innovative interleaved +optimization framework to alternately promote both components. The modality +transfer gradually narrows the modality gaps, facilitating the self-supervised +homography estimation to fully leverage the synthetic intra-modal data. The +self-supervised homography estimation progressively achieves reliable +predictions, thereby providing robust cross-modal supervision for the modality +transfer. To further boost the estimation accuracy, we also formulate a +fine-grained homography feature loss to improve the connection between two +components. Furthermore, we employ a simple yet effective distillation training +technique to reduce model parameters and improve cross-domain generalization +ability while maintaining comparable performance. Experiments reveal that +InterNet achieves the state-of-the-art (SOTA) performance among unsupervised +methods, and even outperforms many supervised methods such as MHN and +LocalTrans. + +
+
+
+
+
+ + ☆ Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or + Low-light Conditions ECCV 2024 + + +
+ The stark contrast in the design philosophy of an event camera makes it +particularly ideal for operating under high-speed, high dynamic range and +low-light conditions, where standard cameras underperform. Nonetheless, event +cameras still suffer from some amount of motion blur, especially under these +challenging conditions, in contrary to what most think. This is attributed to +the limited bandwidth of the event sensor pixel, which is mostly proportional +to the light intensity. Thus, to ensure that event cameras can truly excel in +such conditions where it has an edge over standard cameras, it is crucial to +account for event motion blur in downstream applications, especially +reconstruction. However, none of the recent works on reconstructing Neural +Radiance Fields (NeRFs) from events, nor event simulators, have considered the +full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a +novel method to directly and effectively reconstruct blur-minimal NeRFs from +motion-blurred events generated under high-speed motion or low-light +conditions. The core component of this work is a physically-accurate pixel +bandwidth model proposed to account for event motion blur under arbitrary speed +and lighting conditions. We also introduce a novel threshold-normalized total +variation loss to improve the regularization of large textureless patches. +Experiments on real and novel realistically simulated sequences verify our +effectiveness. Our code, event simulator and synthetic event dataset will be +open-sourced. + +
+
+ comment: Accepted to ECCV 2024. Project website is accessible at + https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with + arXiv:2006.07722 by other authors +
+
+
+
+
+ + ☆ LLM4Brain: Training a Large Language Model for Brain Video Understanding ECCV2024 + + +
+ Decoding visual-semantic information from brain signals, such as functional +MRI (fMRI), across different subjects poses significant challenges, including +low signal-to-noise ratio, limited data availability, and cross-subject +variability. Recent advancements in large language models (LLMs) show +remarkable effectiveness in processing multimodal information. In this study, +we introduce an LLM-based approach for reconstructing visual-semantic +information from fMRI signals elicited by video stimuli. Specifically, we +employ fine-tuning techniques on an fMRI encoder equipped with adaptors to +transform brain responses into latent representations aligned with the video +stimuli. Subsequently, these representations are mapped to textual modality by +LLM. In particular, we integrate self-supervised domain adaptation methods to +enhance the alignment between visual-semantic information and brain responses. +Our proposed method achieves good results using various quantitative semantic +metrics, while yielding similarity with ground-truth information. + +
+
+ comment: ECCV2024 Workshop +
+
+
+
+
+ + ☆ BlinkTrack: Feature Tracking over 100 FPS via Events and Images + + +
+ Feature tracking is crucial for, structure from motion (SFM), simultaneous +localization and mapping (SLAM), object tracking and various computer vision +tasks. Event cameras, known for their high temporal resolution and ability to +capture asynchronous changes, have gained significant attention for their +potential in feature tracking, especially in challenging conditions. However, +event cameras lack the fine-grained texture information that conventional +cameras provide, leading to error accumulation in tracking. To address this, we +propose a novel framework, BlinkTrack, which integrates event data with RGB +images for high-frequency feature tracking. Our method extends the traditional +Kalman filter into a learning-based framework, utilizing differentiable Kalman +filters in both event and image branches. This approach improves +single-modality tracking, resolves ambiguities, and supports asynchronous data +fusion. We also introduce new synthetic and augmented datasets to better +evaluate our model. Experimental results indicate that BlinkTrack significantly +outperforms existing event-based methods, exceeding 100 FPS with preprocessed +event data and 80 FPS with multi-modality data. + +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ Cross-Modality Attack Boosted by Gradient-Evolutionary Multiform + Optimization + + +
+ In recent years, despite significant advancements in adversarial attack +research, the security challenges in cross-modal scenarios, such as the +transferability of adversarial attacks between infrared, thermal, and RGB +images, have been overlooked. These heterogeneous image modalities collected by +different hardware devices are widely prevalent in practical applications, and +the substantial differences between modalities pose significant challenges to +attack transferability. In this work, we explore a novel cross-modal +adversarial attack strategy, termed multiform attack. We propose a dual-layer +optimization framework based on gradient-evolution, facilitating efficient +perturbation transfer between modalities. In the first layer of optimization, +the framework utilizes image gradients to learn universal perturbations within +each modality and employs evolutionary algorithms to search for shared +perturbations with transferability across different modalities through +secondary optimization. Through extensive testing on multiple heterogeneous +datasets, we demonstrate the superiority and robustness of Multiform Attack +compared to existing techniques. This work not only enhances the +transferability of cross-modal adversarial attacks but also provides a new +perspective for understanding security vulnerabilities in cross-modal systems. + +
+
+
+
+
+ + ☆ CNCA: Toward Customizable and Natural Generation of Adversarial + Camouflage for Vehicle Detectors + + +
+ Prior works on physical adversarial camouflage against vehicle detectors +mainly focus on the effectiveness and robustness of the attack. The current +most successful methods optimize 3D vehicle texture at a pixel level. However, +this results in conspicuous and attention-grabbing patterns in the generated +camouflage, which humans can easily identify. To address this issue, we propose +a Customizable and Natural Camouflage Attack (CNCA) method by leveraging an +off-the-shelf pre-trained diffusion model. By sampling the optimal texture +image from the diffusion model with a user-specific text prompt, our method can +generate natural and customizable adversarial camouflage while maintaining high +attack performance. With extensive experiments on the digital and physical +worlds and user studies, the results demonstrate that our proposed method can +generate significantly more natural-looking camouflage than the +state-of-the-art baselines while achieving competitive attack performance. Our +code is available at +\href{https://anonymous.4open.science/r/CNCA-1D54}{https://anonymous.4open.science/r/CNCA-1D54} + +
+
+
+
+
+ + ☆ The Hard Positive Truth about Vision-Language Compositionality ECCV 2024 + + +
+ Several benchmarks have concluded that our best vision-language models (e.g., +CLIP) are lacking in compositionality. Given an image, these benchmarks probe a +model's ability to identify its associated caption amongst a set of +compositional distractors. In response, a surge of recent proposals show +improvements by finetuning CLIP with distractors as hard negatives. Our +investigations reveal that these improvements have, in fact, been significantly +overstated -- because existing benchmarks do not probe whether finetuned +vision-language models remain invariant to hard positives. By curating an +evaluation dataset with 112,382 hard negatives and hard positives, we uncover +that including hard positives decreases CLIP's performance by 12.9%, while +humans perform effortlessly at 99%. CLIP finetuned with hard negatives results +in an even larger decrease, up to 38.7%. With this finding, we then produce a +1,775,259 image-text training set with both hard negative and hard positive +captions. By training with both, we see improvements on existing benchmarks +while simultaneously improving performance on hard positives, indicating a more +robust improvement in compositionality. Our work suggests the need for future +research to rigorously test and improve CLIP's understanding of semantic +relationships between related "positive" concepts. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + Spatial Hierarchy and Temporal Attention Guided Cross Masking for + Self-supervised Skeleton-based Action Recognition + + +
+ In self-supervised skeleton-based action recognition, the mask reconstruction +paradigm is gaining interest in enhancing model refinement and robustness +through effective masking. However, previous works primarily relied on a single +masking criterion, resulting in the model overfitting specific features and +overlooking other effective information. In this paper, we introduce a +hierarchy and attention guided cross-masking framework (HA-CM) that applies +masking to skeleton sequences from both spatial and temporal perspectives. +Specifically, in spatial graphs, we utilize hyperbolic space to maintain joint +distinctions and effectively preserve the hierarchical structure of +high-dimensional skeletons, employing joint hierarchy as the masking criterion. +In temporal flows, we substitute traditional distance metrics with the global +attention of joints for masking, addressing the convergence of distances in +high-dimensional space and the lack of a global perspective. Additionally, we +incorporate cross-contrast loss based on the cross-masking framework into the +loss function to enhance the model's learning of instance-level features. HA-CM +shows efficiency and universality on three public large-scale datasets, NTU-60, +NTU-120, and PKU-MMD. The source code of our HA-CM is available at +https://github.com/YinxPeng/HA-CM-main. + +
+
+ comment: 12 pages,6 figures,IEEE Trans +
+
+
+
+
+ + ☆ Perturb, Attend, Detect and Localize (PADL): Robust Proactive Image + Defense + + +
+ Image manipulation detection and localization have received considerable +attention from the research community given the blooming of Generative Models +(GMs). Detection methods that follow a passive approach may overfit to specific +GMs, limiting their application in real-world scenarios, due to the growing +diversity of generative models. Recently, approaches based on a proactive +framework have shown the possibility of dealing with this limitation. However, +these methods suffer from two main limitations, which raises concerns about +potential vulnerabilities: i) the manipulation detector is not robust to noise +and hence can be easily fooled; ii) the fact that they rely on fixed +perturbations for image protection offers a predictable exploit for malicious +attackers, enabling them to reverse-engineer and evade detection. To overcome +this issue we propose PADL, a new solution able to generate image-specific +perturbations using a symmetric scheme of encoding and decoding based on +cross-attention, which drastically reduces the possibility of reverse +engineering, even when evaluated with adaptive attack [31]. Additionally, PADL +is able to pinpoint manipulated areas, facilitating the identification of +specific regions that have undergone alterations, and has more generalization +power than prior art on held-out generative models. Indeed, although being +trained only on an attribute manipulation GAN model [15], our method +generalizes to a range of unseen models with diverse architectural designs, +such as StarGANv2, BlendGAN, DiffAE, StableDiffusion and StableDiffusionXL. +Additionally, we introduce a novel evaluation protocol, which offers a fair +evaluation of localisation performance in function of detection accuracy and +better captures real-world scenarios. + +
+
+
+
+
+ + ☆ Neural Light Spheres for Implicit Image Stitching and View Synthesis + + +
+ Challenging to capture, and challenging to display on a cellphone screen, the +panorama paradoxically remains both a staple and underused feature of modern +mobile camera applications. In this work we address both of these challenges +with a spherical neural light field model for implicit panoramic image +stitching and re-rendering; able to accommodate for depth parallax, +view-dependent lighting, and local scene motion and color changes during +capture. Fit during test-time to an arbitrary path panoramic video capture -- +vertical, horizontal, random-walk -- these neural light spheres jointly +estimate the camera path and a high-resolution scene reconstruction to produce +novel wide field-of-view projections of the environment. Our single-layer model +avoids expensive volumetric sampling, and decomposes the scene into compact +view-dependent ray offset and color components, with a total model size of 80 +MB per scene, and real-time (50 FPS) rendering at 1080p resolution. We +demonstrate improved reconstruction quality over traditional image stitching +and radiance field methods, with significantly higher tolerance to scene motion +and non-ideal capture settings. + +
+
+ comment: Project site: https://light.princeton.edu/publication/neuls/ +
+
+
+
+
+ + ☆ Resolving Multi-Condition Confusion for Finetuning-Free Personalized + Image Generation + + +
+ Personalized text-to-image generation methods can generate customized images +based on the reference images, which have garnered wide research interest. +Recent methods propose a finetuning-free approach with a decoupled +cross-attention mechanism to generate personalized images requiring no +test-time finetuning. However, when multiple reference images are provided, the +current decoupled cross-attention mechanism encounters the object confusion +problem and fails to map each reference image to its corresponding object, +thereby seriously limiting its scope of application. To address the object +confusion problem, in this work we investigate the relevance of different +positions of the latent image features to the target object in diffusion model, +and accordingly propose a weighted-merge method to merge multiple reference +image features into the corresponding objects. Next, we integrate this +weighted-merge method into existing pre-trained models and continue to train +the model on a multi-object dataset constructed from the open-sourced SA-1B +dataset. To mitigate object confusion and reduce training costs, we propose an +object quality score to estimate the image quality for the selection of +high-quality training samples. Furthermore, our weighted-merge training +framework can be employed on single-object generation when a single object has +multiple reference images. The experiments verify that our method achieves +superior performance to the state-of-the-arts on the Concept101 dataset and +DreamBooth dataset of multi-object personalized image generation, and +remarkably improves the performance on single-object personalized image +generation. Our code is available at https://github.com/hqhQAQ/MIP-Adapter. + +
+
+
+
+
+ + ☆ WaSt-3D: Wasserstein-2 Distance for Scene-to-Scene Stylization on 3D + Gaussians + + +
+ While style transfer techniques have been well-developed for 2D image +stylization, the extension of these methods to 3D scenes remains relatively +unexplored. Existing approaches demonstrate proficiency in transferring colors +and textures but often struggle with replicating the geometry of the scenes. In +our work, we leverage an explicit Gaussian Splatting (GS) representation and +directly match the distributions of Gaussians between style and content scenes +using the Earth Mover's Distance (EMD). By employing the entropy-regularized +Wasserstein-2 distance, we ensure that the transformation maintains spatial +smoothness. Additionally, we decompose the scene stylization problem into +smaller chunks to enhance efficiency. This paradigm shift reframes stylization +from a pure generative process driven by latent space losses to an explicit +matching of distributions between two Gaussian representations. Our method +achieves high-resolution 3D stylization by faithfully transferring details from +3D style scenes onto the content scene. Furthermore, WaSt-3D consistently +delivers results across diverse content and style scenes without necessitating +any training, as it relies solely on optimization-based techniques. See our +project page for additional results and source code: +$\href{https://compvis.github.io/wast3d/}{https://compvis.github.io/wast3d/}$. + +
+
+
+
+
+ + ☆ LKA-ReID:Vehicle Re-Identification with Large Kernel Attention ICASSP 2025 + + +
+ With the rapid development of intelligent transportation systems and the +popularity of smart city infrastructure, Vehicle Re-ID technology has become an +important research field. The vehicle Re-ID task faces an important challenge, +which is the high similarity between different vehicles. Existing methods use +additional detection or segmentation models to extract differentiated local +features. However, these methods either rely on additional annotations or +greatly increase the computational cost. Using attention mechanism to capture +global and local features is crucial to solve the challenge of high similarity +between classes in vehicle Re-ID tasks. In this paper, we propose LKA-ReID with +large kernel attention. Specifically, the large kernel attention (LKA) utilizes +the advantages of self-attention and also benefits from the advantages of +convolution, which can extract the global and local features of the vehicle +more comprehensively. We also introduce hybrid channel attention (HCA) combines +channel attention with spatial information, so that the model can better focus +on channels and feature regions, and ignore background and other disturbing +information. Experiments on VeRi-776 dataset demonstrated the effectiveness of +LKA-ReID, with mAP reaches 86.65% and Rank-1 reaches 98.03%. + +
+
+ comment: The paper is under consideration at 2025 IEEE International + Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025) +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation with Large Kernel Attention ICASSP 2025 + + +
+ Self-supervised monocular depth estimation has emerged as a promising +approach since it does not rely on labeled training data. Most methods combine +convolution and Transformer to model long-distance dependencies to estimate +depth accurately. However, Transformer treats 2D image features as 1D +sequences, and positional encoding somewhat mitigates the loss of spatial +information between different feature blocks, tending to overlook channel +features, which limit the performance of depth estimation. In this paper, we +propose a self-supervised monocular depth estimation network to get finer +details. Specifically, we propose a decoder based on large kernel attention, +which can model long-distance dependencies without compromising the +two-dimension structure of features while maintaining feature channel +adaptivity. In addition, we introduce a up-sampling module to accurately +recover the fine details in the depth map. Our method achieves competitive +results on the KITTI dataset. + +
+
+ comment: The paper is under consideration at 2025 IEEE International + Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025) +
+
+
+
+
+ + ☆ Upper-Body Pose-based Gaze Estimation for Privacy-Preserving 3D Gaze + Target Detection ECCV 2024 + + +
+ Gaze Target Detection (GTD), i.e., determining where a person is looking +within a scene from an external viewpoint, is a challenging task, particularly +in 3D space. Existing approaches heavily rely on analyzing the person's +appearance, primarily focusing on their face to predict the gaze target. This +paper presents a novel approach to tackle this problem by utilizing the +person's upper-body pose and available depth maps to extract a 3D gaze +direction and employing a multi-stage or an end-to-end pipeline to predict the +gazed target. When predicted accurately, the human body pose can provide +valuable information about the head pose, which is a good approximation of the +gaze direction, as well as the position of the arms and hands, which are linked +to the activity the person is performing and the objects they are likely +focusing on. Consequently, in addition to performing gaze estimation in 3D, we +are also able to perform GTD simultaneously. We demonstrate state-of-the-art +results on the most comprehensive publicly accessible 3D gaze target detection +dataset without requiring images of the person's face, thus promoting privacy +preservation in various application contexts. The code is available at +https://github.com/intelligolabs/privacy-gtd-3D. + +
+
+ comment: Accepted in the T-CAP workshop at ECCV 2024 +
+
+
+
+
+ + ☆ Self-Distilled Depth Refinement with Noisy Poisson Fusion NeurIPS 2024 + + +
+ Depth refinement aims to infer high-resolution depth with fine-grained edges +and details, refining low-resolution results of depth estimation models. The +prevailing methods adopt tile-based manners by merging numerous patches, which +lacks efficiency and produces inconsistency. Besides, prior arts suffer from +fuzzy depth boundaries and limited generalizability. Analyzing the fundamental +reasons for these limitations, we model depth refinement as a noisy Poisson +fusion problem with local inconsistency and edge deformation noises. We propose +the Self-distilled Depth Refinement (SDDR) framework to enforce robustness +against the noises, which mainly consists of depth edge representation and +edge-based guidance. With noisy depth predictions as input, SDDR generates +low-noise depth edge representations as pseudo-labels by coarse-to-fine +self-distillation. Edge-based guidance with edge-guided gradient loss and +edge-based fusion loss serves as the optimization objective equivalent to +Poisson fusion. When depth maps are better refined, the labels also become more +noise-free. Our model can acquire strong robustness to the noises, achieving +significant improvements in accuracy, edge quality, efficiency, and +generalizability on five different benchmarks. Moreover, directly training +another model with edge labels produced by SDDR brings improvements, suggesting +that our method could help with training robust refinement models in future +works. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Visualization of Age Distributions as Elements of Medical Data-Stories + + +
+ In various fields, including medicine, age distributions are crucial. Despite +widespread media coverage of health topics, there remains a need to enhance +health communication. Narrative medical visualization is promising for +improving information comprehension and retention. This study explores the most +effective ways to present age distributions of diseases through narrative +visualizations. We conducted a thorough analysis of existing visualizations, +held workshops with a broad audience, and reviewed relevant literature. From +this, we identified design choices focusing on comprehension, aesthetics, +engagement, and memorability. We specifically tested three pictogram variants: +pictograms as bars, stacked pictograms, and annotations. After evaluating 18 +visualizations with 72 participants and three expert reviews, we determined +that annotations were most effective for comprehension and aesthetics. However, +traditional bar charts were preferred for engagement, and other variants were +more memorable. The study provides a set of design recommendations based on +these insights. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts ECCV 2024 + + +
+ Monocular depth estimation is a critical task for autonomous driving and many +other computer vision applications. While significant progress has been made in +this field, the effects of viewpoint shifts on depth estimation models remain +largely underexplored. This paper introduces a novel dataset and evaluation +methodology to quantify the impact of different camera positions and +orientations on monocular depth estimation performance. We propose a ground +truth strategy based on homography estimation and object detection, eliminating +the need for expensive lidar sensors. We collect a diverse dataset of road +scenes from multiple viewpoints and use it to assess the robustness of a modern +depth estimation model to geometric shifts. After assessing the validity of our +strategy on a public dataset, we provide valuable insights into the limitations +of current models and highlight the importance of considering viewpoint +variations in real-world applications. + +
+
+ comment: 17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on + Vision-Centric Autonomous Driving (VCAD) +
+
+
+
+
+ + ☆ Unsupervised Learning Based Multi-Scale Exposure Fusion + + +
+ Unsupervised learning based multi-scale exposure fusion (ULMEF) is efficient +for fusing differently exposed low dynamic range (LDR) images into a higher +quality LDR image for a high dynamic range (HDR) scene. Unlike supervised +learning, loss functions play a crucial role in the ULMEF. In this paper, novel +loss functions are proposed for the ULMEF and they are defined by using all the +images to be fused and other differently exposed images from the same HDR +scene. The proposed loss functions can guide the proposed ULMEF to learn more +reliable information from the HDR scene than existing loss functions which are +defined by only using the set of images to be fused. As such, the quality of +the fused image is significantly improved. The proposed ULMEF also adopts a +multi-scale strategy that includes a multi-scale attention module to +effectively preserve the scene depth and local contrast in the fused image. +Meanwhile, the proposed ULMEF can be adopted to achieve exposure interpolation +and exposure extrapolation. Extensive experiments show that the proposed ULMEF +algorithm outperforms state-of-the-art exposure fusion algorithms. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Kendall's $τ$ Coefficient for Logits Distillation + + +
+ Knowledge distillation typically employs the Kullback-Leibler (KL) divergence +to constrain the student model's output to match the soft labels provided by +the teacher model exactly. However, sometimes the optimization direction of the +KL divergence loss is not always aligned with the task loss, where a smaller KL +divergence could lead to erroneous predictions that diverge from the soft +labels. This limitation often results in suboptimal optimization for the +student. Moreover, even under temperature scaling, the KL divergence loss +function tends to overly focus on the larger-valued channels in the logits, +disregarding the rich inter-class information provided by the multitude of +smaller-valued channels. This hard constraint proves too challenging for +lightweight students, hindering further knowledge distillation. To address this +issue, we propose a plug-and-play ranking loss based on Kendall's $\tau$ +coefficient, called Rank-Kendall Knowledge Distillation (RKKD). RKKD balances +the attention to smaller-valued channels by constraining the order of channel +values in student logits, providing more inter-class relational information. +The rank constraint on the top-valued channels helps avoid suboptimal traps +during optimization. We also discuss different differentiable forms of +Kendall's $\tau$ coefficient and demonstrate that the proposed ranking loss +function shares a consistent optimization objective with the KL divergence. +Extensive experiments on the CIFAR-100 and ImageNet datasets show that our RKKD +can enhance the performance of various knowledge distillation baselines and +offer broad improvements across multiple teacher-student architecture +combinations. + +
+
+
+
+
+ + ☆ Cascade Prompt Learning for Vision-Language Model Adaptation ECCV2024 + + +
+ Prompt learning has surfaced as an effective approach to enhance the +performance of Vision-Language Models (VLMs) like CLIP when applied to +downstream tasks. However, current learnable prompt tokens are primarily used +for the single phase of adapting to tasks (i.e., adapting prompt), easily +leading to overfitting risks. In this work, we propose a novel Cascade Prompt +Learning CasPL framework to enable prompt learning to serve both generic and +specific expertise (i.e., boosting and adapting prompt) simultaneously. +Specifically, CasPL is a new learning paradigm comprising two distinct phases +of learnable prompts: the first boosting prompt is crafted to extract +domain-general knowledge from a senior larger CLIP teacher model by aligning +their predicted logits using extensive unlabeled domain images. The second +adapting prompt is then cascaded with the frozen first set to fine-tune the +downstream tasks, following the approaches employed in prior research. In this +manner, CasPL can effectively capture both domain-general and task-specific +representations into explicitly different gradual groups of prompts, thus +potentially alleviating overfitting issues in the target domain. It's worth +noting that CasPL serves as a plug-and-play module that can seamlessly +integrate into any existing prompt learning approach. CasPL achieves a +significantly better balance between performance and inference speed, which is +especially beneficial for deploying smaller VLM models in resource-constrained +environments. Compared to the previous state-of-the-art method PromptSRC, CasPL +shows an average improvement of 1.85% for base classes, 3.44% for novel +classes, and 2.72% for the harmonic mean over 11 image classification datasets. +Code is publicly available at: https://github.com/megvii-research/CasPL. + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ Reblurring-Guided Single Image Defocus Deblurring: A Learning Framework + with Misaligned Training Pairs + + +
+ For single image defocus deblurring, acquiring well-aligned training pairs +(or training triplets), i.e., a defocus blurry image, an all-in-focus sharp +image (and a defocus blur map), is an intricate task for the development of +deblurring models. Existing image defocus deblurring methods typically rely on +training data collected by specialized imaging equipment, presupposing that +these pairs or triplets are perfectly aligned. However, in practical scenarios +involving the collection of real-world data, direct acquisition of training +triplets is infeasible, and training pairs inevitably encounter spatial +misalignment issues. In this work, we introduce a reblurring-guided learning +framework for single image defocus deblurring, enabling the learning of a +deblurring network even with misaligned training pairs. Specifically, we first +propose a baseline defocus deblurring network that utilizes spatially varying +defocus blur map as degradation prior to enhance the deblurring performance. +Then, to effectively learn the baseline defocus deblurring network with +misaligned training pairs, our reblurring module ensures spatial consistency +between the deblurred image, the reblurred image and the input blurry image by +reconstructing spatially variant isotropic blur kernels. Moreover, the +spatially variant blur derived from the reblurring module can serve as pseudo +supervision for defocus blur map during training, interestingly transforming +training pairs into training triplets. Additionally, we have collected a new +dataset specifically for single image defocus deblurring (SDD) with typical +misalignments, which not only substantiates our proposed method but also serves +as a benchmark for future research. + +
+
+ comment: The source code and dataset are available at + https://github.com/ssscrystal/Reblurring-guided-JDRL +
+
+
+
+
+ + ☆ CASPFormer: Trajectory Prediction from BEV Images with Deformable + Attention ICPR 2024 + + +
+ Motion prediction is an important aspect for Autonomous Driving (AD) and +Advance Driver Assistance Systems (ADAS). Current state-of-the-art motion +prediction methods rely on High Definition (HD) maps for capturing the +surrounding context of the ego vehicle. Such systems lack scalability in +real-world deployment as HD maps are expensive to produce and update in +real-time. To overcome this issue, we propose Context Aware Scene Prediction +Transformer (CASPFormer), which can perform multi-modal motion prediction from +rasterized Bird-Eye-View (BEV) images. Our system can be integrated with any +upstream perception module that is capable of generating BEV images. Moreover, +CASPFormer directly decodes vectorized trajectories without any postprocessing. +Trajectories are decoded recurrently using deformable attention, as it is +computationally efficient and provides the network with the ability to focus +its attention on the important spatial locations of the BEV images. In +addition, we also address the issue of mode collapse for generating multiple +scene-consistent trajectories by incorporating learnable mode queries. We +evaluate our model on the nuScenes dataset and show that it reaches +state-of-the-art across multiple metrics + +
+
+ comment: Under Review at ICPR 2024, Kolkata +
+
+
+
+
+ + ☆ Taming Diffusion Prior for Image Super-Resolution with Domain Shift SDEs NeurIPS 2024 + + +
+ Diffusion-based image super-resolution (SR) models have attracted substantial +interest due to their powerful image restoration capabilities. However, +prevailing diffusion models often struggle to strike an optimal balance between +efficiency and performance. Typically, they either neglect to exploit the +potential of existing extensive pretrained models, limiting their generative +capacity, or they necessitate a dozens of forward passes starting from random +noises, compromising inference efficiency. In this paper, we present DoSSR, a +Domain Shift diffusion-based SR model that capitalizes on the generative powers +of pretrained diffusion models while significantly enhancing efficiency by +initiating the diffusion process with low-resolution (LR) images. At the core +of our approach is a domain shift equation that integrates seamlessly with +existing diffusion models. This integration not only improves the use of +diffusion prior but also boosts inference efficiency. Moreover, we advance our +method by transitioning the discrete shift process to a continuous formulation, +termed as DoS-SDEs. This advancement leads to the fast and customized solvers +that further enhance sampling efficiency. Empirical results demonstrate that +our proposed method achieves state-of-the-art performance on synthetic and +real-world datasets, while notably requiring only 5 sampling steps. Compared to +previous diffusion prior based methods, our approach achieves a remarkable +speedup of 5-7 times, demonstrating its superior efficiency. Code: +https://github.com/QinpengCui/DoSSR. + +
+
+ comment: This paper is accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Harnessing Shared Relations via Multimodal Mixup Contrastive Learning + for Multimodal Classification + + +
+ Deep multimodal learning has shown remarkable success by leveraging +contrastive learning to capture explicit one-to-one relations across +modalities. However, real-world data often exhibits shared relations beyond +simple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive +Learning approach to capture nuanced shared relations inherent in multimodal +data. Our key contribution is a Mixup-based contrastive loss that learns robust +representations by aligning mixed samples from one modality with their +corresponding samples from other modalities thereby capturing shared relations +between them. For multimodal classification tasks, we introduce a framework +that integrates a fusion module with unimodal prediction modules for auxiliary +supervision during training, complemented by our proposed Mixup-based +contrastive loss. Through extensive experiments on diverse datasets (N24News, +ROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures +shared multimodal relations and generalizes across domains. It outperforms +state-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving +comparable performance on Food-101. Our work highlights the significance of +learning shared relations for robust multimodal learning, opening up promising +avenues for future research. + +
+
+ comment: RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9 + Tables +
+
+
+
+
+ + ☆ UNICORN: A Deep Learning Model for Integrating Multi-Stain Data in + Histopathology + + +
+ Background: The integration of multi-stain histopathology images through deep +learning poses a significant challenge in digital histopathology. Current +multi-modal approaches struggle with data heterogeneity and missing data. This +study aims to overcome these limitations by developing a novel transformer +model for multi-stain integration that can handle missing data during training +as well as inference. Methods: We propose UNICORN (UNiversal modality +Integration Network for CORonary classificatioN) a multi-modal transformer +capable of processing multi-stain histopathology for atherosclerosis severity +class prediction. The architecture comprises a two-stage, end-to-end trainable +model with specialized modules utilizing transformer self-attention blocks. The +initial stage employs domain-specific expert modules to extract features from +each modality. In the subsequent stage, an aggregation expert module integrates +these features by learning the interactions between the different data +modalities. Results: Evaluation was performed using a multi-class dataset of +atherosclerotic lesions from the Munich Cardiovascular Studies Biobank +(MISSION), using over 4,000 paired multi-stain whole slide images (WSIs) from +170 deceased individuals on 7 prespecified segments of the coronary tree, each +stained according to four histopathological protocols. UNICORN achieved a +classification accuracy of 0.67, outperforming other state-of-the-art models. +The model effectively identifies relevant tissue phenotypes across stainings +and implicitly models disease progression. Conclusion: Our proposed multi-modal +transformer model addresses key challenges in medical data analysis, including +data heterogeneity and missing modalities. Explainability and the model's +effectiveness in predicting atherosclerosis progression underscores its +potential for broader applications in medical research. + +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ LGFN: Lightweight Light Field Image Super-Resolution using Local + Convolution Modulation and Global Attention Feature Extraction + + +
+ Capturing different intensity and directions of light rays at the same scene +Light field (LF) can encode the 3D scene cues into a 4D LF image which has a +wide range of applications (i.e. post-capture refocusing and depth sensing). LF +image super-resolution (SR) aims to improve the image resolution limited by the +performance of LF camera sensor. Although existing methods have achieved +promising results the practical application of these models is limited because +they are not lightweight enough. In this paper we propose a lightweight model +named LGFN which integrates the local and global features of different views +and the features of different channels for LF image SR. Specifically owing to +neighboring regions of the same pixel position in different sub-aperture images +exhibit similar structural relationships we design a lightweight CNN-based +feature extraction module (namely DGCE) to extract local features better +through feature modulation. Meanwhile as the position beyond the boundaries in +the LF image presents a large disparity we propose an efficient spatial +attention module (namely ESAM) which uses decomposable large-kernel convolution +to obtain an enlarged receptive field and an efficient channel attention module +(namely ECAM). Compared with the existing LF image SR models with large +parameter our model has a parameter of 0.45M and a FLOPs of 19.33G which has +achieved a competitive effect. Extensive experiments with ablation studies +demonstrate the effectiveness of our proposed method which ranked the second +place in the Track 2 Fidelity & Efficiency of NTIRE2024 Light Field Super +Resolution Challenge and the seventh place in the Track 1 Fidelity. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Text Image Generation for Low-Resource Languages with Dual Translation + Learning + + +
+ Scene text recognition in low-resource languages frequently faces challenges +due to the limited availability of training datasets derived from real-world +scenes. This study proposes a novel approach that generates text images in +low-resource languages by emulating the style of real text images from +high-resource languages. Our approach utilizes a diffusion model that is +conditioned on binary states: ``synthetic'' and ``real.'' The training of this +model involves dual translation tasks, where it transforms plain text images +into either synthetic or real text images, based on the binary states. This +approach not only effectively differentiates between the two domains but also +facilitates the model's explicit recognition of characters in the target +language. Furthermore, to enhance the accuracy and variety of generated text +images, we introduce two guidance techniques: Fidelity-Diversity Balancing +Guidance and Fidelity Enhancement Guidance. Our experimental results +demonstrate that the text images generated by our proposed framework can +significantly improve the performance of scene text recognition models for +low-resource languages. + +
+
+ comment: 23 pages, 11 figures +
+
+
+
+
+ + ☆ AnyLogo: Symbiotic Subject-Driven Diffusion System with Gemini Status + + +
+ Diffusion models have made compelling progress on facilitating +high-throughput daily production. Nevertheless, the appealing customized +requirements are remain suffered from instance-level finetuning for authentic +fidelity. Prior zero-shot customization works achieve the semantic consistence +through the condensed injection of identity features, while addressing detailed +low-level signatures through complex model configurations and subject-specific +fabrications, which significantly break the statistical coherence within the +overall system and limit the applicability across various scenarios. To +facilitate the generic signature concentration with rectified efficiency, we +present \textbf{AnyLogo}, a zero-shot region customizer with remarkable detail +consistency, building upon the symbiotic diffusion system with eliminated +cumbersome designs. Streamlined as vanilla image generation, we discern that +the rigorous signature extraction and creative content generation are +promisingly compatible and can be systematically recycled within a single +denoising model. In place of the external configurations, the gemini status of +the denoising model promote the reinforced subject transmission efficiency and +disentangled semantic-signature space with continuous signature decoration. +Moreover, the sparse recycling paradigm is adopted to prevent the duplicated +risk with compressed transmission quota for diversified signature stimulation. +Extensive experiments on constructed logo-level benchmarks demonstrate the +effectiveness and practicability of our methods. + +
+
+ comment: 13 pages, 12 figures +
+
+
+
+
+ + ☆ Neural Implicit Representation for Highly Dynamic LiDAR Mapping and + Odometry + + +
+ Recent advancements in Simultaneous Localization and Mapping (SLAM) have +increasingly highlighted the robustness of LiDAR-based techniques. At the same +time, Neural Radiance Fields (NeRF) have introduced new possibilities for 3D +scene reconstruction, exemplified by SLAM systems. Among these, NeRF-LOAM has +shown notable performance in NeRF-based SLAM applications. However, despite its +strengths, these systems often encounter difficulties in dynamic outdoor +environments due to their inherent static assumptions. To address these +limitations, this paper proposes a novel method designed to improve +reconstruction in highly dynamic outdoor scenes. Based on NeRF-LOAM, the +proposed approach consists of two primary components. First, we separate the +scene into static background and dynamic foreground. By identifying and +excluding dynamic elements from the mapping process, this segmentation enables +the creation of a dense 3D map that accurately represents the static background +only. The second component extends the octree structure to support +multi-resolution representation. This extension not only enhances +reconstruction quality but also aids in the removal of dynamic objects +identified by the first module. Additionally, Fourier feature encoding is +applied to the sampled points, capturing high-frequency information and leading +to more complete reconstruction results. Evaluations on various datasets +demonstrate that our method achieves more competitive results compared to +current state-of-the-art approaches. + +
+
+
+
+
+ + ☆ AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with + Alternative Modality Masking NeurIPS 2024 + + +
+ Camera-LiDAR fusion models significantly enhance perception performance in +autonomous driving. The fusion mechanism leverages the strengths of each +modality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR +fusion models utilize pre-trained backbones for efficient training. However, we +argue that directly loading single-modal pre-trained camera and LiDAR backbones +into camera-LiDAR fusion models introduces similar feature redundancy across +modalities due to the nature of the fusion mechanism. Unfortunately, existing +pruning methods are developed explicitly for single-modal models, and thus, +they struggle to effectively identify these specific redundant parameters in +camera-LiDAR fusion models. In this paper, to address the issue above on +camera-LiDAR fusion models, we propose a novelty pruning framework Alternative +Modality Masking Pruning (AlterMOMA), which employs alternative masking on each +modality and identifies the redundant parameters. Specifically, when one +modality parameters are masked (deactivated), the absence of features from the +masked backbone compels the model to reactivate previous redundant features of +the other modality backbone. Therefore, these redundant features and relevant +redundant parameters can be identified via the reactivation process. The +redundant parameters can be pruned by our proposed importance score evaluation +function, Alternative Evaluation (AlterEva), which is based on the observation +of the loss changes when certain modality parameters are activated and +deactivated. Extensive experiments on the nuScene and KITTI datasets +encompassing diverse tasks, baseline models, and pruning algorithms showcase +that AlterMOMA outperforms existing pruning methods, attaining state-of-the-art +performance. + +
+
+ comment: 17 pages, 3 figures, Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications + + +
+ Vision language models have played a key role in extracting meaningful +features for various robotic applications. Among these, Contrastive +Language-Image Pretraining (CLIP) is widely used in robotic tasks that require +both vision and natural language understanding. However, CLIP was trained +solely on static images paired with text prompts and has not yet been fully +adapted for robotic tasks involving dynamic actions. In this paper, we +introduce Robotic-CLIP to enhance robotic perception capabilities. We first +gather and label large-scale action data, and then build our Robotic-CLIP by +fine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using +contrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's +strong image performance while gaining the ability to understand actions in +robotic contexts. Intensive experiments show that our Robotic-CLIP outperforms +other CLIP-based models across various language-driven robotic tasks. +Additionally, we demonstrate the practical effectiveness of Robotic-CLIP in +real-world grasping applications. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations + Between Initial and Final Scenes + + +
+ With robots increasingly collaborating with humans in everyday tasks, it is +important to take steps toward robotic systems capable of understanding the +environment. This work focuses on scene understanding to detect pick and place +tasks given initial and final images from the scene. To this end, a dataset is +collected for object detection and pick and place task detection. A YOLOv5 +network is subsequently trained to detect the objects in the initial and final +scenes. Given the detected objects and their bounding boxes, two methods are +proposed to detect the pick and place tasks which transform the initial scene +into the final scene. A geometric method is proposed which tracks objects' +movements in the two scenes and works based on the intersection of the bounding +boxes which moved within scenes. Contrarily, the CNN-based method utilizes a +Convolutional Neural Network to classify objects with intersected bounding +boxes into 5 classes, showing the spatial relationship between the involved +objects. The performed pick and place tasks are then derived from analyzing the +experiments with both scenes. Results show that the CNN-based method, using a +VGG16 backbone, outscores the geometric method by roughly 12 percentage points +in certain scenarios, with an overall success rate of 84.3%. + +
+
+ comment: Conference Paper, ICEE 2024, 7 pages, 5 figures +
+
+
+
+
+ + ☆ Behaviour4All: in-the-wild Facial Behaviour Analysis Toolkit + + +
+ In this paper, we introduce Behavior4All, a comprehensive, open-source +toolkit for in-the-wild facial behavior analysis, integrating Face +Localization, Valence-Arousal Estimation, Basic Expression Recognition and +Action Unit Detection, all within a single framework. Available in both +CPU-only and GPU-accelerated versions, Behavior4All leverages 12 large-scale, +in-the-wild datasets consisting of over 5 million images from diverse +demographic groups. It introduces a novel framework that leverages distribution +matching and label co-annotation to address tasks with non-overlapping +annotations, encoding prior knowledge of their relatedness. In the largest +study of its kind, Behavior4All outperforms both state-of-the-art and toolkits +in overall performance as well as fairness across all databases and tasks. It +also demonstrates superior generalizability on unseen databases and on compound +expression recognition. Finally, Behavior4All is way times faster than other +toolkits. + +
+
+
+
+
+ + ☆ MoGenTS: Motion Generation based on Spatial-Temporal Joint Modeling NeurIPS 2024 + + +
+ Motion generation from discrete quantization offers many advantages over +continuous regression, but at the cost of inevitable approximation errors. +Previous methods usually quantize the entire body pose into one code, which not +only faces the difficulty in encoding all joints within one vector but also +loses the spatial relationship between different joints. Differently, in this +work we quantize each individual joint into one vector, which i) simplifies the +quantization process as the complexity associated with a single joint is +markedly lower than that of the entire pose; ii) maintains a spatial-temporal +structure that preserves both the spatial relationships among joints and the +temporal movement patterns; iii) yields a 2D token map, which enables the +application of various 2D operations widely used in 2D images. Grounded in the +2D motion quantization, we build a spatial-temporal modeling framework, where +2D joint VQVAE, temporal-spatial 2D masking technique, and spatial-temporal 2D +attention are proposed to take advantage of spatial-temporal signals among the +2D tokens. Extensive experiments demonstrate that our method significantly +outperforms previous methods across different datasets, with a $26.6\%$ +decrease of FID on HumanML3D and a $29.9\%$ decrease on KIT-ML. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Dark Miner: Defend against unsafe generation for text-to-image diffusion + models + + +
+ Text-to-image diffusion models have been demonstrated with unsafe generation +due to unfiltered large-scale training data, such as violent, sexual, and +shocking images, necessitating the erasure of unsafe concepts. Most existing +methods focus on modifying the generation probabilities conditioned on the +texts containing unsafe descriptions. However, they fail to guarantee safe +generation for unseen texts in the training phase, especially for the prompts +from adversarial attacks. In this paper, we re-analyze the erasure task and +point out that existing methods cannot guarantee the minimization of the total +probabilities of unsafe generation. To tackle this problem, we propose Dark +Miner. It entails a recurring three-stage process that comprises mining, +verifying, and circumventing. It greedily mines embeddings with maximum +generation probabilities of unsafe concepts and reduces unsafe generation more +effectively. In the experiments, we evaluate its performance on two +inappropriate concepts, two objects, and two styles. Compared with 6 previous +state-of-the-art methods, our method achieves better erasure and defense +results in most cases, especially under 4 state-of-the-art attacks, while +preserving the model's native generation capability. Our code will be available +on GitHub. + +
+
+
+
+
+ + ☆ Event-based Stereo Depth Estimation: A Survey + + +
+ Stereopsis has widespread appeal in robotics as it is the predominant way by +which living beings perceive depth to navigate our 3D world. Event cameras are +novel bio-inspired sensors that detect per-pixel brightness changes +asynchronously, with very high temporal resolution and high dynamic range, +enabling machine perception in high-speed motion and broad illumination +conditions. The high temporal precision also benefits stereo matching, making +disparity (depth) estimation a popular research area for event cameras ever +since its inception. Over the last 30 years, the field has evolved rapidly, +from low-latency, low-power circuit design to current deep learning (DL) +approaches driven by the computer vision community. The bibliography is vast +and difficult to navigate for non-experts due its highly interdisciplinary +nature. Past surveys have addressed distinct aspects of this topic, in the +context of applications, or focusing only on a specific class of techniques, +but have overlooked stereo datasets. This survey provides a comprehensive +overview, covering both instantaneous stereo and long-term methods suitable for +simultaneous localization and mapping (SLAM), along with theoretical and +empirical comparisons. It is the first to extensively review DL methods as well +as stereo datasets, even providing practical suggestions for creating new +benchmarks to advance the field. The main advantages and challenges faced by +event-based stereo depth estimation are also discussed. Despite significant +progress, challenges remain in achieving optimal performance in not only +accuracy but also efficiency, a cornerstone of event-based computing. We +identify several gaps and propose future research directions. We hope this +survey inspires future research in this area, by serving as an accessible entry +point for newcomers, as well as a practical guide for seasoned researchers in +the community. + +
+
+ comment: 28 pages, 20 figures, 7 tables +
+
+
+
+
+ + ☆ EM-Net: Efficient Channel and Frequency Learning with Mamba for 3D + Medical Image Segmentation MICCAI 2024 + + +
+ Convolutional neural networks have primarily led 3D medical image +segmentation but may be limited by small receptive fields. Transformer models +excel in capturing global relationships through self-attention but are +challenged by high computational costs at high resolutions. Recently, Mamba, a +state space model, has emerged as an effective approach for sequential +modeling. Inspired by its success, we introduce a novel Mamba-based 3D medical +image segmentation model called EM-Net. It not only efficiently captures +attentive interaction between regions by integrating and selecting channels, +but also effectively utilizes frequency domain to harmonize the learning of +features across varying scales, while accelerating training speed. +Comprehensive experiments on two challenging multi-organ datasets with other +state-of-the-art (SOTA) algorithms show that our method exhibits better +segmentation accuracy while requiring nearly half the parameter size of SOTA +models and 2x faster training speed. + +
+
+ comment: 10 pages, 3 figures, accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ Self-Supervised Learning of Deviation in Latent Representation for + Co-speech Gesture Video Generation + + +
+ Gestures are pivotal in enhancing co-speech communication. While recent works +have mostly focused on point-level motion transformation or fully supervised +motion representations through data-driven approaches, we explore the +representation of gestures in co-speech, with a focus on self-supervised +representation and pixel-level motion deviation, utilizing a diffusion model +which incorporates latent motion features. Our approach leverages +self-supervised deviation in latent representation to facilitate hand gestures +generation, which are crucial for generating realistic gesture videos. Results +of our first experiment demonstrate that our method enhances the quality of +generated videos, with an improvement from 2.7 to 4.5% for FGD, DIV, and FVD, +and 8.1% for PSNR, 2.5% for SSIM over the current state-of-the-art methods. + +
+
+ comment: 5 pages, 5 figures, conference +
+
+
+
+
+ + ☆ Leveraging Anthropometric Measurements to Improve Human Mesh Estimation + and Ensure Consistent Body Shapes + + +
+ The basic body shape of a person does not change within a single video. +However, most SOTA human mesh estimation (HME) models output a slightly +different body shape for each video frame, which results in inconsistent body +shapes for the same person. In contrast, we leverage anthropometric +measurements like tailors are already obtaining from humans for centuries. We +create a model called A2B that converts such anthropometric measurements to +body shape parameters of human mesh models. Moreover, we find that finetuned +SOTA 3D human pose estimation (HPE) models outperform HME models regarding the +precision of the estimated keypoints. We show that applying inverse kinematics +(IK) to the results of such a 3D HPE model and combining the resulting body +pose with the A2B body shape leads to superior and consistent human meshes for +challenging datasets like ASPset or fit3D, where we can lower the MPJPE by over +30 mm compared to SOTA HME models. Further, replacing HME models estimates of +the body shape parameters with A2B model results not only increases the +performance of these HME models, but also leads to consistent body shapes. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Provable Performance Guarantees of Copy Detection Patterns + + +
+ Copy Detection Patterns (CDPs) are crucial elements in modern security +applications, playing a vital role in safeguarding industries such as food, +pharmaceuticals, and cosmetics. Current performance evaluations of CDPs +predominantly rely on empirical setups using simplistic metrics like Hamming +distances or Pearson correlation. These methods are often inadequate due to +their sensitivity to distortions, degradation, and their limitations to +stationary statistics of printing and imaging. Additionally, machine +learning-based approaches suffer from distribution biases and fail to +generalize to unseen counterfeit samples. Given the critical importance of CDPs +in preventing counterfeiting, including the counterfeit vaccines issue +highlighted during the COVID-19 pandemic, there is an urgent need for provable +performance guarantees across various criteria. This paper aims to establish a +theoretical framework to derive optimal criteria for the analysis, +optimization, and future development of CDP authentication technologies, +ensuring their reliability and effectiveness in diverse security scenarios. + +
+
+
+
+
+ + ☆ MECD: Unlocking Multi-Event Causal Discovery in Video Reasoning NeurIPS 2024 + + +
+ Video causal reasoning aims to achieve a high-level understanding of video +content from a causal perspective. However, current video reasoning tasks are +limited in scope, primarily executed in a question-answering paradigm and +focusing on short videos containing only a single event and simple causal +relationships, lacking comprehensive and structured causality analysis for +videos with multiple events. To fill this gap, we introduce a new task and +dataset, Multi-Event Causal Discovery (MECD). It aims to uncover the causal +relationships between events distributed chronologically across long videos. +Given visual segments and textual descriptions of events, MECD requires +identifying the causal associations between these events to derive a +comprehensive, structured event-level video causal diagram explaining why and +how the final result event occurred. To address MECD, we devise a novel +framework inspired by the Granger Causality method, using an efficient +mask-based event prediction model to perform an Event Granger Test, which +estimates causality by comparing the predicted result event when premise events +are masked versus unmasked. Furthermore, we integrate causal inference +techniques such as front-door adjustment and counterfactual inference to +address challenges in MECD like causality confounding and illusory causality. +Experiments validate the effectiveness of our framework in providing causal +relationships in multi-event videos, outperforming GPT-4o and VideoLLaVA by +5.7% and 4.1%, respectively. + +
+
+ comment: Accepted at NeurIPS 2024 as a spotlight paper +
+
+
+
+
+ + ☆ P4Q: Learning to Prompt for Quantization in Visual-language Models + + +
+ Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence +in various visual and multimodal tasks, yet the deployment of VLMs on +downstream application platforms remains challenging due to their prohibitive +requirements of training samples and computing resources. Fine-tuning and +quantization of VLMs can substantially reduce the sample and computation costs, +which are in urgent need. There are two prevailing paradigms in quantization, +Quantization-Aware Training (QAT) can effectively quantize large-scale VLMs but +incur a huge training cost, while low-bit Post-Training Quantization (PTQ) +suffers from a notable performance drop. We propose a method that balances +fine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which +we design a lightweight architecture to leverage contrastive loss supervision +to enhance the recognition performance of a PTQ model. Our method can +effectively reduce the gap between image features and text features caused by +low-bit quantization, based on learnable prompts to reorganize textual +representations and a low-bit adapter to realign the distributions of image and +text features. We also introduce a distillation loss based on cosine similarity +predictions to distill the quantized model using a full-precision teacher. +Extensive experimental results demonstrate that our P4Q method outperforms +prior arts, even achieving comparable results to its full-precision +counterparts. For instance, our 8-bit P4Q can theoretically compress the +CLIP-ViT/B-32 by 4 $\times$ while achieving 66.94\% Top-1 accuracy, +outperforming the learnable prompt fine-tuned full-precision model by 2.24\% +with negligible additional parameters on the ImageNet dataset. + +
+
+
+
+
+ + ☆ Hand-object reconstruction via interaction-aware graph attention + mechanism ICIP 2024 + + +
+ Estimating the poses of both a hand and an object has become an important +area of research due to the growing need for advanced vision computing. The +primary challenge involves understanding and reconstructing how hands and +objects interact, such as contact and physical plausibility. Existing +approaches often adopt a graph neural network to incorporate spatial +information of hand and object meshes. However, these approaches have not fully +exploited the potential of graphs without modification of edges within and +between hand- and object-graphs. We propose a graph-based refinement method +that incorporates an interaction-aware graph-attention mechanism to account for +hand-object interactions. Using edges, we establish connections among closely +correlated nodes, both within individual graphs and across different graphs. +Experiments demonstrate the effectiveness of our proposed method with notable +improvements in the realm of physical plausibility. + +
+
+ comment: 7 pages, Accepted by ICIP 2024 +
+
+
+
+
+ + ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic dataset that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. + +
+
+
+
+
+ + ☆ ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context + Information in Multi-Turn Multimodal Medical Dialogue + + +
+ The rocketing prosperity of large language models (LLMs) in recent years has +boosted the prevalence of vision-language models (VLMs) in the medical sector. +In our online medical consultation scenario, a doctor responds to the texts and +images provided by a patient in multiple rounds to diagnose her/his health +condition, forming a multi-turn multimodal medical dialogue format. Unlike +high-quality images captured by professional equipment in traditional medical +visual question answering (Med-VQA), the images in our case are taken by +patients' mobile phones. These images have poor quality control, with issues +such as excessive background elements and the lesion area being significantly +off-center, leading to degradation of vision-language alignment in the model +training phase. In this paper, we propose ZALM3, a Zero-shot strategy to +improve vision-language ALignment in Multi-turn Multimodal Medical dialogue. +Since we observe that the preceding text conversations before an image can +infer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to +summarize the keywords from the preceding context and a visual grounding model +to extract the RoIs. The updated images eliminate unnecessary background noise +and provide more effective vision-language alignment. To better evaluate our +proposed method, we design a new subjective assessment metric for multi-turn +unimodal/multimodal medical dialogue to provide a fine-grained performance +comparison. Our experiments across three different clinical departments +remarkably demonstrate the efficacy of ZALM3 with statistical significance. + +
+
+
+
+
+ + ☆ Appearance Blur-driven AutoEncoder and Motion-guided Memory Module for + Video Anomaly Detection + + +
+ Video anomaly detection (VAD) often learns the distribution of normal samples +and detects the anomaly through measuring significant deviations, but the +undesired generalization may reconstruct a few anomalies thus suppressing the +deviations. Meanwhile, most VADs cannot cope with cross-dataset validation for +new target domains, and few-shot methods must laboriously rely on model-tuning +from the target domain to complete domain adaptation. To address these +problems, we propose a novel VAD method with a motion-guided memory module to +achieve cross-dataset validation with zero-shot. First, we add Gaussian blur to +the raw appearance images, thereby constructing the global pseudo-anomaly, +which serves as the input to the network. Then, we propose multi-scale residual +channel attention to deblur the pseudo-anomaly in normal samples. Next, memory +items are obtained by recording the motion features in the training phase, +which are used to retrieve the motion features from the raw information in the +testing phase. Lastly, our method can ignore the blurred real anomaly through +attention and rely on motion memory items to increase the normality gap between +normal and abnormal motion. Extensive experiments on three benchmark datasets +demonstrate the effectiveness of the proposed method. Compared with +cross-domain methods, our method achieves competitive performance without +adaptation during testing. + +
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+ + ☆ Good Data Is All Imitation Learning Needs + + +
+ In this paper, we address the limitations of traditional teacher-student +models, imitation learning, and behaviour cloning in the context of +Autonomous/Automated Driving Systems (ADS), where these methods often struggle +with incomplete coverage of real-world scenarios. To enhance the robustness of +such models, we introduce the use of Counterfactual Explanations (CFEs) as a +novel data augmentation technique for end-to-end ADS. CFEs, by generating +training samples near decision boundaries through minimal input modifications, +lead to a more comprehensive representation of expert driver strategies, +particularly in safety-critical scenarios. This approach can therefore help +improve the model's ability to handle rare and challenging driving events, such +as anticipating darting out pedestrians, ultimately leading to safer and more +trustworthy decision-making for ADS. Our experiments in the CARLA simulator +demonstrate that CF-Driver outperforms the current state-of-the-art method, +achieving a higher driving score and lower infraction rates. Specifically, +CF-Driver attains a driving score of 84.2, surpassing the previous best model +by 15.02 percentage points. These results highlight the effectiveness of +incorporating CFEs in training end-to-end ADS. To foster further research, the +CF-Driver code is made publicly available. + +
+
+
+
+
+ + ☆ TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for + Multimodal Contrastive Learning + + +
+ Pre-trained large models for multimodal contrastive learning, such as CLIP, +have been widely recognized in the industry as highly susceptible to +data-poisoned backdoor attacks. This poses significant risks to downstream +model training. In response to such potential threats, finetuning offers a +simpler and more efficient defense choice compared to retraining large models +with augmented data. In the supervised learning domain, fine-tuning defense +strategies can achieve excellent defense performance. However, in the +unsupervised and semi-supervised domain, we find that when CLIP faces some +complex attack techniques, the existing fine-tuning defense strategy, +CleanCLIP, has some limitations on defense performance. The synonym +substitution of its text-augmentation is insufficient to enhance the text +feature space. To compensate for this weakness, we improve it by proposing a +fine-grained \textbf{T}ext \textbf{A}lignment \textbf{C}leaner (TA-Cleaner) to +cut off feature connections of backdoor triggers. We randomly select a few +samples for positive and negative subtext generation at each epoch of +CleanCLIP, and align the subtexts to the images to strengthen the text +self-supervision. We evaluate the effectiveness of our TA-Cleaner against six +attack algorithms and conduct comprehensive zero-shot classification tests on +ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves +state-of-the-art defensiveness among finetuning-based defense techniques. Even +when faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms +CleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\% and 63.88\%, +respectively. + +
+
+
+
+
+ + ☆ Unifying Dimensions: A Linear Adaptive Approach to Lightweight Image + Super-Resolution + + +
+ Window-based transformers have demonstrated outstanding performance in +super-resolution tasks due to their adaptive modeling capabilities through +local self-attention (SA). However, they exhibit higher computational +complexity and inference latency than convolutional neural networks. In this +paper, we first identify that the adaptability of the Transformers is derived +from their adaptive spatial aggregation and advanced structural design, while +their high latency results from the computational costs and memory layout +transformations associated with the local SA. To simulate this aggregation +approach, we propose an effective convolution-based linear focal separable +attention (FSA), allowing for long-range dynamic modeling with linear +complexity. Additionally, we introduce an effective dual-branch structure +combined with an ultra-lightweight information exchange module (IEM) to enhance +the aggregation of information by the Token Mixer. Finally, with respect to the +structure, we modify the existing spatial-gate-based feedforward neural +networks by incorporating a self-gate mechanism to preserve high-dimensional +channel information, enabling the modeling of more complex relationships. With +these advancements, we construct a convolution-based Transformer framework +named the linear adaptive mixer network (LAMNet). Extensive experiments +demonstrate that LAMNet achieves better performance than existing SA-based +Transformer methods while maintaining the computational efficiency of +convolutional neural networks, which can achieve a \(3\times\) speedup of +inference time. The code will be publicly available at: +https://github.com/zononhzy/LAMNet. + +
+
+
+
+
+ + ☆ Improving Fast Adversarial Training via Self-Knowledge Guidance + + +
+ Adversarial training has achieved remarkable advancements in defending +against adversarial attacks. Among them, fast adversarial training (FAT) is +gaining attention for its ability to achieve competitive robustness with fewer +computing resources. Existing FAT methods typically employ a uniform strategy +that optimizes all training data equally without considering the influence of +different examples, which leads to an imbalanced optimization. However, this +imbalance remains unexplored in the field of FAT. In this paper, we conduct a +comprehensive study of the imbalance issue in FAT and observe an obvious class +disparity regarding their performances. This disparity could be embodied from a +perspective of alignment between clean and robust accuracy. Based on the +analysis, we mainly attribute the observed misalignment and disparity to the +imbalanced optimization in FAT, which motivates us to optimize different +training data adaptively to enhance robustness. Specifically, we take disparity +and misalignment into consideration. First, we introduce self-knowledge guided +regularization, which assigns differentiated regularization weights to each +class based on its training state, alleviating class disparity. Additionally, +we propose self-knowledge guided label relaxation, which adjusts label +relaxation according to the training accuracy, alleviating the misalignment and +improving robustness. By combining these methods, we formulate the +Self-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge +during training to enhance the adversarial robustness without compromising +training efficiency. Extensive experiments on four standard datasets +demonstrate that the SKG-FAT improves the robustness and preserves competitive +clean accuracy, outperforming the state-of-the-art methods. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ ID$^3$: Identity-Preserving-yet-Diversified Diffusion Models for + Synthetic Face Recognition NeurIPS 2024 + + +
+ Synthetic face recognition (SFR) aims to generate synthetic face datasets +that mimic the distribution of real face data, which allows for training face +recognition models in a privacy-preserving manner. Despite the remarkable +potential of diffusion models in image generation, current diffusion-based SFR +models struggle with generalization to real-world faces. To address this +limitation, we outline three key objectives for SFR: (1) promoting diversity +across identities (inter-class diversity), (2) ensuring diversity within each +identity by injecting various facial attributes (intra-class diversity), and +(3) maintaining identity consistency within each identity group (intra-class +identity preservation). Inspired by these goals, we introduce a +diffusion-fueled SFR model termed $\text{ID}^3$. $\text{ID}^3$ employs an +ID-preserving loss to generate diverse yet identity-consistent facial +appearances. Theoretically, we show that minimizing this loss is equivalent to +maximizing the lower bound of an adjusted conditional log-likelihood over +ID-preserving data. This equivalence motivates an ID-preserving sampling +algorithm, which operates over an adjusted gradient vector field, enabling the +generation of fake face recognition datasets that approximate the distribution +of real-world faces. Extensive experiments across five challenging benchmarks +validate the advantages of $\text{ID}^3$. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Flexiffusion: Segment-wise Neural Architecture Search for Flexible + Denoising Schedule + + +
+ Diffusion models are cutting-edge generative models adept at producing +diverse, high-quality images. Despite their effectiveness, these models often +require significant computational resources owing to their numerous sequential +denoising steps and the significant inference cost of each step. Recently, +Neural Architecture Search (NAS) techniques have been employed to automatically +search for faster generation processes. However, NAS for diffusion is +inherently time-consuming as it requires estimating thousands of diffusion +models to search for the optimal one. In this paper, we introduce Flexiffusion, +a novel training-free NAS paradigm designed to accelerate diffusion models by +concurrently optimizing generation steps and network structures. Specifically, +we partition the generation process into isometric step segments, each +sequentially composed of a full step, multiple partial steps, and several null +steps. The full step computes all network blocks, while the partial step +involves part of the blocks, and the null step entails no computation. +Flexiffusion autonomously explores flexible step combinations for each segment, +substantially reducing search costs and enabling greater acceleration compared +to the state-of-the-art (SOTA) method for diffusion models. Our searched models +reported speedup factors of $2.6\times$ and $1.5\times$ for the original +LDM-4-G and the SOTA, respectively. The factors for Stable Diffusion V1.5 and +the SOTA are $5.1\times$ and $2.0\times$. We also verified the performance of +Flexiffusion on multiple datasets, and positive experiment results indicate +that Flexiffusion can effectively reduce redundancy in diffusion models. + +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ General Compression Framework for Efficient Transformer Object Tracking + + +
+ Transformer-based trackers have established a dominant role in the field of +visual object tracking. While these trackers exhibit promising performance, +their deployment on resource-constrained devices remains challenging due to +inefficiencies. To improve the inference efficiency and reduce the computation +cost, prior approaches have aimed to either design lightweight trackers or +distill knowledge from larger teacher models into more compact student +trackers. However, these solutions often sacrifice accuracy for speed. Thus, we +propose a general model compression framework for efficient transformer object +tracking, named CompressTracker, to reduce the size of a pre-trained tracking +model into a lightweight tracker with minimal performance degradation. Our +approach features a novel stage division strategy that segments the transformer +layers of the teacher model into distinct stages, enabling the student model to +emulate each corresponding teacher stage more effectively. Additionally, we +also design a unique replacement training technique that involves randomly +substituting specific stages in the student model with those from the teacher +model, as opposed to training the student model in isolation. Replacement +training enhances the student model's ability to replicate the teacher model's +behavior. To further forcing student model to emulate teacher model, we +incorporate prediction guidance and stage-wise feature mimicking to provide +additional supervision during the teacher model's compression process. Our +framework CompressTracker is structurally agnostic, making it compatible with +any transformer architecture. We conduct a series of experiment to verify the +effectiveness and generalizability of CompressTracker. Our CompressTracker-4 +with 4 transformer layers, which is compressed from OSTrack, retains about 96% +performance on LaSOT (66.1% AUC) while achieves 2.17x speed up. + +
+
+
+
+
+ + ☆ Dynamic Subframe Splitting and Spatio-Temporal Motion Entangled Sparse + Attention for RGB-E Tracking + + +
+ Event-based bionic camera asynchronously captures dynamic scenes with high +temporal resolution and high dynamic range, offering potential for the +integration of events and RGB under conditions of illumination degradation and +fast motion. Existing RGB-E tracking methods model event characteristics +utilising attention mechanism of Transformer before integrating both +modalities. Nevertheless, these methods involve aggregating the event stream +into a single event frame, lacking the utilisation of the temporal information +inherent in the event stream.Moreover, the traditional attention mechanism is +well-suited for dense semantic features, while the attention mechanism for +sparse event features require revolution. In this paper, we propose a dynamic +event subframe splitting strategy to split the event stream into more +fine-grained event clusters, aiming to capture spatio-temporal features that +contain motion cues. Based on this, we design an event-based sparse attention +mechanism to enhance the interaction of event features in temporal and spatial +dimensions. The experimental results indicate that our method outperforms +existing state-of-the-art methods on the FE240 and COESOT datasets, providing +an effective processing manner for the event data. + +
+
+ comment: 15 pages, 8 figures, conference +
+
+
+
+
+ + ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new +variations of data appearance (domains) and open-set conditions, where both +known and novel categories are present at test time. The challenges of this +task arise from the dual need to generalize across diverse domains and +accurately quantify category novelty, which is critical for applications in +dynamic environments. Recently, meta-learning techniques have demonstrated +superior results in OSDG, effectively orchestrating the meta-train and -test +tasks by employing varied random categories and predefined domain partition +strategies. These approaches prioritize a well-designed training schedule over +traditional methods that focus primarily on data augmentation and the +enhancement of discriminative feature learning. The prevailing meta-learning +models in OSDG typically utilize a predefined sequential domain scheduler to +structure data partitions. However, a crucial aspect that remains inadequately +explored is the influence brought by strategies of domain schedulers during +training. In this paper, we observe that an adaptive domain scheduler benefits +more in OSDG compared with prefixed sequential and random domain schedulers. We +propose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve +an adaptive domain scheduler. This method strategically sequences domains by +assessing their reliabilities in utilizing a follower network, trained with +confidence scores learned in an evidential manner, regularized by max rebiasing +discrepancy, and optimized in a bi-level manner. The results show that our +method substantially improves OSDG performance and achieves more discriminative +embeddings for both the seen and unseen categories. The source code will be +available at https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code will be available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ☆ Triple Point Masking + + +
+ Existing 3D mask learning methods encounter performance bottlenecks under +limited data, and our objective is to overcome this limitation. In this paper, +we introduce a triple point masking scheme, named TPM, which serves as a +scalable framework for pre-training of masked autoencoders to achieve +multi-mask learning for 3D point clouds. Specifically, we augment the baselines +with two additional mask choices (i.e., medium mask and low mask) as our core +insight is that the recovery process of an object can manifest in diverse ways. +Previous high-masking schemes focus on capturing the global representation but +lack the fine-grained recovery capability, so that the generated pre-trained +weights tend to play a limited role in the fine-tuning process. With the +support of the proposed TPM, available methods can exhibit more flexible and +accurate completion capabilities, enabling the potential autoencoder in the +pre-training stage to consider multiple representations of a single 3D object. +In addition, an SVM-guided weight selection module is proposed to fill the +encoder parameters for downstream networks with the optimal weight during the +fine-tuning stage, maximizing linear accuracy and facilitating the acquisition +of intricate representations for new objects. Extensive experiments show that +the four baselines equipped with the proposed TPM achieve comprehensive +performance improvements on various downstream tasks. + +
+
+
+
+
+ + ☆ CAMOT: Camera Angle-aware Multi-Object Tracking + + +
+ This paper proposes CAMOT, a simple camera angle estimator for multi-object +tracking to tackle two problems: 1) occlusion and 2) inaccurate distance +estimation in the depth direction. Under the assumption that multiple objects +are located on a flat plane in each video frame, CAMOT estimates the camera +angle using object detection. In addition, it gives the depth of each object, +enabling pseudo-3D MOT. We evaluated its performance by adding it to various 2D +MOT methods on the MOT17 and MOT20 datasets and confirmed its effectiveness. +Applying CAMOT to ByteTrack, we obtained 63.8% HOTA, 80.6% MOTA, and 78.5% IDF1 +in MOT17, which are state-of-the-art results. Its computational cost is +significantly lower than the existing deep-learning-based depth estimators for +tracking. + +
+
+
+
+
+ + ☆ SimVG: A Simple Framework for Visual Grounding with Decoupled + Multi-modal Fusion NeurIPS2024 + + +
+ Visual grounding is a common vision task that involves grounding descriptive +sentences to the corresponding regions of an image. Most existing methods use +independent image-text encoding and apply complex hand-crafted modules or +encoder-decoder architectures for modal interaction and query reasoning. +However, their performance significantly drops when dealing with complex +textual expressions. This is because the former paradigm only utilizes limited +downstream data to fit the multi-modal feature fusion. Therefore, it is only +effective when the textual expressions are relatively simple. In contrast, +given the wide diversity of textual expressions and the uniqueness of +downstream training data, the existing fusion module, which extracts multimodal +content from a visual-linguistic context, has not been fully investigated. In +this paper, we present a simple yet robust transformer-based framework, SimVG, +for visual grounding. Specifically, we decouple visual-linguistic feature +fusion from downstream tasks by leveraging existing multimodal pre-trained +models and incorporating additional object tokens to facilitate deep +integration of downstream and pre-training tasks. Furthermore, we design a +dynamic weight-balance distillation method in the multi-branch synchronous +learning process to enhance the representation capability of the simpler +branch. This branch only consists of a lightweight MLP, which simplifies the +structure and improves reasoning speed. Experiments on six widely used VG +datasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the +superiority of SimVG. Finally, the proposed method not only achieves +improvements in efficiency and convergence speed but also attains new +state-of-the-art performance on these benchmarks. Codes and models will be +available at \url{https://github.com/Dmmm1997/SimVG}. + +
+
+ comment: 21pages, 11figures, NeurIPS2024 +
+
+
+
+
+ + ☆ Drone Stereo Vision for Radiata Pine Branch Detection and Distance + Measurement: Integrating SGBM and Segmentation Models + + +
+ Manual pruning of radiata pine trees presents significant safety risks due to +their substantial height and the challenging terrains in which they thrive. To +address these risks, this research proposes the development of a drone-based +pruning system equipped with specialized pruning tools and a stereo vision +camera, enabling precise detection and trimming of branches. Deep learning +algorithms, including YOLO and Mask R-CNN, are employed to ensure accurate +branch detection, while the Semi-Global Matching algorithm is integrated to +provide reliable distance estimation. The synergy between these techniques +facilitates the precise identification of branch locations and enables +efficient, targeted pruning. Experimental results demonstrate that the combined +implementation of YOLO and SGBM enables the drone to accurately detect branches +and measure their distances from the drone. This research not only improves the +safety and efficiency of pruning operations but also makes a significant +contribution to the advancement of drone technology in the automation of +agricultural and forestry practices, laying a foundational framework for +further innovations in environmental management. + +
+
+
+
+
+ + ☆ JoyType: A Robust Design for Multilingual Visual Text Creation AAAI 2025 + + +
+ Generating images with accurately represented text, especially in non-Latin +languages, poses a significant challenge for diffusion models. Existing +approaches, such as the integration of hint condition diagrams via auxiliary +networks (e.g., ControlNet), have made strides towards addressing this issue. +However, diffusion models often fall short in tasks requiring controlled text +generation, such as specifying particular fonts or producing text in small +fonts. In this paper, we introduce a novel approach for multilingual visual +text creation, named JoyType, designed to maintain the font style of text +during the image generation process. Our methodology begins with assembling a +training dataset, JoyType-1M, comprising 1 million pairs of data. Each pair +includes an image, its description, and glyph instructions corresponding to the +font style within the image. We then developed a text control network, Font +ControlNet, tasked with extracting font style information to steer the image +generation. To further enhance our model's ability to maintain font style, +notably in generating small-font text, we incorporated a multi-layer OCR-aware +loss into the diffusion process. This enhancement allows JoyType to direct text +rendering using low-level descriptors. Our evaluations, based on both visual +and accuracy metrics, demonstrate that JoyType significantly outperforms +existing state-of-the-art methods. Additionally, JoyType can function as a +plugin, facilitating the creation of varied image styles in conjunction with +other stable diffusion models on HuggingFace and CivitAI. Our project is +open-sourced on https://jdh-algo.github.io/JoyType/. + +
+
+ comment: Under Review at AAAI 2025 +
+
+
+
+
+ + ☆ EAGLE: Egocentric AGgregated Language-video Engine + + +
+ The rapid evolution of egocentric video analysis brings new insights into +understanding human activities and intentions from a first-person perspective. +Despite this progress, the fragmentation in tasks like action recognition, +procedure learning, and moment retrieval, \etc, coupled with inconsistent +annotations and isolated model development, hinders a holistic interpretation +of video content. In response, we introduce the EAGLE (Egocentric AGgregated +Language-video Engine) model and the EAGLE-400K dataset to provide a unified +framework that integrates various egocentric video understanding tasks. +EAGLE-400K, the \textit{first} large-scale instruction-tuning dataset tailored +for egocentric video, features 400K diverse samples to enhance a broad spectrum +of tasks from activity recognition to procedure knowledge learning. Moreover, +EAGLE, a strong video multimodal large language model (MLLM), is designed to +effectively capture both spatial and temporal information. In addition, we +propose a set of evaluation metrics designed to facilitate a thorough +assessment of MLLM for egocentric video understanding. Our extensive +experiments demonstrate EAGLE's superior performance over existing models, +highlighting its ability to balance task-specific understanding with holistic +video interpretation. With EAGLE, we aim to pave the way for research +opportunities and practical applications in real-world scenarios. + +
+
+ comment: Accepted by ACMMM 24 +
+
+
+
+
+ + ☆ Robotic Environmental State Recognition with Pre-Trained Vision-Language + Models and Black-Box Optimization + + +
+ In order for robots to autonomously navigate and operate in diverse +environments, it is essential for them to recognize the state of their +environment. On the other hand, the environmental state recognition has +traditionally involved distinct methods tailored to each state to be +recognized. In this study, we perform a unified environmental state recognition +for robots through the spoken language with pre-trained large-scale +vision-language models. We apply Visual Question Answering and Image-to-Text +Retrieval, which are tasks of Vision-Language Models. We show that with our +method, it is possible to recognize not only whether a room door is +open/closed, but also whether a transparent door is open/closed and whether +water is running in a sink, without training neural networks or manual +programming. In addition, the recognition accuracy can be improved by selecting +appropriate texts from the set of prepared texts based on black-box +optimization. For each state recognition, only the text set and its weighting +need to be changed, eliminating the need to prepare multiple different models +and programs, and facilitating the management of source code and computer +resource. We experimentally demonstrate the effectiveness of our method and +apply it to the recognition behavior on a mobile robot, Fetch. + +
+
+ comment: Accepted at Advanced Robotics, website - + https://haraduka.github.io/vlm-bbo/ +
+
+
+
+
+ + ☆ SCOMatch: Alleviating Overtrusting in Open-set Semi-supervised Learning ECCV 2024 + + +
+ Open-set semi-supervised learning (OSSL) leverages practical open-set +unlabeled data, comprising both in-distribution (ID) samples from seen classes +and out-of-distribution (OOD) samples from unseen classes, for semi-supervised +learning (SSL). Prior OSSL methods initially learned the decision boundary +between ID and OOD with labeled ID data, subsequently employing self-training +to refine this boundary. These methods, however, suffer from the tendency to +overtrust the labeled ID data: the scarcity of labeled data caused the +distribution bias between the labeled samples and the entire ID data, which +misleads the decision boundary to overfit. The subsequent self-training +process, based on the overfitted result, fails to rectify this problem. In this +paper, we address the overtrusting issue by treating OOD samples as an +additional class, forming a new SSL process. + Specifically, we propose SCOMatch, a novel OSSL method that 1) selects +reliable OOD samples as new labeled data with an OOD memory queue and a +corresponding update strategy and 2) integrates the new SSL process into the +original task through our Simultaneous Close-set and Open-set self-training. +SCOMatch refines the decision boundary of ID and OOD classes across the entire +dataset, thereby leading to improved results. Extensive experimental results +show that SCOMatch significantly outperforms the state-of-the-art methods on +various benchmarks. The effectiveness is further verified through ablation +studies and visualization. + +
+
+ comment: ECCV 2024 accepted +
+
+
+
+
+ + ☆ NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human + Connectomes NeurIPS 2024 + + +
+ Although modern imaging technologies allow us to study connectivity between +two distinct brain regions in-vivo, an in-depth understanding of how anatomical +structure supports brain function and how spontaneous functional fluctuations +emerge remarkable cognition is still elusive. Meanwhile, tremendous efforts +have been made in the realm of machine learning to establish the nonlinear +mapping between neuroimaging data and phenotypic traits. However, the absence +of neuroscience insight in the current approaches poses significant challenges +in understanding cognitive behavior from transient neural activities. To +address this challenge, we put the spotlight on the coupling mechanism of +structural connectivity (SC) and functional connectivity (FC) by formulating +such network neuroscience question into an expressive graph representation +learning problem for high-order topology. Specifically, we introduce the +concept of topological detour to characterize how a ubiquitous instance of FC +(direct link) is supported by neural pathways (detour) physically wired by SC, +which forms a cyclic loop interacted by brain structure and function. In the +clich\'e of machine learning, the multi-hop detour pathway underlying SC-FC +coupling allows us to devise a novel multi-head self-attention mechanism within +Transformer to capture multi-modal feature representation from paired graphs of +SC and FC. Taken together, we propose a biological-inspired deep model, coined +as NeuroPath, to find putative connectomic feature representations from the +unprecedented amount of neuroimages, which can be plugged into various +downstream applications such as task recognition and disease diagnosis. We have +evaluated NeuroPath on large-scale public datasets including HCP and UK Biobank +under supervised and zero-shot learning, where the state-of-the-art performance +by our NeuroPath indicates great potential in network neuroscience. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization, recent advances primarily focus on improving the LLM +components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector. Extensive ablation experiments +validate the effectiveness of introducing CMoE under any configuration, with up +to an average 8% performance gains. We further provide interpretation analysis +of the tug-of-war problem from the perspective of gradient optimization and +parameter statistics. Compared to previous state-of-the-art medical MLLMs, +Uni-Med achieves competitive or superior evaluation metrics on diverse tasks. +Code, data and model will be soon available at GitHub. + +
+
+
+
+
+ + ☆ Shape-intensity knowledge distillation for robust medical image + segmentation + + +
+ Many medical image segmentation methods have achieved impressive results. +Yet, most existing methods do not take into account the shape-intensity prior +information. This may lead to implausible segmentation results, in particular +for images of unseen datasets. In this paper, we propose a novel approach to +incorporate joint shape-intensity prior information into the segmentation +network. Specifically, we first train a segmentation network (regarded as the +teacher network) on class-wise averaged training images to extract valuable +shape-intensity information, which is then transferred to a student +segmentation network with the same network architecture as the teacher via +knowledge distillation. In this way, the student network regarded as the final +segmentation model can effectively integrate the shape-intensity prior +information, yielding more accurate segmentation results. Despite its +simplicity, experiments on five medical image segmentation tasks of different +modalities demonstrate that the proposed Shape-Intensity Knowledge Distillation +(SIKD) consistently improves several baseline models (including recent MaxStyle +and SAMed) under intra-dataset evaluation, and significantly improves the +cross-dataset generalization ability. The code is available at +https://github.com/whdong-whu/SIKD. + +
+
+
+
+
+ + ☆ Learning Quantized Adaptive Conditions for Diffusion Models + + +
+ The curvature of ODE trajectories in diffusion models hinders their ability +to generate high-quality images in a few number of function evaluations (NFE). +In this paper, we propose a novel and effective approach to reduce trajectory +curvature by utilizing adaptive conditions. By employing a extremely +light-weight quantized encoder, our method incurs only an additional 1% of +training parameters, eliminates the need for extra regularization terms, yet +achieves significantly better sample quality. Our approach accelerates ODE +sampling while preserving the downstream task image editing capabilities of SDE +techniques. Extensive experiments verify that our method can generate high +quality results under extremely limited sampling costs. With only 6 NFE, we +achieve 5.14 FID on CIFAR-10, 6.91 FID on FFHQ 64x64 and 3.10 FID on AFHQv2. + +
+
+
+
+
+ + ☆ Global-Local Medical SAM Adaptor Based on Full Adaption + + +
+ Emerging of visual language models, such as the segment anything model (SAM), +have made great breakthroughs in the field of universal semantic segmentation +and significantly aid the improvements of medical image segmentation, in +particular with the help of Medical SAM adaptor (Med-SA). However, Med-SA still +can be improved, as it fine-tunes SAM in a partial adaption manner. To resolve +this problem, we present a novel global medical SAM adaptor (GMed-SA) with full +adaption, which can adapt SAM globally. We further combine GMed-SA and Med-SA +to propose a global-local medical SAM adaptor (GLMed-SA) to adapt SAM both +globally and locally. Extensive experiments have been performed on the +challenging public 2D melanoma segmentation dataset. The results show that +GLMed-SA outperforms several state-of-the-art semantic segmentation methods on +various evaluation metrics, demonstrating the superiority of our methods. + +
+
+
+
+
+ + ☆ Revisiting Deep Ensemble Uncertainty for Enhanced Medical Anomaly + Detection MICCAI2024 + + +
+ Medical anomaly detection (AD) is crucial in pathological identification and +localization. Current methods typically rely on uncertainty estimation in deep +ensembles to detect anomalies, assuming that ensemble learners should agree on +normal samples while exhibiting disagreement on unseen anomalies in the output +space. However, these methods may suffer from inadequate disagreement on +anomalies or diminished agreement on normal samples. To tackle these issues, we +propose D2UE, a Diversified Dual-space Uncertainty Estimation framework for +medical anomaly detection. To effectively balance agreement and disagreement +for anomaly detection, we propose Redundancy-Aware Repulsion (RAR), which uses +a similarity kernel that remains invariant to both isotropic scaling and +orthogonal transformations, explicitly promoting diversity in learners' feature +space. Moreover, to accentuate anomalous regions, we develop Dual-Space +Uncertainty (DSU), which utilizes the ensemble's uncertainty in input and +output spaces. In input space, we first calculate gradients of reconstruction +error with respect to input images. The gradients are then integrated with +reconstruction outputs to estimate uncertainty for inputs, enabling effective +anomaly discrimination even when output space disagreement is minimal. We +conduct a comprehensive evaluation of five medical benchmarks with different +backbones. Experimental results demonstrate the superiority of our method to +state-of-the-art methods and the effectiveness of each component in our +framework. Our code is available at https://github.com/Rubiscol/D2UE. + +
+
+ comment: Early accepted by MICCAI2024 +
+
+
+
+
+ + ♻ ☆ Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with + Enhanced Generalization and Personalization Abilities WACV 2025 + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant +potential for modeling 3D head avatars, providing greater flexibility than +mesh-based methods and more efficient rendering compared to NeRF-based +approaches. Despite these advancements, the creation of controllable 3DGS-based +head avatars remains time-intensive, often requiring tens of minutes to hours. +To expedite this process, we here introduce the ``Gaussian D\'ej\`a-vu" +framework, which first obtains a generalized model of the head avatar and then +personalizes the result. The generalized model is trained on large 2D +(synthetic and real) image datasets. This model provides a well-initialized 3D +Gaussian head that is further refined using a monocular video to achieve the +personalized head avatar. For personalizing, we propose learnable +expression-aware rectification blendmaps to correct the initial 3D Gaussians, +ensuring rapid convergence without the reliance on neural networks. Experiments +demonstrate that the proposed method meets its objectives. It outperforms +state-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as +well as reduces training time consumption to at least a quarter of the existing +methods, producing the avatar in minutes. + +
+
+ comment: 11 pages, Accepted by WACV 2025 in Round 1 +
+
+
+
+
+ + ♻ ☆ Chat-Scene: Bridging 3D Scene and Large Language Models with Object + Identifiers + + +
+ Recent advancements in 3D Large Language Models (LLMs) have demonstrated +promising capabilities for 3D scene understanding. However, previous methods +exhibit deficiencies in general referencing and grounding capabilities for +intricate scene comprehension. In this paper, we introduce the use of object +identifiers and object-centric representations to interact with scenes at the +object level. Specifically, we decompose the input 3D scene into a set of +object proposals, each assigned a unique identifier token, which enables +efficient object referencing and grounding during user-assistant interactions. +Given the scarcity of scene-language data, we model the scene embeddings as a +sequence of explicit object-level embeddings, derived from semantic-rich 2D or +3D representations. By employing object identifiers, we transform diverse 3D +scene-language tasks into a unified question-answering format, facilitating +joint training without the need for additional task-specific heads. With +minimal fine-tuning on all downstream tasks, our model significantly +outperforms existing methods on benchmarks including ScanRefer, Multi3DRefer, +Scan2Cap, ScanQA, and SQA3D. + +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Synthesizing Environment-Specific People in Photographs ECCV 2024 + + +
+ We present ESP, a novel method for context-aware full-body generation, that +enables photo-realistic synthesis and inpainting of people wearing clothing +that is semantically appropriate for the scene depicted in an input photograph. +ESP is conditioned on a 2D pose and contextual cues that are extracted from the +photograph of the scene and integrated into the generation process, where the +clothing is modeled explicitly with human parsing masks (HPM). Generated HPMs +are used as tight guiding masks for inpainting, such that no changes are made +to the original background. Our models are trained on a dataset containing a +set of in-the-wild photographs of people covering a wide range of different +environments. The method is analyzed quantitatively and qualitatively, and we +show that ESP outperforms the state-of-the-art on the task of contextual +full-body generation. + +
+
+ comment: Accepted at ECCV 2024, Project: https://esp.is.tue.mpg.de +
+
+
+
+
+ + ♻ ☆ Valeo4Cast: A Modular Approach to End-to-End Forecasting ECCV + + +
+ Motion forecasting is crucial in autonomous driving systems to anticipate the +future trajectories of surrounding agents such as pedestrians, vehicles, and +traffic signals. In end-to-end forecasting, the model must jointly detect and +track from sensor data (cameras or LiDARs) the past trajectories of the +different elements of the scene and predict their future locations. We depart +from the current trend of tackling this task via end-to-end training from +perception to forecasting, and instead use a modular approach. We individually +build and train detection, tracking and forecasting modules. We then only use +consecutive finetuning steps to integrate the modules better and alleviate +compounding errors. We conduct an in-depth study on the finetuning strategies +and it reveals that our simple yet effective approach significantly improves +performance on the end-to-end forecasting benchmark. Consequently, our solution +ranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82 +mAPf. We surpass forecasting results by +17.1 points over last year's winner +and by +13.3 points over this year's runner-up. This remarkable performance in +forecasting can be explained by our modular paradigm, which integrates +finetuning strategies and significantly outperforms the end-to-end-trained +counterparts. The code, model weights and results are made available +https://github.com/valeoai/valeo4cast. + +
+
+ comment: Winning solution of the Argoverse 2 "Unified Detection, Tracking, and + Forecasting" challenge; work accepted at Road++ ECCVW 2024 +
+
+
+
+
+ + ♻ ☆ Disentangled Clothed Avatar Generation from Text Descriptions + + +
+ In this paper, we introduce a novel text-to-avatar generation method that +separately generates the human body and the clothes and allows high-quality +animation on the generated avatar. While recent advancements in text-to-avatar +generation have yielded diverse human avatars from text prompts, these methods +typically combine all elements-clothes, hair, and body-into a single 3D +representation. Such an entangled approach poses challenges for downstream +tasks like editing or animation. To overcome these limitations, we propose a +novel disentangled 3D avatar representation named Sequentially Offset-SMPL +(SO-SMPL), building upon the SMPL model. SO-SMPL represents the human body and +clothes with two separate meshes but associates them with offsets to ensure the +physical alignment between the body and the clothes. Then, we design a Score +Distillation Sampling (SDS)-based distillation framework to generate the +proposed SO-SMPL representation from text prompts. Our approach not only +achieves higher texture and geometry quality and better semantic alignment with +text prompts, but also significantly improves the visual quality of character +animation, virtual try-on, and avatar editing. Project page: +https://shanemankiw.github.io/SO-SMPL/. + +
+
+ comment: Project page: https://shanemankiw.github.io/SO-SMPL/ +
+
+
+
+
+ + ♻ ☆ Jumping through Local Minima: Quantization in the Loss Landscape of + Vision Transformers + + +
+ Quantization scale and bit-width are the most important parameters when +considering how to quantize a neural network. Prior work focuses on optimizing +quantization scales in a global manner through gradient methods (gradient +descent \& Hessian analysis). Yet, when applying perturbations to quantization +scales, we observe a very jagged, highly non-smooth test loss landscape. In +fact, small perturbations in quantization scale can greatly affect accuracy, +yielding a $0.5-0.8\%$ accuracy boost in 4-bit quantized vision transformers +(ViTs). In this regime, gradient methods break down, since they cannot reliably +reach local minima. In our work, dubbed Evol-Q, we use evolutionary search to +effectively traverse the non-smooth landscape. Additionally, we propose using +an infoNCE loss, which not only helps combat overfitting on the small +calibration dataset ($1,000$ images) but also makes traversing such a highly +non-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully +quantized ViT-Base by $10.30\%$, $0.78\%$, and $0.15\%$ for $3$-bit, $4$-bit, +and $8$-bit weight quantization levels. Extensive experiments on a variety of +CNN and ViT architectures further demonstrate its robustness in extreme +quantization scenarios. Our code is available at +https://github.com/enyac-group/evol-q + +
+
+ comment: arXiv admin note: text overlap with arXiv:2211.09643 +
+
+
+
+
+ + ♻ ☆ LingoQA: Visual Question Answering for Autonomous Driving ECCV 2024 + + +
+ We introduce LingoQA, a novel dataset and benchmark for visual question +answering in autonomous driving. The dataset contains 28K unique short video +scenarios, and 419K annotations. Evaluating state-of-the-art vision-language +models on our benchmark shows that their performance is below human +capabilities, with GPT-4V responding truthfully to 59.6% of the questions +compared to 96.6% for humans. For evaluation, we propose a truthfulness +classifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation +coefficient to human evaluations, surpassing existing techniques like METEOR, +BLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run +extensive ablation studies to understand its performance. We release our +dataset and benchmark as an evaluation platform for vision-language models in +autonomous driving. + +
+
+ comment: Accepted to ECCV 2024. Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ ☆ Manydepth2: Motion-Aware Self-Supervised Monocular Depth Estimation in + Dynamic Scenes + + +
+ Despite advancements in self-supervised monocular depth estimation, +challenges persist in dynamic scenarios due to the dependence on assumptions +about a static world. In this paper, we present Manydepth2, a Motion-Guided +Cost Volume Depth Net, to achieve precise depth estimation for both dynamic +objects and static backgrounds, all while maintaining computational efficiency. +To tackle the challenges posed by dynamic content, we incorporate optical flow +and coarse monocular depth to create a novel static reference frame. This frame +is then utilized to build a motion-guided cost volume in collaboration with the +target frame. Additionally, to enhance the accuracy and resilience of the +network structure, we introduce an attention-based depth net architecture to +effectively integrate information from feature maps with varying resolutions. +Compared to methods with similar computational costs, Manydepth2 achieves a +significant reduction of approximately five percent in root-mean-square error +for self-supervised monocular depth estimation on the KITTI-2015 dataset. The +code could be found: https://github.com/kaichen-z/Manydepth2 + +
+
+ comment: Monocular Depth Estimation, Self-Supervised, Optical Flow +
+
+
+
+
+ + ♻ ☆ CollaMamba: Efficient Collaborative Perception with Cross-Agent + Spatial-Temporal State Space Model AAAI 2025 + + +
+ By sharing complementary perceptual information, multi-agent collaborative +perception fosters a deeper understanding of the environment. Recent studies on +collaborative perception mostly utilize CNNs or Transformers to learn feature +representation and fusion in the spatial dimension, which struggle to handle +long-range spatial-temporal features under limited computing and communication +resources. Holistically modeling the dependencies over extensive spatial areas +and extended temporal frames is crucial to enhancing feature quality. To this +end, we propose a resource efficient cross-agent spatial-temporal collaborative +state space model (SSM), named CollaMamba. Initially, we construct a +foundational backbone network based on spatial SSM. This backbone adeptly +captures positional causal dependencies from both single-agent and cross-agent +views, yielding compact and comprehensive intermediate features while +maintaining linear complexity. Furthermore, we devise a history-aware feature +boosting module based on temporal SSM, extracting contextual cues from extended +historical frames to refine vague features while preserving low overhead. +Extensive experiments across several datasets demonstrate that CollaMamba +outperforms state-of-the-art methods, achieving higher model accuracy while +reducing computational and communication overhead by up to 71.9% and 1/64, +respectively. This work pioneers the exploration of the Mamba's potential in +collaborative perception. The source code will be made available. + +
+
+ comment: Submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Computational Trichromacy Reconstruction: Empowering the Color-Vision + Deficient to Recognize Colors Using Augmented Reality + + +
+ We propose an assistive technology that helps individuals with Color Vision +Deficiencies (CVD) to recognize/name colors. A dichromat's color perception is +a reduced two-dimensional (2D) subset of a normal trichromat's three +dimensional color (3D) perception, leading to confusion when visual stimuli +that appear identical to the dichromat are referred to by different color +names. Using our proposed system, CVD individuals can interactively induce +distinct perceptual changes to originally confusing colors via a computational +color space transformation. By combining their original 2D precepts for colors +with the discriminative changes, a three dimensional color space is +reconstructed, where the dichromat can learn to resolve color name confusions +and accurately recognize colors. Our system is implemented as an Augmented +Reality (AR) interface on smartphones, where users interactively control the +rotation through swipe gestures and observe the induced color shifts in the +camera view or in a displayed image. Through psychophysical experiments and a +longitudinal user study, we demonstrate that such rotational color shifts have +discriminative power (initially confusing colors become distinct under +rotation) and exhibit structured perceptual shifts dichromats can learn with +modest training. The AR App is also evaluated in two real-world scenarios +(building with lego blocks and interpreting artistic works); users all report +positive experience in using the App to recognize object colors that they +otherwise could not. + +
+
+
+
+
+ + ♻ ☆ EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS + + +
+ Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view +scene synthesis. It addresses the challenges of lengthy training times and slow +rendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid, +differentiable rasterization of 3D Gaussians, 3D-GS achieves real-time +rendering and accelerated training. They, however, demand substantial memory +resources for both training and storage, as they require millions of Gaussians +in their point cloud representation for each scene. We present a technique +utilizing quantized embeddings to significantly reduce per-point memory storage +requirements and a coarse-to-fine training strategy for a faster and more +stable optimization of the Gaussian point clouds. Our approach develops a +pruning stage which results in scene representations with fewer Gaussians, +leading to faster training times and rendering speeds for real-time rendering +of high resolution scenes. We reduce storage memory by more than an order of +magnitude all while preserving the reconstruction quality. We validate the +effectiveness of our approach on a variety of datasets and scenes preserving +the visual quality while consuming 10-20x lesser memory and faster +training/inference speed. Project page and code is available +https://efficientgaussian.github.io + +
+
+ comment: Website: https://efficientgaussian.github.io Code: + https://github.com/Sharath-girish/efficientgaussian +
+
+
+
+
+ + ♻ ☆ Low-Rank Interconnected Adaptation across Layers + + +
+ Low-rank adaptation (LoRA) is a powerful parameter-efficient fine-tuning +method that utilizes low-rank projectors $A$ and $B$ to learn weight updates +$\Delta W$ for adaptation targets $W$. Previous research has shown that LoRA is +essentially a gradient compressor, performing random projections on the +gradient using a fixed projection matrix $A_0$. However, this setup restricts +the overall weight update to be low-rank, which limits the adaptation +performance. In this paper, we propose low-rank interconnected adaptation +across layers (Lily). Specifically, we employ a hierarchical framework where +low-dimensional projectors (LPs) retained for downward projection at a +particular level, while globally-shared high-dimensional projector (HP) experts +perform upward projection across all levels of layers. Lily uniquely connects +each LP to all HP experts, therefore the gradient projections are no longer +dominated by fixed projection matrices, but rather by selective combinations of +all the projectors, thereby breaking the low-rank constraint of LoRA. +Furthermore, Lily's cross-layer connections facilitate the capture of intricate +information and dependencies across different layers, thereby enhancing the +model's representational capabilities. Experiments across various modalities, +architectures, and model sizes underscore Lily's great performance and +efficiency. Code is available on github https://github.com/yibozhong/lily. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds ICRA + + +
+ A Colored point cloud, as a simple and efficient 3D representation, has many +advantages in various fields, including robotic navigation and scene +reconstruction. This representation is now commonly used in 3D reconstruction +tasks relying on cameras and LiDARs. However, fusing data from these two types +of sensors is poorly performed in many existing frameworks, leading to +unsatisfactory mapping results, mainly due to inaccurate camera poses. This +paper presents OmniColor, a novel and efficient algorithm to colorize point +clouds using an independent 360-degree camera. Given a LiDAR-based point cloud +and a sequence of panorama images with initial coarse camera poses, our +objective is to jointly optimize the poses of all frames for mapping images +onto geometric reconstructions. Our pipeline works in an off-the-shelf manner +that does not require any feature extraction or matching process. Instead, we +find optimal poses by directly maximizing the photometric consistency of LiDAR +maps. In experiments, we show that our method can overcome the severe visual +distortion of omnidirectional images and greatly benefit from the wide field of +view (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy +and stability. The code will be released at +https://github.com/liubonan123/OmniColor/. + +
+
+ comment: 2024 IEEE International Conference on Robotics and Automation (ICRA) +
+
+
+
+
+ + ♻ ☆ SF-MMCN: Low-Power Sever Flow Multi-Mode Diffusion Model Accelerator + + +
+ Generative Artificial Intelligence (AI) has become incredibly popular in +recent years, and the significance of traditional accelerators in dealing with +large-scale parameters is urgent. With the diffusion model's parallel +structure, the hardware design challenge has skyrocketed because of the +multiple layers operating simultaneously. Convolution Neural Network (CNN) +accelerators have been designed and developed rapidly, especially for +high-speed inference. Often, CNN models with parallel structures are deployed. +In these CNN accelerators, many Processing Elements (PE) are required to +perform parallel computations, mainly the multiply and accumulation (MAC) +operation, resulting in high power consumption and a large silicon area. In +this work, a Server Flow Multi-Mode CNN Unit (SF-MMCN) is proposed to reduce +the number of PE while improving the operation efficiency of the CNN +accelerator. The pipelining technique is introduced into Server Flow to process +parallel computations. The proposed SF-MMCN is implemented with TSMC 90-nm CMOS +technology. It is evaluated with VGG-16, ResNet-18, and U-net. The evaluation +results show that the proposed SF-MMCN can reduce the power consumption by 92%, +and the silicon area by 70%, while improving the efficiency of operation by +nearly 81 times. A new FoM, area efficiency (GOPs/mm^2) is also introduced to +evaluate the performance of the accelerator in terms of the ratio throughput +(GOPs) and silicon area (mm^2). In this FoM, SF-MMCN improves area efficiency +by 18 times (18.42). + +
+
+ comment: 16 pages, 16 figures; extend the CNN to process Diffusion Model + (possible this is the first reported hardware Diffusion Model implementation) +
+
+
+
+
+ + ♻ ☆ 2D and 3D Deep Learning Models for MRI-based Parkinson's Disease + Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold + Networks, Convolutional Neural Networks, and Graph Convolutional Networks + + +
+ Parkinson's Disease (PD) diagnosis remains challenging. This study applies +Convolutional Kolmogorov-Arnold Networks (ConvKANs), integrating learnable +spline-based activation functions into convolutional layers, for PD +classification using structural MRI. The first 3D implementation of ConvKANs +for medical imaging is presented, comparing their performance to Convolutional +Neural Networks (CNNs) and Graph Convolutional Networks (GCNs) across three +open-source datasets. Isolated analyses assessed performance within individual +datasets, using cross-validation techniques. Holdout analyses evaluated +cross-dataset generalizability by training models on two datasets and testing +on the third, mirroring real-world clinical scenarios. In isolated analyses, 2D +ConvKANs achieved the highest AUC of 0.99 (95% CI: 0.98-0.99) on the PPMI +dataset, outperforming 2D CNNs (AUC: 0.97, p = 0.0092). 3D models showed +promise, with 3D CNN and 3D ConvKAN reaching an AUC of 0.85 on PPMI. In holdout +analyses, 3D ConvKAN demonstrated superior generalization, achieving an AUC of +0.85 on early-stage PD data. GCNs underperformed in 2D but improved in 3D +implementations. These findings highlight ConvKANs' potential for PD detection, +emphasize the importance of 3D analysis in capturing subtle brain changes, and +underscore cross-dataset generalization challenges. This study advances +AI-assisted PD diagnosis using structural MRI and emphasizes the need for +larger-scale validation. + +
+
+ comment: 7 figures +
+
+
+
+
+ + ♻ ☆ Diffusion-based Generative Image Outpainting for Recovery of + FOV-Truncated CT Images + + +
+ Field-of-view (FOV) recovery of truncated chest CT scans is crucial for +accurate body composition analysis, which involves quantifying skeletal muscle +and subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables +disease prognostication. Here, we present a method for recovering truncated CT +slices using generative image outpainting. We train a diffusion model and apply +it to truncated CT slices generated by simulating a small FOV. Our model +reliably recovers the truncated anatomy and outperforms the previous +state-of-the-art despite being trained on 87% less data. + +
+
+ comment: Shared last authorship: Florian J. Fintelmann and Philip M\"uller +
+
+
+
+
+ + ♻ ☆ Enhanced Unsupervised Image-to-Image Translation Using Contrastive + Learning and Histogram of Oriented Gradients + + +
+ Image-to-Image Translation is a vital area of computer vision that focuses on +transforming images from one visual domain to another while preserving their +core content and structure. However, this field faces two major challenges: +first, the data from the two domains are often unpaired, making it difficult to +train generative adversarial networks effectively; second, existing methods +tend to produce artifacts or hallucinations during image generation, leading to +a decline in image quality. To address these issues, this paper proposes an +enhanced unsupervised image-to-image translation method based on the +Contrastive Unpaired Translation (CUT) model, incorporating Histogram of +Oriented Gradients (HOG) features. This novel approach ensures the preservation +of the semantic structure of images, even without semantic labels, by +minimizing the loss between the HOG features of input and generated images. The +method was tested on translating synthetic game environments from GTA5 dataset +to realistic urban scenes in cityscapes dataset, demonstrating significant +improvements in reducing hallucinations and enhancing image quality. + +
+
+ comment: Critical Errors in Data or Analysis +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous + Driving ECCV 2024 + + +
+ The scale-up of autonomous vehicles depends heavily on their ability to deal +with anomalies, such as rare objects on the road. In order to handle such +situations, it is necessary to detect anomalies in the first place. Anomaly +detection for autonomous driving has made great progress in the past years but +suffers from poorly designed benchmarks with a strong focus on camera data. In +this work, we propose AnoVox, the largest benchmark for ANOmaly detection in +autonomous driving to date. AnoVox incorporates large-scale multimodal sensor +data and spatial VOXel ground truth, allowing for the comparison of methods +independent of their used sensor. We propose a formal definition of normality +and provide a compliant training dataset. AnoVox is the first benchmark to +contain both content and temporal anomalies. + +
+
+ comment: Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\"o{\ss}ler + contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop +
+
+
+
+
+ + ♻ ☆ Interpretable Vision-Language Survival Analysis with Ordinal Inductive + Bias for Computational Pathology + + +
+ Histopathology Whole-Slide Images (WSIs) provide an important tool to assess +cancer prognosis in computational pathology (CPATH). While existing survival +analysis (SA) approaches have made exciting progress, they are generally +limited to adopting highly-expressive architectures and only coarse-grained +patient-level labels to learn prognostic visual representations from gigapixel +WSIs. Such learning paradigm suffers from important performance bottlenecks, +when facing present scarce training data and standard multi-instance learning +(MIL) framework in CPATH. To overcome it, this paper, for the first time, +proposes a new Vision-Language-based SA (VLSA) paradigm. Concretely, (1) VLSA +is driven by pathology VL foundation models. It no longer relies on +high-capability networks and shows the advantage of data efficiency. (2) In +vision-end, VLSA encodes prognostic language prior and then employs it as +auxiliary signals to guide the aggregating of prognostic visual features at +instance level, thereby compensating for the weak supervision in MIL. Moreover, +given the characteristics of SA, we propose i) ordinal survival prompt learning +to transform continuous survival labels into textual prompts; and ii) ordinal +incidence function as prediction target to make SA compatible with VL-based +prediction. Notably, VLSA's predictions can be interpreted intuitively by our +Shapley values-based method. The extensive experiments on five datasets confirm +the effectiveness of our scheme. Our VLSA could pave a new way for SA in CPATH +by offering weakly-supervised MIL an effective means to learn valuable +prognostic clues from gigapixel WSIs. Our source code is available at +https://github.com/liupei101/VLSA. + +
+
+ comment: 24 pages, 11 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models + + +
+ Text-to-image diffusion models have demonstrated unprecedented capabilities +for flexible and realistic image synthesis. Nevertheless, these models rely on +a time-consuming sampling procedure, which has motivated attempts to reduce +their latency. When improving efficiency, researchers often use the original +diffusion model to train an additional network designed specifically for fast +image generation. In contrast, our approach seeks to reduce latency directly, +without any retraining, fine-tuning, or knowledge distillation. In particular, +we find the repeated calculation of attention maps to be costly yet redundant, +and instead suggest reusing them during sampling. Our specific reuse strategies +are based on ODE theory, which implies that the later a map is reused, the +smaller the distortion in the final image. We empirically compare these reuse +strategies with few-step sampling procedures of comparable latency, finding +that reuse generates images that are closer to those produced by the original +high-latency diffusion model. + +
+
+
+
+
+ + ♻ ☆ ICON: Improving Inter-Report Consistency in Radiology Report Generation + via Lesion-aware Mixup Augmentation + + +
+ Previous research on radiology report generation has made significant +progress in terms of increasing the clinical accuracy of generated reports. In +this paper, we emphasize another crucial quality that it should possess, i.e., +inter-report consistency, which refers to the capability of generating +consistent reports for semantically equivalent radiographs. This quality is +even of greater significance than the overall report accuracy in terms of +ensuring the system's credibility, as a system prone to providing conflicting +results would severely erode users' trust. Regrettably, existing approaches +struggle to maintain inter-report consistency, exhibiting biases towards common +patterns and susceptibility to lesion variants. To address this issue, we +propose ICON, which improves the inter-report consistency of radiology report +generation. Aiming to enhance the system's ability to capture similarities in +semantically equivalent lesions, our approach first involves extracting lesions +from input images and examining their characteristics. Then, we introduce a +lesion-aware mixup technique to ensure that the representations of the +semantically equivalent lesions align with the same attributes, achieved +through a linear combination during the training phase. Extensive experiments +on three publicly available chest X-ray datasets verify the effectiveness of +our approach, both in terms of improving the consistency and accuracy of the +generated reports. + +
+
+
+
+
+ + ♻ ☆ Direct Learning of Mesh and Appearance via 3D Gaussian Splatting + + +
+ Accurately reconstructing a 3D scene including explicit geometry information +is both attractive and challenging. Geometry reconstruction can benefit from +incorporating differentiable appearance models, such as Neural Radiance Fields +and 3D Gaussian Splatting (3DGS). However, existing methods encounter +efficiency issues due to indirect geometry learning and the paradigm of +separately modeling geometry and surface appearance. In this work, we propose a +learnable scene model that incorporates 3DGS with an explicit geometry +representation, namely a mesh. Our model learns the mesh and appearance in an +end-to-end manner, where we bind 3D Gaussians to the mesh faces and perform +differentiable rendering of 3DGS to obtain photometric supervision. The model +creates an effective information pathway to supervise the learning of both 3DGS +and mesh. Experimental results demonstrate that the learned scene model not +only achieves state-of-the-art efficiency and rendering quality but also +supports manipulation using the explicit mesh. In addition, our model has a +unique advantage in adapting to scene updates, thanks to the end-to-end +learning of both mesh and appearance. + +
+
+
+
+
+ + ♻ ☆ Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space + + +
+ Watermarking is a tool for actively identifying and attributing the images +generated by latent diffusion models. Existing methods face the dilemma of +image quality and watermark robustness. Watermarks with superior image quality +usually have inferior robustness against attacks such as blurring and JPEG +compression, while watermarks with superior robustness usually significantly +damage image quality. This dilemma stems from the traditional paradigm where +watermarks are injected and detected in pixel space, relying on pixel +perturbation for watermark detection and resilience against attacks. In this +paper, we highlight that an effective solution to the problem is to both inject +and detect watermarks in the latent diffusion space, and propose Latent +Watermark with a progressive training strategy. It weakens the direct +connection between quality and robustness and thus alleviates their +contradiction. We conduct evaluations on two datasets and against 10 watermark +attacks. Six metrics measure the image quality and watermark robustness. +Results show that compared to the recently proposed methods such as +StableSignature, StegaStamp, RoSteALS, LaWa, TreeRing, and DiffuseTrace, LW not +only surpasses them in terms of robustness but also offers superior image +quality. Our code will be available at +https://github.com/RichardSunnyMeng/LatentWatermark. + +
+
+
+
+
+ + ♻ ☆ Deep Self-Cleansing for Medical Image Segmentation with Noisy Labels + + +
+ Medical image segmentation is crucial in the field of medical imaging, aiding +in disease diagnosis and surgical planning. Most established segmentation +methods rely on supervised deep learning, in which clean and precise labels are +essential for supervision and significantly impact the performance of models. +However, manually delineated labels often contain noise, such as missing labels +and inaccurate boundary delineation, which can hinder networks from correctly +modeling target characteristics. In this paper, we propose a deep +self-cleansing segmentation framework that can preserve clean labels while +cleansing noisy ones in the training phase. To achieve this, we devise a +gaussian mixture model-based label filtering module that distinguishes noisy +labels from clean labels. Additionally, we develop a label cleansing module to +generate pseudo low-noise labels for identified noisy samples. The preserved +clean labels and pseudo-labels are then used jointly to supervise the network. +Validated on a clinical liver tumor dataset and a public cardiac diagnosis +dataset, our method can effectively suppress the interference from noisy labels +and achieve prominent segmentation performance. + +
+
+ comment: 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ MMCode: Benchmarking Multimodal Large Language Models for Code + Generation with Visually Rich Programming Problems EMNLP 2024 + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/likaixin2000/MMCode. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal + Transport + + +
+ Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images +sharing the same category across diverse domains without relying on labeled +data. Prior approaches have typically decomposed the UCIR problem into two +distinct tasks: intra-domain representation learning and cross-domain feature +alignment. However, these segregated strategies overlook the potential +synergies between these tasks. This paper introduces ProtoOT, a novel Optimal +Transport formulation explicitly tailored for UCIR, which integrates +intra-domain feature representation learning and cross-domain alignment into a +unified framework. ProtoOT leverages the strengths of the K-means clustering +method to effectively manage distribution imbalances inherent in UCIR. By +utilizing K-means for generating initial prototypes and approximating class +marginal distributions, we modify the constraints in Optimal Transport +accordingly, significantly enhancing its performance in UCIR scenarios. +Furthermore, we incorporate contrastive learning into the ProtoOT framework to +further improve representation learning. This encourages local semantic +consistency among features with similar semantics, while also explicitly +enforcing separation between features and unmatched prototypes, thereby +enhancing global discriminativeness. ProtoOT surpasses existing +state-of-the-art methods by a notable margin across benchmark datasets. +Notably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 18.17%, +and on Office-Home, it demonstrates a P@15 improvement of 3.83%. + +
+
+
+
+
+ + ♻ ☆ Fixed-length Dense Descriptor for Efficient Fingerprint Matching + + +
+ In fingerprint matching, fixed-length descriptors generally offer greater +efficiency compared to minutiae set, but the recognition accuracy is not as +good as that of the latter. Although much progress has been made in deep +learning based fixed-length descriptors recently, they often fall short when +dealing with incomplete or partial fingerprints, diverse fingerprint poses, and +significant background noise. In this paper, we propose a three-dimensional +representation called Fixed-length Dense Descriptor (FDD) for efficient +fingerprint matching. FDD features great spatial properties, enabling it to +capture the spatial relationships of the original fingerprints, thereby +enhancing interpretability and robustness. Our experiments on various +fingerprint datasets reveal that FDD outperforms other fixed-length +descriptors, especially in matching fingerprints of different areas, +cross-modal fingerprint matching, and fingerprint matching with background +noise. + +
+
+ comment: Accepted by WIFS 2024 +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems play a crucial role in achieving Sustainable +Development Goals by collecting and analyzing vital global data through +satellite networks. These systems are essential for tasks like mapping, +disaster monitoring, and resource management, but they face challenges in +processing and transmitting large volumes of EO data, especially in specialized +fields such as agriculture and real-time disaster response. Domain-adapted +Large Language Models (LLMs) provide a promising solution by facilitating data +fusion between extensive EO data and semantic EO data. By improving integration +and interpretation of diverse datasets, LLMs address the challenges of +processing specialized information in agriculture and disaster response +applications. This fusion enhances the accuracy and relevance of transmitted +data. This paper presents a framework for semantic communication in EO +satellite networks, aimed at improving data transmission efficiency and overall +system performance through cognitive processing techniques. The proposed system +employs Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic +Data Augmentation (SA) to focus on relevant information while minimizing +communication overhead. By integrating cognitive semantic processing and +inter-satellite links, the framework enhances the analysis and transmission of +multispectral satellite imagery, improving object detection, pattern +recognition, and real-time decision-making. The introduction of Cognitive +Semantic Augmentation (CSA) allows satellites to process and transmit semantic +information, boosting adaptability to changing environments and application +needs. This end-to-end architecture is tailored for next-generation satellite +networks, such as those supporting 6G, and demonstrates significant +improvements in efficiency and accuracy. + +
+
+ comment: 18 pages, 10 figures, magazine +
+
+
+
+
+ + ♻ ☆ EAGLE: Towards Efficient Arbitrary Referring Visual Prompts + Comprehension for Multimodal Large Language Models + + +
+ Recently, Multimodal Large Language Models (MLLMs) have sparked great +research interests owing to their exceptional content-reasoning and +instruction-following capabilities. To effectively instruct an MLLM, in +addition to conventional language expressions, the practice of referring to +objects by painting with brushes on images has emerged as a prevalent tool +(referred to as "referring visual prompts") due to its efficacy in aligning the +user's intention with specific image regions. To accommodate the most common +referring visual prompts, namely points, boxes, and masks, existing approaches +initially utilize specialized feature encoding modules to capture the semantics +of the highlighted areas indicated by these prompts. Subsequently, these +encoded region features are adapted to MLLMs through fine-tuning on a +meticulously curated multimodal instruction dataset. However, such designs +suffer from redundancy in architecture. Moreover, they face challenges in +effectively generalizing when encountering a diverse range of arbitrary +referring visual prompts in real-life scenarios. To address the above issues, +we propose EAGLE, a novel MLLM that empowers comprehension of arbitrary +referring visual prompts with less training efforts than existing approaches. +Specifically, our EAGLE maintains the innate format of the referring visual +prompts as colored patches rendered on the given image for conducting the +instruction tuning. Our approach embeds referring visual prompts as spatial +concepts conveying specific spatial areas comprehensible to the MLLM, with the +semantic comprehension of these regions originating from the MLLM itself. +Besides, we also propose a Geometry-Agnostic Learning paradigm (GAL) to further +disentangle the MLLM's region-level comprehension with the specific formats of +referring visual prompts. Extensive experiments are conducted to prove the +effectiveness of our proposed method. + +
+
+
+
+
+ + ♻ ☆ GenWarp: Single Image to Novel Views with Semantic-Preserving Generative + Warping NeurIPS 2024 + + +
+ Generating novel views from a single image remains a challenging task due to +the complexity of 3D scenes and the limited diversity in the existing +multi-view datasets to train a model on. Recent research combining large-scale +text-to-image (T2I) models with monocular depth estimation (MDE) has shown +promise in handling in-the-wild images. In these methods, an input view is +geometrically warped to novel views with estimated depth maps, then the warped +image is inpainted by T2I models. However, they struggle with noisy depth maps +and loss of semantic details when warping an input view to novel viewpoints. In +this paper, we propose a novel approach for single-shot novel view synthesis, a +semantic-preserving generative warping framework that enables T2I generative +models to learn where to warp and where to generate, through augmenting +cross-view attention with self-attention. Our approach addresses the +limitations of existing methods by conditioning the generative model on source +view images and incorporating geometric warping signals. Qualitative and +quantitative evaluations demonstrate that our model outperforms existing +methods in both in-domain and out-of-domain scenarios. Project page is +available at https://GenWarp-NVS.github.io/. + +
+
+ comment: Accepted to NeurIPS 2024 / Project page: + https://GenWarp-NVS.github.io +
+
+
+
+
+ + ♻ ☆ EPTQ: Enhanced Post-Training Quantization via Hessian-guided + Network-wise Optimization + + +
+ Quantization is a key method for deploying deep neural networks on edge +devices with limited memory and computation resources. Recent improvements in +Post-Training Quantization (PTQ) methods were achieved by an additional local +optimization process for learning the weight quantization rounding policy. +However, a gap exists when employing network-wise optimization with small +representative datasets. In this paper, we propose a new method for enhanced +PTQ (EPTQ) that employs a network-wise quantization optimization process, which +benefits from considering cross-layer dependencies during optimization. EPTQ +enables network-wise optimization with a small representative dataset using a +novel sample-layer attention score based on a label-free Hessian matrix upper +bound. The label-free approach makes our method suitable for the PTQ scheme. We +give a theoretical analysis for the said bound and use it to construct a +knowledge distillation loss that guides the optimization to focus on the more +sensitive layers and samples. In addition, we leverage the Hessian upper bound +to improve the weight quantization parameters selection by focusing on the more +sensitive elements in the weight tensors. Empirically, by employing EPTQ we +achieve state-of-the-art results on various models, tasks, and datasets, +including ImageNet classification, COCO object detection, and Pascal-VOC for +semantic segmentation. + +
+
+
+
+
+ + ♻ ☆ Masks and Boxes: Combining the Best of Both Worlds for Multi-Object + Tracking + + +
+ Multi-object tracking (MOT) involves identifying and consistently tracking +objects across video sequences. Traditional tracking-by-detection methods, +while effective, often require extensive tuning and lack generalizability. On +the other hand, segmentation mask-based methods are more generic but struggle +with tracking management, making them unsuitable for MOT. We propose a novel +approach, McByte, which incorporates a temporally propagated segmentation mask +as a strong association cue within a tracking-by-detection framework. By +combining bounding box and mask information, McByte enhances robustness and +generalizability without per-sequence tuning. Evaluated on four benchmark +datasets - DanceTrack, MOT17, SoccerNet-tracking 2022, and KITTI-tracking - +McByte demonstrates performance gain in all cases examined. At the same time, +it outperforms existing mask-based methods. Implementation code will be +provided upon acceptance. + +
+
+
+
+
+ + ♻ ☆ HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images + Using Deep Learning + + +
+ The current standard for detecting human epidermal growth factor receptor 2 +(HER2) status in breast cancer patients relies on HER2 amplification, +identified through fluorescence in situ hybridization (FISH) or +immunohistochemistry (IHC). However, hematoxylin and eosin (H\&E) tumor stains +are more widely available, and accurately predicting HER2 status using H\&E +could reduce costs and expedite treatment selection. Deep Learning algorithms +for H&E have shown effectiveness in predicting various cancer features and +clinical outcomes, including moderate success in HER2 status prediction. In +this work, we employed a customized weak supervision classification technique +combined with MoCo-v2 contrastive learning to predict HER2 status. We trained +our pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The +Cancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale +School of Medicine are publicly available. Our pipeline achieved an Area Under +the Curve (AUC) of 0.85 across four different test folds. Additionally, we +tested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2 +score of 2+ and included corresponding HER2 status and FISH test results. These +cases are considered equivocal for IHC, requiring an expensive FISH test on +their IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81 +on these challenging H&E slides. Reducing the need for FISH test can have +significant implications in cancer treatment equity for underserved +populations. + +
+
+
+
+
+ + ♻ ☆ FruitNeRF: A Unified Neural Radiance Field based Fruit Counting + Framework + + +
+ We introduce FruitNeRF, a unified novel fruit counting framework that +leverages state-of-the-art view synthesis methods to count any fruit type +directly in 3D. Our framework takes an unordered set of posed images captured +by a monocular camera and segments fruit in each image. To make our system +independent of the fruit type, we employ a foundation model that generates +binary segmentation masks for any fruit. Utilizing both modalities, RGB and +semantic, we train a semantic neural radiance field. Through uniform volume +sampling of the implicit Fruit Field, we obtain fruit-only point clouds. By +applying cascaded clustering on the extracted point cloud, our approach +achieves precise fruit count.The use of neural radiance fields provides +significant advantages over conventional methods such as object tracking or +optical flow, as the counting itself is lifted into 3D. Our method prevents +double counting fruit and avoids counting irrelevant fruit.We evaluate our +methodology using both real-world and synthetic datasets. The real-world +dataset consists of three apple trees with manually counted ground truths, a +benchmark apple dataset with one row and ground truth fruit location, while the +synthetic dataset comprises various fruit types including apple, plum, lemon, +pear, peach, and mango.Additionally, we assess the performance of fruit +counting using the foundation model compared to a U-Net. + +
+
+ comment: Project Page: https://meyerls.github.io/fruit_nerf/ +
+
+
+
+
+ + ♻ ☆ Improving Fast Adversarial Training Paradigm: An Example Taxonomy + Perspective + + +
+ While adversarial training is an effective defense method against adversarial +attacks, it notably increases the training cost. To this end, fast adversarial +training (FAT) is presented for efficient training and has become a hot +research topic. However, FAT suffers from catastrophic overfitting, which leads +to a performance drop compared with multi-step adversarial training. However, +the cause of catastrophic overfitting remains unclear and lacks exploration. In +this paper, we present an example taxonomy in FAT, which identifies that +catastrophic overfitting is caused by the imbalance between the inner and outer +optimization in FAT. Furthermore, we investigated the impact of varying degrees +of training loss, revealing a correlation between training loss and +catastrophic overfitting. Based on these observations, we redesign the loss +function in FAT with the proposed dynamic label relaxation to concentrate the +loss range and reduce the impact of misclassified examples. Meanwhile, we +introduce batch momentum initialization to enhance the diversity to prevent +catastrophic overfitting in an efficient manner. Furthermore, we also propose +Catastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate +training strategy for examples based on their loss degree. Our proposed method, +named example taxonomy aware FAT (ETA), establishes an improved paradigm for +FAT. Experiment results demonstrate our ETA achieves state-of-the-art +performance. Comprehensive experiments on four standard datasets demonstrate +the competitiveness of our proposed method. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Efficient Video Object Segmentation via Modulated Cross-Attention Memory WACV 2025 + + +
+ Recently, transformer-based approaches have shown promising results for +semi-supervised video object segmentation. However, these approaches typically +struggle on long videos due to increased GPU memory demands, as they frequently +expand the memory bank every few frames. We propose a transformer-based +approach, named MAVOS, that introduces an optimized and dynamic long-term +modulated cross-attention (MCA) memory to model temporal smoothness without +requiring frequent memory expansion. The proposed MCA effectively encodes both +local and global features at various levels of granularity while efficiently +maintaining consistent speed regardless of the video length. Extensive +experiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017, +demonstrate the effectiveness of our proposed contributions leading to +real-time inference and markedly reduced memory demands without any degradation +in segmentation accuracy on long videos. Compared to the best existing +transformer-based approach, our MAVOS increases the speed by 7.6x, while +significantly reducing the GPU memory by 87% with comparable segmentation +performance on short and long video datasets. Notably on the LVOS dataset, our +MAVOS achieves a J&F score of 63.3% while operating at 37 frames per second +(FPS) on a single V100 GPU. Our code and models will be publicly available at: +https://github.com/Amshaker/MAVOS. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor + Reconstruction + + +
+ Neural implicit reconstruction via volume rendering has demonstrated its +effectiveness in recovering dense 3D surfaces. However, it is non-trivial to +simultaneously recover meticulous geometry and preserve smoothness across +regions with differing characteristics. To address this issue, previous methods +typically employ geometric priors, which are often constrained by the +performance of the prior models. In this paper, we propose ND-SDF, which learns +a Normal Deflection field to represent the angular deviation between the scene +normal and the prior normal. Unlike previous methods that uniformly apply +geometric priors on all samples, introducing significant bias in accuracy, our +proposed normal deflection field dynamically learns and adapts the utilization +of samples based on their specific characteristics, thereby improving both the +accuracy and effectiveness of the model. Our method not only obtains smooth +weakly textured regions such as walls and floors but also preserves the +geometric details of complex structures. In addition, we introduce a novel ray +sampling strategy based on the deflection angle to facilitate the unbiased +rendering process, which significantly improves the quality and accuracy of +intricate surfaces, especially on thin structures. Consistent improvements on +various challenging datasets demonstrate the superiority of our method. + +
+
+
+
+
+ + ♻ ☆ VideoPatchCore: An Effective Method to Memorize Normality for Video + Anomaly Detection ACCV 2024 + + +
+ Video anomaly detection (VAD) is a crucial task in video analysis and +surveillance within computer vision. Currently, VAD is gaining attention with +memory techniques that store the features of normal frames. The stored features +are utilized for frame reconstruction, identifying an abnormality when a +significant difference exists between the reconstructed and input frames. +However, this approach faces several challenges due to the simultaneous +optimization required for both the memory and encoder-decoder model. These +challenges include increased optimization difficulty, complexity of +implementation, and performance variability depending on the memory size. To +address these challenges,we propose an effective memory method for VAD, called +VideoPatchCore. Inspired by PatchCore, our approach introduces a structure that +prioritizes memory optimization and configures three types of memory tailored +to the characteristics of video data. This method effectively addresses the +limitations of existing memory-based methods, achieving good performance +comparable to state-of-the-art methods. Furthermore, our method requires no +training and is straightforward to implement, making VAD tasks more accessible. +Our code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore. + +
+
+ comment: Accepted to ACCV 2024 +
+
+
+
+
+ + ♻ ☆ AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising NeurIPS 2024 + + +
+ Diffusion models have garnered significant interest from the community for +their great generative ability across various applications. However, their +typical multi-step sequential-denoising nature gives rise to high cumulative +latency, thereby precluding the possibilities of parallel computation. To +address this, we introduce AsyncDiff, a universal and plug-and-play +acceleration scheme that enables model parallelism across multiple devices. Our +approach divides the cumbersome noise prediction model into multiple +components, assigning each to a different device. To break the dependency chain +between these components, it transforms the conventional sequential denoising +into an asynchronous process by exploiting the high similarity between hidden +states in consecutive diffusion steps. Consequently, each component is +facilitated to compute in parallel on separate devices. The proposed strategy +significantly reduces inference latency while minimally impacting the +generative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff +achieves a 2.7x speedup with negligible degradation and a 4.0x speedup with +only a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our +experiments also demonstrate that AsyncDiff can be readily applied to video +diffusion models with encouraging performances. The code is available at +https://github.com/czg1225/AsyncDiff. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SlimSAM: 0.1% Data Makes Segment Anything Slim NeurIPS 2024 + + +
+ Current approaches for compressing the Segment Anything Model (SAM) yield +commendable results, yet necessitate extensive data to train a new network from +scratch. Employing conventional pruning techniques can remarkably reduce data +requirements but would suffer from a degradation in performance. To address +this challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM +compression method that achieves superior performance with extremely less +training data. The essence of SlimSAM is encapsulated in the alternate slimming +framework which effectively enhances knowledge inheritance under severely +limited training data availability and exceptional pruning ratio. Diverging +from prior techniques, our framework progressively compresses the model by +alternately pruning and distilling distinct, decoupled sub-structures. +Disturbed Taylor pruning is also proposed to address the misalignment between +the pruning objective and training target, thereby boosting the +post-distillation after pruning. SlimSAM yields significant performance +improvements while demanding over 10 times less training data than any other +existing compression methods. Even when compared to the original SAM, SlimSAM +achieves approaching performance while reducing parameter counts to merely 1.4% +(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training +data. The code is available at http://github.com/czg1225/SlimSAM. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Text-Guided Single Image Editing for Remote Sensing Images + + +
+ Artificial intelligence generative content (AIGC) has significantly impacted +image generation in the field of remote sensing. However, the equally important +area of remote sensing image (RSI) editing has not received sufficient +attention. Deep learning based editing methods generally involve two sequential +stages: generation and editing. During the generation stage, consistency in +content and details between the original and edited images must be maintained, +while in the editing stage, controllability and accuracy of the edits should be +ensured. For natural images, these challenges can be tackled by training +generative backbones on large-scale benchmark datasets and using text guidance +based on vision-language models (VLMs). However, these previously effective +approaches become less viable for RSIs due to two reasons: First, existing +generative RSI benchmark datasets do not fully capture the diversity of remote +sensing scenarios, particularly in terms of variations in sensors, object +types, and resolutions. Consequently, the generalization capacity of the +trained backbone model is often inadequate for universal editing tasks on RSIs. +Second, the large spatial resolution of RSIs exacerbates the problem in VLMs +where a single text semantic corresponds to multiple image semantics, leading +to the introduction of incorrect semantics when using text to guide RSI +editing. To solve above problems, this paper proposes a text-guided RSI editing +method that is controllable but stable, and can be trained using only a single +image. It adopts a multi-scale training approach to preserve consistency +without the need for training on extensive benchmark datasets, while leveraging +RSI pre-trained VLMs and prompt ensembling (PE) to ensure accuracy and +controllability in the text-guided editing process. + +
+
+ comment: 14 pages, 14 figures, submitted to IEEE Transactions on Geoscience + and Remote Sensing +
+
+
+
+
+ + ♻ ☆ Learning 3D-Aware GANs from Unposed Images with Template Feature Field + + +
+ Collecting accurate camera poses of training images has been shown to well +serve the learning of 3D-aware generative adversarial networks (GANs) yet can +be quite expensive in practice. This work targets learning 3D-aware GANs from +unposed images, for which we propose to perform on-the-fly pose estimation of +training images with a learned template feature field (TeFF). Concretely, in +addition to a generative radiance field as in previous approaches, we ask the +generator to also learn a field from 2D semantic features while sharing the +density from the radiance field. Such a framework allows us to acquire a +canonical 3D feature template leveraging the dataset mean discovered by the +generative model, and further efficiently estimate the pose parameters on real +data. Experimental results on various challenging datasets demonstrate the +superiority of our approach over state-of-the-art alternatives from both the +qualitative and the quantitative perspectives. + +
+
+ comment: https://XDimlab.github.io/TeFF +
+
+
+
+
+ + ♻ ☆ Regional quality estimation for echocardiography using deep learning + + +
+ Automatic estimation of cardiac ultrasound image quality can be beneficial +for guiding operators and ensuring the accuracy of clinical measurements. +Previous work often fails to distinguish the view correctness of the +echocardiogram from the image quality. Additionally, previous studies only +provide a global image quality value, which limits their practical utility. In +this work, we developed and compared three methods to estimate image quality: +1) classic pixel-based metrics like the generalized contrast-to-noise ratio +(gCNR) on myocardial segments as region of interest and left ventricle lumen as +background, obtained using a U-Net segmentation 2) local image coherence +derived from a U-Net model that predicts coherence from B-Mode images 3) a deep +convolutional network that predicts the quality of each region directly in an +end-to-end fashion. We evaluate each method against manual regional image +quality annotations by three experienced cardiologists. The results indicate +poor performance of the gCNR metric, with Spearman correlation to the +annotations of rho = 0.24. The end-to-end learning model obtains the best +result, rho = 0.69, comparable to the inter-observer correlation, rho = 0.63. +Finally, the coherence-based method, with rho = 0.58, outperformed the +classical metrics and is more generic than the end-to-end approach. The image +quality prediction tool is available as an open source Python library at +https://github.com/GillesVanDeVyver/arqee. + +
+
+
+
+
+ + ♻ ☆ High-throughput 3D shape completion of potato tubers on a harvester + + +
+ Potato yield is an important metric for farmers to further optimize their +cultivation practices. Potato yield can be estimated on a harvester using an +RGB-D camera that can estimate the three-dimensional (3D) volume of individual +potato tubers. A challenge, however, is that the 3D shape derived from RGB-D +images is only partially completed, underestimating the actual volume. To +address this issue, we developed a 3D shape completion network, called CoRe++, +which can complete the 3D shape from RGB-D images. CoRe++ is a deep learning +network that consists of a convolutional encoder and a decoder. The encoder +compresses RGB-D images into latent vectors that are used by the decoder to +complete the 3D shape using the deep signed distance field network (DeepSDF). +To evaluate our CoRe++ network, we collected partial and complete 3D point +clouds of 339 potato tubers on an operational harvester in Japan. On the 1425 +RGB-D images in the test set (representing 51 unique potato tubers), our +network achieved a completion accuracy of 2.8 mm on average. For volumetric +estimation, the root mean squared error (RMSE) was 22.6 ml, and this was better +than the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml). +We found that the RMSE can be further reduced to 18.2 ml when performing the 3D +shape completion in the center of the RGB-D image. With an average 3D shape +completion time of 10 milliseconds per tuber, we can conclude that CoRe++ is +both fast and accurate enough to be implemented on an operational harvester for +high-throughput potato yield estimation. Our method can also be applied to +other tuber, fruit and vegetable crops, thereby enabling versatile, accurate +and real-time yield monitoring in precision agriculture. Our code, network +weights and dataset are publicly available at +https://github.com/UTokyo-FieldPhenomics-Lab/corepp.git. + +
+
+ comment: 20 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization + of Diffusion Models + + +
+ Diffusion models have achieved great success in image generation tasks +through iterative noise estimation. However, the heavy denoising process and +complex neural networks hinder their low-latency applications in real-world +scenarios. Quantization can effectively reduce model complexity, and +post-training quantization (PTQ), which does not require fine-tuning, is highly +promising for compressing and accelerating diffusion models. Unfortunately, we +find that due to the highly dynamic distribution of activations in different +denoising steps, existing PTQ methods for diffusion models suffer from +distribution mismatch issues at both calibration sample level and +reconstruction output level, which makes the performance far from satisfactory, +especially in low-bit cases. In this paper, we propose Enhanced Distribution +Alignment for Post-Training Quantization of Diffusion Models (EDA-DM) to +address the above issues. Specifically, at the calibration sample level, we +select calibration samples based on the density and variety in the latent +space, thus facilitating the alignment of their distribution with the overall +samples; and at the reconstruction output level, we modify the loss of block +reconstruction with the losses of layers, aligning the outputs of quantized +model and full-precision model at different network granularity. Extensive +experiments demonstrate that EDA-DM significantly outperforms the existing PTQ +methods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and +different datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO). + +
+
+ comment: Code: http://github.com/BienLuky/EDA-DM +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Open-World Evaluation for Retrieving Diverse Perspectives + + +
+ We study retrieving a set of documents that covers various perspectives on a +complex and contentious question (e.g., will ChatGPT do more harm than good?). +We curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS), +where each example consists of a question and diverse perspectives associated +with the question, sourced from survey questions and debate websites. On this +data, retrievers paired with a corpus are evaluated to surface a document set +that contains diverse perspectives. Our framing diverges from most retrieval +tasks in that document relevancy cannot be decided by simple string matches to +references. Instead, we build a language model based automatic evaluator that +decides whether each retrieved document contains a perspective. This allows us +to evaluate the performance of three different types of corpus (Wikipedia, web +snapshot, and corpus constructed on the fly with retrieved pages from the +search engine) paired with retrievers. Retrieving diverse documents remains +challenging, with the outputs from existing retrievers covering all +perspectives on only 33.74% of the examples. We further study the impact of +query expansion and diversity-focused reranking approaches and analyze +retriever sycophancy. Together, we lay the foundation for future studies in +retrieval diversity handling complex queries. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ Report on the Workshop on Simulations for Information Access (Sim4IA + 2024) at SIGIR 2024 + + +
+ This paper is a report of the Workshop on Simulations for Information Access +(Sim4IA) workshop at SIGIR 2024. The workshop had two keynotes, a panel +discussion, nine lightning talks, and two breakout sessions. Key takeaways were +user simulation's importance in academia and industry, the possible bridging of +online and offline evaluation, and the issues of organizing a companion shared +task around user simulations for information access. We report on how we +organized the workshop, provide a brief overview of what happened at the +workshop, and summarize the main topics and findings of the workshop and future +work. + +
+
+ comment: Preprint of a SIGIR Forum submission for Vol. 58 No. 2 - December + 2024 +
+
+
+
+
+ + ☆ Enhancing Tourism Recommender Systems for Sustainable City Trips Using + Retrieval-Augmented Generation + + +
+ Tourism Recommender Systems (TRS) have traditionally focused on providing +personalized travel suggestions, often prioritizing user preferences without +considering broader sustainability goals. Integrating sustainability into TRS +has become essential with the increasing need to balance environmental impact, +local community interests, and visitor satisfaction. This paper proposes a +novel approach to enhancing TRS for sustainable city trips using Large Language +Models (LLMs) and a modified Retrieval-Augmented Generation (RAG) pipeline. We +enhance the traditional RAG system by incorporating a sustainability metric +based on a city's popularity and seasonal demand during the prompt augmentation +phase. This modification, called Sustainability Augmented Reranking (SAR), +ensures the system's recommendations align with sustainability goals. +Evaluations using popular open-source LLMs, such as Llama-3.1-Instruct-8B and +Mistral-Instruct-7B, demonstrate that the SAR-enhanced approach consistently +matches or outperforms the baseline (without SAR) across most metrics, +highlighting the benefits of incorporating sustainability into TRS. + +
+
+ comment: Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM + Conference on Recommender Systems (RecSys 2024) +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Value Identification in Multistakeholder Recommender Systems for + Humanities and Historical Research: The Case of the Digital Archive + Monasterium.net + + +
+ Recommender systems remain underutilized in humanities and historical +research, despite their potential to enhance the discovery of cultural records. +This paper offers an initial value identification of the multiple stakeholders +that might be impacted by recommendations in Monasterium.net, a digital archive +for historical legal documents. Specifically, we discuss the diverse values and +objectives of its stakeholders, such as editors, aggregators, platform owners, +researchers, publishers, and funding agencies. These in-depth insights into the +potentially conflicting values of stakeholder groups allow designing and +adapting recommender systems to enhance their usefulness for humanities and +historical research. Additionally, our findings will support deeper engagement +with additional stakeholders to refine value models and evaluation metrics for +recommender systems in the given domains. Our conclusions are embedded in and +applicable to other digital archives and a broader cultural heritage context. + +
+
+ comment: To be presented at: NORMalize 2024: The Second Workshop on the + Normative Design and Evaluation of Recommender Systems, October 18, 2024, + co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024), + Bari, Italy +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Autoregressive Generation Strategies for Top-K Sequential + Recommendations + + +
+ The goal of modern sequential recommender systems is often formulated in +terms of next-item prediction. In this paper, we explore the applicability of +generative transformer-based models for the Top-K sequential recommendation +task, where the goal is to predict items a user is likely to interact with in +the "near future". + We explore commonly used autoregressive generation strategies, including +greedy decoding, beam search, and temperature sampling, to evaluate their +performance for the Top-K sequential recommendation task. In addition, we +propose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA) +generation strategies based on multi-sequence generation with temperature +sampling and subsequent aggregation. + Experiments on diverse datasets give valuable insights regarding commonly +used strategies' applicability and show that suggested approaches improve +performance on longer time horizons compared to widely-used Top-K prediction +approach and single-sequence autoregressive generation strategies. + +
+
+
+
+
+ + ☆ Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation + + +
+ News recommendation is a challenging task that involves personalization based +on the interaction history and preferences of each user. Recent works have +leveraged the power of pretrained language models (PLMs) to directly rank news +items by using inference approaches that predominately fall into three +categories: pointwise, pairwise, and listwise learning-to-rank. While pointwise +methods offer linear inference complexity, they fail to capture crucial +comparative information between items that is more effective for ranking tasks. +Conversely, pairwise and listwise approaches excel at incorporating these +comparisons but suffer from practical limitations: pairwise approaches are +either computationally expensive or lack theoretical guarantees, and listwise +methods often perform poorly in practice. In this paper, we propose a novel +framework for PLM-based news recommendation that integrates both pointwise +relevance prediction and pairwise comparisons in a scalable manner. We present +a rigorous theoretical analysis of our framework, establishing conditions under +which our approach guarantees improved performance. Extensive experiments show +that our approach outperforms the state-of-the-art methods on the MIND and +Adressa news recommendation datasets. + +
+
+
+
+
+ + ☆ Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case + Study + + +
+ Extracting meaningful insights from large and complex datasets poses +significant challenges, particularly in ensuring the accuracy and relevance of +retrieved information. Traditional data retrieval methods such as sequential +search and index-based retrieval often fail when handling intricate and +interconnected data structures, resulting in incomplete or misleading outputs. +To overcome these limitations, we introduce Structured-GraphRAG, a versatile +framework designed to enhance information retrieval across structured datasets +in natural language queries. Structured-GraphRAG utilizes multiple knowledge +graphs, which represent data in a structured format and capture complex +relationships between entities, enabling a more nuanced and comprehensive +retrieval of information. This graph-based approach reduces the risk of errors +in language model outputs by grounding responses in a structured format, +thereby enhancing the reliability of results. We demonstrate the effectiveness +of Structured-GraphRAG by comparing its performance with that of a recently +published method using traditional retrieval-augmented generation. Our findings +show that Structured-GraphRAG significantly improves query processing +efficiency and reduces response times. While our case study focuses on soccer +data, the framework's design is broadly applicable, offering a powerful tool +for data analysis and enhancing language model applications across various +structured domains. + +
+
+
+
+
+ + ☆ Improving the Shortest Plank: Vulnerability-Aware Adversarial Training + for Robust Recommender System + + +
+ Recommender systems play a pivotal role in mitigating information overload in +various fields. Nonetheless, the inherent openness of these systems introduces +vulnerabilities, allowing attackers to insert fake users into the system's +training data to skew the exposure of certain items, known as poisoning +attacks. Adversarial training has emerged as a notable defense mechanism +against such poisoning attacks within recommender systems. Existing adversarial +training methods apply perturbations of the same magnitude across all users to +enhance system robustness against attacks. Yet, in reality, we find that +attacks often affect only a subset of users who are vulnerable. These +perturbations of indiscriminate magnitude make it difficult to balance +effective protection for vulnerable users without degrading recommendation +quality for those who are not affected. To address this issue, our research +delves into understanding user vulnerability. Considering that poisoning +attacks pollute the training data, we note that the higher degree to which a +recommender system fits users' training data correlates with an increased +likelihood of users incorporating attack information, indicating their +vulnerability. Leveraging these insights, we introduce the Vulnerability-aware +Adversarial Training (VAT), designed to defend against poisoning attacks in +recommender systems. VAT employs a novel vulnerability-aware function to +estimate users' vulnerability based on the degree to which the system fits +them. Guided by this estimation, VAT applies perturbations of adaptive +magnitude to each user, not only reducing the success ratio of attacks but also +preserving, and potentially enhancing, the quality of recommendations. +Comprehensive experiments confirm VAT's superior defensive capabilities across +different recommendation models and against various types of attacks. + +
+
+
+
+
+ + ☆ Towards More Relevant Product Search Ranking Via Large Language Models: + An Empirical Study + + +
+ Training Learning-to-Rank models for e-commerce product search ranking can be +challenging due to the lack of a gold standard of ranking relevance. In this +paper, we decompose ranking relevance into content-based and engagement-based +aspects, and we propose to leverage Large Language Models (LLMs) for both label +and feature generation in model training, primarily aiming to improve the +model's predictive capability for content-based relevance. Additionally, we +introduce different sigmoid transformations on the LLM outputs to polarize +relevance scores in labeling, enhancing the model's ability to balance +content-based and engagement-based relevances and thus prioritize highly +relevant items overall. Comprehensive online tests and offline evaluations are +also conducted for the proposed design. Our work sheds light on advanced +strategies for integrating LLMs into e-commerce product search ranking model +training, offering a pathway to more effective and balanced models with +improved ranking relevance. + +
+
+ comment: To be published in CIKM 2024 GenAIECommerce Workshop +
+
+
+
+
+ + ☆ Long or Short or Both? An Exploration on Lookback Time Windows of + Behavioral Features in Product Search Ranking + + +
+ Customer shopping behavioral features are core to product search ranking +models in eCommerce. In this paper, we investigate the effect of lookback time +windows when aggregating these features at the (query, product) level over +history. By studying the pros and cons of using long and short time windows, we +propose a novel approach to integrating these historical behavioral features of +different time windows. In particular, we address the criticality of using +query-level vertical signals in ranking models to effectively aggregate all +information from different behavioral features. Anecdotal evidence for the +proposed approach is also provided using live product search traffic on +Walmart.com. + +
+
+ comment: Published in ACM SIGIR Workshop on eCommerce 2024 +
+
+
+
+
+ + ☆ Minimizing Live Experiments in Recommender Systems: User Simulation to + Evaluate Preference Elicitation Policies + + +
+ Evaluation of policies in recommender systems typically involves A/B testing +using live experiments on real users to assess a new policy's impact on +relevant metrics. This ``gold standard'' comes at a high cost, however, in +terms of cycle time, user cost, and potential user retention. In developing +policies for ``onboarding'' new users, these costs can be especially +problematic, since on-boarding occurs only once. In this work, we describe a +simulation methodology used to augment (and reduce) the use of live +experiments. We illustrate its deployment for the evaluation of ``preference +elicitation'' algorithms used to onboard new users of the YouTube Music +platform. By developing counterfactually robust user behavior models, and a +simulation service that couples such models with production infrastructure, we +are able to test new algorithms in a way that reliably predicts their +performance on key metrics when deployed live. We describe our domain, our +simulation models and platform, results of experiments and deployment, and +suggest future steps needed to further realistic simulation as a powerful +complement to live experiments. + +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing sequences of interactions between users and items, sequential +recommendation models can learn user intent and make predictions about the next +item. Next to item interactions, most systems also have interactions with what +we call non-item pages: these pages are not related to specific items but still +can provide insights of the user's interests, as, for example, navigation +pages. + We therefore propose a general way to include these non-item pages in +sequential recommendation models to enhance next-item prediction. First, we +demonstrate the influence of non-item pages on following interactions with the +hypotheses testing framework HypTrails and propose methods for representing +non-item pages in sequential recommendation models. Subsequently, we adapt +popular sequential recommender models to integrate non-item pages and +investigate their performance with different item representation strategies as +well as their ability to handle noisy data. To show the general capabilities of +the models to integrate non-item pages, we create a synthetic dataset for a +controlled setting and then evaluate the improvements from including non-item +pages on two real-world datasets. + Our results show that non-item pages are a valuable source of information, +and incorporating them in sequential recommendation models increases the +performance of next-item prediction across all analyzed model architectures. + +
+
+ comment: 37 pages, 19 figures; Submitted to ACM TORS +
+
+
+
+
+ + ♻ ☆ CHIQ: Contextual History Enhancement for Improving Query Rewriting in + Conversational Search EMNLP 2024 + + +
+ In this paper, we study how open-source large language models (LLMs) can be +effectively deployed for improving query rewriting in conversational search, +especially for ambiguous queries. We introduce CHIQ, a two-step method that +leverages the capabilities of LLMs to resolve ambiguities in the conversation +history before query rewriting. This approach contrasts with prior studies that +predominantly use closed-source LLMs to directly generate search queries from +conversation history. We demonstrate on five well-established benchmarks that +CHIQ leads to state-of-the-art results across most settings, showing highly +competitive performances with systems leveraging closed-source LLMs. Our study +provides a first step towards leveraging open-source LLMs in conversational +search, as a competitive alternative to the prevailing reliance on commercial +LLMs. Data, models, and source code will be publicly available upon acceptance +at https://github.com/fengranMark/CHIQ. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through + Semantic Comprehension in Retrieval-Augmented Generation Scenarios + + +
+ In Retrieval-Augmented Generation (RAG) tasks using Large Language Models +(LLMs), the quality of retrieved information is critical to the final output. +This paper introduces the IRSC benchmark for evaluating the performance of +embedding models in multilingual RAG tasks. The benchmark encompasses five +retrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval, +keyword retrieval, and summary retrieval. Our research addresses the current +lack of comprehensive testing and effective comparison methods for embedding +models in RAG scenarios. We introduced new metrics: the Similarity of Semantic +Comprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI), +and evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our +contributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and +3) insights into the cross-lingual limitations of embedding models. The IRSC +benchmark aims to enhance the understanding and development of accurate +retrieval systems in RAG tasks. All code and datasets are available at: +https://github.com/Jasaxion/IRSC_Benchmark + +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Multi-Domain CTR Prediction via Large Language + Models + + +
+ Click-Through Rate (CTR) prediction is a crucial task in online +recommendation platforms as it involves estimating the probability of user +engagement with advertisements or items by clicking on them. Given the +availability of various services like online shopping, ride-sharing, food +delivery, and professional services on commercial platforms, recommendation +systems in these platforms are required to make CTR predictions across multiple +domains rather than just a single domain. However, multi-domain click-through +rate (MDCTR) prediction remains a challenging task in online recommendation due +to the complex mutual influence between domains. Traditional MDCTR models +typically encode domains as discrete identifiers, ignoring rich semantic +information underlying. Consequently, they can hardly generalize to new +domains. Besides, existing models can be easily dominated by some specific +domains, which results in significant performance drops in the other domains +(i.e. the "seesaw phenomenon"). In this paper, we propose a novel solution +Uni-CTR to address the above challenges. Uni-CTR leverages a backbone Large +Language Model (LLM) to learn layer-wise semantic representations that capture +commonalities between domains. Uni-CTR also uses several domain-specific +networks to capture the characteristics of each domain. Note that we design a +masked loss strategy so that these domain-specific networks are decoupled from +backbone LLM. This allows domain-specific networks to remain unchanged when +incorporating new or removing domains, thereby enhancing the flexibility and +scalability of the system significantly. Experimental results on three public +datasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models +significantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in +zero-shot prediction. We have applied Uni-CTR in industrial scenarios, +confirming its efficiency. + +
+
+ comment: Accept By ACM TRANSACTIONS ON INFORMATION SYSTEMS(TOIS) +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ MALPOLON: A Framework for Deep Species Distribution Modeling + + +
+ This paper describes a deep-SDM framework, MALPOLON. Written in Python and +built upon the PyTorch library, this framework aims to facilitate training and +inferences of deep species distribution models (deep-SDM) and sharing for users +with only general Python language skills (e.g., modeling ecologists) who are +interested in testing deep learning approaches to build new SDMs. More advanced +users can also benefit from the framework's modularity to run more specific +experiments by overriding existing classes while taking advantage of +press-button examples to train neural networks on multiple classification tasks +using custom or provided raw and pre-processed datasets. The framework is +open-sourced on GitHub and PyPi along with extensive documentation and examples +of use in various scenarios. MALPOLON offers straightforward installation, +YAML-based configuration, parallel computing, multi-GPU utilization, baseline +and foundational models for benchmarking, and extensive +tutorials/documentation, aiming to enhance accessibility and performance +scalability for ecologists and researchers. + +
+
+
+
+
+ + ☆ Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine + Segmentation MICCAI 2024 + + +
+ Self-supervised pretraining (SSP) has shown promising results in learning +from large unlabeled datasets and, thus, could be useful for automated +cardiovascular magnetic resonance (CMR) short-axis cine segmentation. However, +inconsistent reports of the benefits of SSP for segmentation have made it +difficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP +methods for CMR cine segmentation. + To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were +used for unlabeled pretraining with four SSP methods; SimCLR, positional +contrastive learning, DINO, and masked image modeling (MIM). Subsets of varying +numbers of subjects were used for supervised fine-tuning of 2D models for each +SSP method, as well as to train a 2D baseline model from scratch. The +fine-tuned models were compared to the baseline using the 3D Dice similarity +coefficient (DSC) in a test dataset of 140 subjects. + The SSP methods showed no performance gains with the largest supervised +fine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects +(231 2D slices) are available for supervised training, SSP using MIM (DSC = +0.86) improves over training from scratch (DSC = 0.82). + This study found that SSP is valuable for CMR cine segmentation when labeled +training data is scarce, but does not aid state-of-the-art deep learning +methods when ample labeled data is available. Moreover, the choice of SSP +method is important. The code is publicly available at: +https://github.com/q-cardIA/ssp-cmr-cine-segmentation + +
+
+ comment: Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at + MICCAI 2024 +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ Optimal Protocols for Continual Learning via Statistical Physics and + Control Theory + + +
+ Artificial neural networks often struggle with catastrophic forgetting when +learning multiple tasks sequentially, as training on new tasks degrades the +performance on previously learned ones. Recent theoretical work has addressed +this issue by analysing learning curves in synthetic frameworks under +predefined training protocols. However, these protocols relied on heuristics +and lacked a solid theoretical foundation assessing their optimality. In this +paper, we fill this gap combining exact equations for training dynamics, +derived using statistical physics techniques, with optimal control methods. We +apply this approach to teacher-student models for continual learning and +multi-task problems, obtaining a theory for task-selection protocols maximising +performance while minimising forgetting. Our theoretical analysis offers +non-trivial yet interpretable strategies for mitigating catastrophic +forgetting, shedding light on how optimal learning protocols can modulate +established effects, such as the influence of task similarity on forgetting. +Finally, we validate our theoretical findings on real-world data. + +
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ☆ Inverse Reinforcement Learning with Multiple Planning Horizons + + +
+ In this work, we study an inverse reinforcement learning (IRL) problem where +the experts are planning under a shared reward function but with different, +unknown planning horizons. Without the knowledge of discount factors, the +reward function has a larger feasible solution set, which makes it harder for +existing IRL approaches to identify a reward function. To overcome this +challenge, we develop algorithms that can learn a global multi-agent reward +function with agent-specific discount factors that reconstruct the expert +policies. We characterize the feasible solution space of the reward function +and discount factors for both algorithms and demonstrate the generalizability +of the learned reward function across multiple domains. + +
+
+ comment: Accepted at RLC 2024 +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ FlowBench: A Large Scale Benchmark for Flow Simulation over Complex + Geometries + + +
+ Simulating fluid flow around arbitrary shapes is key to solving various +engineering problems. However, simulating flow physics across complex +geometries remains numerically challenging and computationally +resource-intensive, particularly when using conventional PDE solvers. Machine +learning methods offer attractive opportunities to create fast and adaptable +PDE solvers. However, benchmark datasets to measure the performance of such +methods are scarce, especially for flow physics across complex geometries. We +introduce FlowBench, a dataset for neural simulators with over 10K samples, +which is currently larger than any publicly available flow physics dataset. +FlowBench contains flow simulation data across complex geometries +(\textit{parametric vs. non-parametric}), spanning a range of flow conditions +(\textit{Reynolds number and Grashoff number}), capturing a diverse array of +flow phenomena (\textit{steady vs. transient; forced vs. free convection}), and +for both 2D and 3D. FlowBench contains over 10K data samples, with each sample +the outcome of a fully resolved, direct numerical simulation using a +well-validated simulator framework designed for modeling transport phenomena in +complex geometries. For each sample, we include velocity, pressure, and +temperature field data at 3 different resolutions and several summary +statistics features of engineering relevance (such as coefficients of lift and +drag, and Nusselt numbers). %Additionally, we include masks and signed distance +fields for each shape. We envision that FlowBench will enable evaluating the +interplay between complex geometry, coupled flow phenomena, and data +sufficiency on the performance of current, and future, neural PDE solvers. We +enumerate several evaluation metrics to help rank order the performance of +neural PDE solvers. We benchmark the performance of several baseline methods +including FNO, CNO, WNO, and DeepONet. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ Spatiotemporal Learning on Cell-embedded Graphs + + +
+ Data-driven simulation of physical systems has recently kindled significant +attention, where many neural models have been developed. In particular, +mesh-based graph neural networks (GNNs) have demonstrated significant potential +in predicting spatiotemporal dynamics across arbitrary geometric domains. +However, the existing node-edge message passing mechanism in GNNs limits the +model's representation learning ability. In this paper, we proposed a +cell-embedded GNN model (aka CeGNN) to learn spatiotemporal dynamics with +lifted performance. Specifically, we introduce a learnable cell attribution to +the node-edge message passing process, which better captures the spatial +dependency of regional features. Such a strategy essentially upgrades the local +aggregation scheme from the first order (e.g., from edge to node) to a higher +order (e.g., from volume to edge and then to node), which takes advantage of +volumetric information in message passing. Meanwhile, a novel feature-enhanced +block is designed to further improve the performance of CeGNN and relieve the +over-smoothness problem, via treating the latent features as basis functions. +The extensive experiments on various PDE systems and one real-world dataset +demonstrate that CeGNN achieves superior performance compared with other +baseline models, particularly reducing the prediction error with up to 1 orders +of magnitude on several PDE systems. + +
+
+
+
+
+ + ☆ Safe Time-Varying Optimization based on Gaussian Processes with + Spatio-Temporal Kernel NeurIPS 2024 + + +
+ Ensuring safety is a key aspect in sequential decision making problems, such +as robotics or process control. The complexity of the underlying systems often +makes finding the optimal decision challenging, especially when the +safety-critical system is time-varying. Overcoming the problem of optimizing an +unknown time-varying reward subject to unknown time-varying safety constraints, +we propose TVSafeOpt, a new algorithm built on Bayesian optimization with a +spatio-temporal kernel. The algorithm is capable of safely tracking a +time-varying safe region without the need for explicit change detection. +Optimality guarantees are also provided for the algorithm when the optimization +problem becomes stationary. We show that TVSafeOpt compares favorably against +SafeOpt on synthetic data, both regarding safety and optimality. Evaluation on +a realistic case study with gas compressors confirms that TVSafeOpt ensures +safety when solving time-varying optimization problems with unknown reward and +safety functions. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless + Imaging NeurIPS 2024 + + +
+ Lensless cameras offer significant advantages in size, weight, and cost +compared to traditional lens-based systems. Without a focusing lens, lensless +cameras rely on computational algorithms to recover the scenes from multiplexed +measurements. However, current algorithms struggle with inaccurate forward +imaging models and insufficient priors to reconstruct high-quality images. To +overcome these limitations, we introduce a novel two-stage approach for +consistent and photorealistic lensless image reconstruction. The first stage of +our approach ensures data consistency by focusing on accurately reconstructing +the low-frequency content with a spatially varying deconvolution method that +adjusts to changes in the Point Spread Function (PSF) across the camera's field +of view. The second stage enhances photorealism by incorporating a generative +prior from pre-trained diffusion models. By conditioning on the low-frequency +content retrieved in the first stage, the diffusion model effectively +reconstructs the high-frequency details that are typically lost in the lensless +imaging process, while also maintaining image fidelity. Our method achieves a +superior balance between data fidelity and visual quality compared to existing +methods, as demonstrated with two popular lensless systems, PhlatCam and +DiffuserCam. Project website: https://phocolens.github.io/. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Joint Localization and Planning using Diffusion ICRA 2025 + + +
+ Diffusion models have been successfully applied to robotics problems such as +manipulation and vehicle path planning. In this work, we explore their +application to end-to-end navigation -- including both perception and planning +-- by considering the problem of jointly performing global localization and +path planning in known but arbitrary 2D environments. In particular, we +introduce a diffusion model which produces collision-free paths in a global +reference frame given an egocentric LIDAR scan, an arbitrary map, and a desired +goal position. To this end, we implement diffusion in the space of paths in +SE(2), and describe how to condition the denoising process on both obstacles +and sensor observations. In our evaluation, we show that the proposed +conditioning techniques enable generalization to realistic maps of considerably +different appearance than the training environment, demonstrate our model's +ability to accurately describe ambiguous solutions, and run extensive +simulation experiments showcasing our model's use as a real-time, end-to-end +localization and planning stack. + +
+
+ comment: 7 pages, 9 figures. Submitted to ICRA 2025, under review +
+
+
+
+
+ + ☆ LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged + Robots + + +
+ Reinforcement Learning (RL) has shown its remarkable and generalizable +capability in legged locomotion through sim-to-real transfer. However, while +adaptive methods like domain randomization are expected to make policy more +robust to diverse environments, such comprehensiveness potentially detracts +from the policy's performance in any specific environment according to the No +Free Lunch theorem, leading to a suboptimal solution once deployed in the real +world. To address this issue, we propose a lifelong policy adaptation framework +named LoopSR, which utilizes a transformer-based encoder to project real-world +trajectories into a latent space, and accordingly reconstruct the real-world +environments back in simulation for further improvement. Autoencoder +architecture and contrastive learning methods are adopted to better extract the +characteristics of real-world dynamics. The simulation parameters for continual +training are derived by combining predicted parameters from the decoder with +retrieved parameters from the simulation trajectory dataset. By leveraging the +continual training, LoopSR achieves superior data efficiency compared with +strong baselines, with only a limited amount of data to yield eminent +performance in both sim-to-sim and sim-to-real experiments. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Dimension-independent learning rates for high-dimensional classification + problems + + +
+ We study the problem of approximating and estimating classification functions +that have their decision boundary in the $RBV^2$ space. Functions of $RBV^2$ +type arise naturally as solutions of regularized neural network learning +problems and neural networks can approximate these functions without the curse +of dimensionality. We modify existing results to show that every $RBV^2$ +function can be approximated by a neural network with bounded weights. +Thereafter, we prove the existence of a neural network with bounded weights +approximating a classification function. And we leverage these bounds to +quantify the estimation rates. Finally, we present a numerical study that +analyzes the effect of different regularity conditions on the decision +boundaries. + +
+
+
+
+
+ + ☆ Supra-Laplacian Encoding for Transformer on Dynamic Graphs + + +
+ Fully connected Graph Transformers (GT) have rapidly become prominent in the +static graph community as an alternative to Message-Passing models, which +suffer from a lack of expressivity, oversquashing, and under-reaching. However, +in a dynamic context, by interconnecting all nodes at multiple snapshots with +self-attention, GT loose both structural and temporal information. In this +work, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs +(SLATE), a new spatio-temporal encoding to leverage the GT architecture while +keeping spatio-temporal information. Specifically, we transform Discrete Time +Dynamic Graphs into multi-layer graphs and take advantage of the spectral +properties of their associated supra-Laplacian matrix. Our second contribution +explicitly model nodes' pairwise relationships with a cross-attention +mechanism, providing an accurate edge representation for dynamic link +prediction. SLATE outperforms numerous state-of-the-art methods based on +Message-Passing Graph Neural Networks combined with recurrent models (e.g +LSTM), and Dynamic Graph Transformers, on 9 datasets. Code and instructions to +reproduce our results will be open-sourced. + +
+
+
+
+
+ + ☆ Hypergame Theory for Decentralized Resource Allocation in Multi-user + Semantic Communications + + +
+ Semantic communications (SC) is an emerging communication paradigm in which +wireless devices can send only relevant information from a source of data while +relying on computing resources to regenerate missing data points. However, the +design of a multi-user SC system becomes more challenging because of the +computing and communication overhead required for coordination. Existing +solutions for learning the semantic language and performing resource allocation +often fail to capture the computing and communication tradeoffs involved in +multiuser SC. To address this gap, a novel framework for decentralized +computing and communication resource allocation in multiuser SC systems is +proposed. The challenge of efficiently allocating communication and computing +resources (for reasoning) in a decentralized manner to maximize the quality of +task experience for the end users is addressed through the application of +Stackelberg hyper game theory. Leveraging the concept of second-level hyper +games, novel analytical formulations are developed to model misperceptions of +the users about each other's communication and control strategies. Further, +equilibrium analysis of the learned resource allocation protocols examines the +convergence of the computing and communication strategies to a local +Stackelberg equilibria, considering misperceptions. Simulation results show +that the proposed Stackelberg hyper game results in efficient usage of +communication and computing resources while maintaining a high quality of +experience for the users compared to state-of-the-art that does not account for +the misperceptions. + +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and + Adaptive Disambiguate based Efficient Tree Search + + +
+ Large Language Models (LLMs) have exhibited exceptional performance across a +broad range of tasks and domains. However, they still encounter difficulties in +solving mathematical problems due to the rigorous and logical nature of +mathematics. Previous studies have employed techniques such as supervised +fine-tuning (SFT), prompt engineering, and search-based methods to improve the +mathematical problem-solving abilities of LLMs. Despite these efforts, their +performance remains suboptimal and demands substantial computational resources. +To address this issue, we propose a novel approach, BEATS, to enhance +mathematical problem-solving abilities. Our method leverages newly designed +prompts that guide the model to iteratively rewrite, advance by one step, and +generate answers based on previous steps. Additionally, we introduce a new +back-verification technique that uses LLMs to validate the correctness of the +generated answers. Furthermore, we employ a pruning tree search to optimize +search time while achieving strong performance. Notably, our method improves +Qwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the +MATH benchmark. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Adaptive Stream Processing on Edge Devices through Active Inference + + +
+ The current scenario of IoT is witnessing a constant increase on the volume +of data, which is generated in constant stream, calling for novel architectural +and logical solutions for processing it. Moving the data handling towards the +edge of the computing spectrum guarantees better distribution of load and, in +principle, lower latency and better privacy. However, managing such a structure +is complex, especially when requirements, also referred to Service Level +Objectives (SLOs), specified by applications' owners and infrastructure +managers need to be ensured. Despite the rich number of proposals of Machine +Learning (ML) based management solutions, researchers and practitioners yet +struggle to guarantee long-term prediction and control, and accurate +troubleshooting. Therefore, we present a novel ML paradigm based on Active +Inference (AIF) -- a concept from neuroscience that describes how the brain +constantly predicts and evaluates sensory information to decrease long-term +surprise. We implement it and evaluate it in a heterogeneous real stream +processing use case, where an AIF-based agent continuously optimizes the +fulfillment of three SLOs for three autonomous driving services running on +multiple devices. The agent used causal knowledge to gradually develop an +understanding of how its actions are related to requirements fulfillment, and +which configurations to favor. Through this approach, our agent requires up to +thirty iterations to converge to the optimal solution, showing the capability +of offering accurate results in a short amount of time. Furthermore, thanks to +AIF and its causal structures, our method guarantees full transparency on the +decision making, making the interpretation of the results and the +troubleshooting effortless. + +
+
+
+
+
+ + ☆ Sample compression unleashed : New generalization bounds for real valued + losses + + +
+ The sample compression theory provides generalization guarantees for +predictors that can be fully defined using a subset of the training dataset and +a (short) message string, generally defined as a binary sequence. Previous +works provided generalization bounds for the zero-one loss, which is +restrictive, notably when applied to deep learning approaches. In this paper, +we present a general framework for deriving new sample compression bounds that +hold for real-valued losses. We empirically demonstrate the tightness of the +bounds and their versatility by evaluating them on different types of models, +e.g., neural networks and decision forests, trained with the Pick-To-Learn +(P2L) meta-algorithm, which transforms the training method of any +machine-learning predictor to yield sample-compressed predictors. In contrast +to existing P2L bounds, ours are valid in the non-consistent case. + +
+
+
+
+
+ + ☆ Intelligent Energy Management: Remaining Useful Life Prediction and + Charging Automation System Comprised of Deep Learning and the Internet of + Things + + +
+ Remaining Useful Life (RUL) of battery is an important parameter to know the +battery's remaining life and need for recharge. The goal of this research +project is to develop machine learning-based models for the battery RUL +dataset. Different ML models are developed to classify the RUL of the vehicle, +and the IoT (Internet of Things) concept is simulated for automating the +charging system and managing any faults aligning. The graphs plotted depict the +relationship between various vehicle parameters using the Blynk IoT platform. +Results show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent +Unit (GRU), and hybrid model developed could classify RUL into three classes +with 99% more accuracy. The data is fed using the tkinter GUI for simulating +artificial intelligence (AI)-based charging, and with a pyserial backend, data +can be entered into the Esp-32 microcontroller for making charge discharge +possible with the model's predictions. Also, with an IoT system, the charging +can be disconnected, monitored, and analyzed for automation. The results show +that an accuracy of 99% can be obtained on models MLP, catboost model and +similar accuracy on GRU model can be obtained, and finally relay-based +triggering can be made by prediction through the model used for automating the +charging and energy-saving mechanism. By showcasing an exemplary Blynk +platform-based monitoring and automation phenomenon, we further present +innovative ways of monitoring parameters and automating the system. + +
+
+
+
+
+ + ☆ Graph Reasoning with Large Language Models via Pseudo-code Prompting + + +
+ Large language models (LLMs) have recently achieved remarkable success in +various reasoning tasks in the field of natural language processing. This +success of LLMs has also motivated their use in graph-related tasks. Among +others, recent work has explored whether LLMs can solve graph problems such as +counting the number of connected components of a graph or computing the +shortest path distance between two nodes. Although LLMs possess preliminary +graph reasoning abilities, they might still struggle to solve some seemingly +simple problems. In this paper, we investigate whether prompting via +pseudo-code instructions can improve the performance of LLMs in solving graph +problems. Our experiments demonstrate that using pseudo-code instructions +generally improves the performance of all considered LLMs. The graphs, +pseudo-code prompts, and evaluation code are publicly available. + +
+
+
+
+
+ + ☆ Designing Short-Stage CDC-XPUFs: Balancing Reliability, Cost, and + Security in IoT Devices + + +
+ The rapid expansion of Internet of Things (IoT) devices demands robust and +resource-efficient security solutions. Physically Unclonable Functions (PUFs), +which generate unique cryptographic keys from inherent hardware variations, +offer a promising approach. However, traditional PUFs like Arbiter PUFs (APUFs) +and XOR Arbiter PUFs (XOR-PUFs) are susceptible to machine learning (ML) and +reliability-based attacks. In this study, we investigate +Component-Differentially Challenged XOR-PUFs (CDC-XPUFs), a less explored +variant, to address these vulnerabilities. We propose an optimized CDC-XPUF +design that incorporates a pre-selection strategy to enhance reliability and +introduces a novel lightweight architecture to reduce hardware overhead. +Rigorous testing demonstrates that our design significantly lowers resource +consumption, maintains strong resistance to ML attacks, and improves +reliability, effectively mitigating reliability-based attacks. These results +highlight the potential of CDC-XPUFs as a secure and efficient candidate for +widespread deployment in resource-constrained IoT systems. + +
+
+
+
+
+ + ☆ Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV + Attitude Control Under Varying Wind Conditions + + +
+ This paper evaluates and compares the performance of model-free and +model-based reinforcement learning for the attitude control of fixed-wing +unmanned aerial vehicles using PID as a reference point. The comparison focuses +on their ability to handle varying flight dynamics and wind disturbances in a +simulated environment. Our results show that the Temporal Difference Model +Predictive Control agent outperforms both the PID controller and other +model-free reinforcement learning methods in terms of tracking accuracy and +robustness over different reference difficulties, particularly in nonlinear +flight regimes. Furthermore, we introduce actuation fluctuation as a key metric +to assess energy efficiency and actuator wear, and we test two different +approaches from the literature: action variation penalty and conditioning for +action policy smoothness. We also evaluate all control methods when subject to +stochastic turbulence and gusts separately, so as to measure their effects on +tracking performance, observe their limitations and outline their implications +on the Markov decision process formalism. + +
+
+ comment: Published at ICINCO 2024 +
+
+
+
+
+ + ☆ A multi-source data power load forecasting method using attention + mechanism-based parallel cnn-gru + + +
+ Accurate power load forecasting is crucial for improving energy efficiency +and ensuring power supply quality. Considering the power load forecasting +problem involves not only dynamic factors like historical load variations but +also static factors such as climate conditions that remain constant over +specific periods. From the model-agnostic perspective, this paper proposes a +parallel structure network to extract important information from both dynamic +and static data. Firstly, based on complexity learning theory, it is +demonstrated that models integrated through parallel structures exhibit +superior generalization abilities compared to individual base learners. +Additionally, the higher the independence between base learners, the stronger +the generalization ability of the parallel structure model. This suggests that +the structure of machine learning models inherently contains significant +information. Building on this theoretical foundation, a parallel convolutional +neural network (CNN)-gate recurrent unit (GRU) attention model (PCGA) is +employed to address the power load forecasting issue, aiming to effectively +integrate the influences of dynamic and static features. The CNN module is +responsible for capturing spatial characteristics from static data, while the +GRU module captures long-term dependencies in dynamic time series data. The +attention layer is designed to focus on key information from the +spatial-temporal features extracted by the parallel CNN-GRU. To substantiate +the advantages of the parallel structure model in extracting and integrating +multi-source information, a series of experiments are conducted. + +
+
+
+
+
+ + ☆ A method for identifying causality in the response of nonlinear + dynamical systems + + +
+ Predicting the response of nonlinear dynamical systems subject to random, +broadband excitation is important across a range of scientific disciplines, +such as structural dynamics and neuroscience. Building data-driven models +requires experimental measurements of the system input and output, but it can +be difficult to determine whether inaccuracies in the model stem from modelling +errors or noise. This paper presents a novel method to identify the causal +component of the input-output data from measurements of a system in the +presence of output noise, as a function of frequency, without needing a high +fidelity model. An output prediction, calculated using an available model, is +optimally combined with noisy measurements of the output to predict the input +to the system. The parameters of the algorithm balance the two output signals +and are utilised to calculate a nonlinear coherence metric as a measure of +causality. This method is applicable to a broad class of nonlinear dynamical +systems. There are currently no solutions to this problem in the absence of a +complete benchmark model. + +
+
+
+
+
+ + ☆ Efficient Arbitrary Precision Acceleration for Large Language Models on + GPU Tensor Cores + + +
+ Large language models (LLMs) have been widely applied but face challenges in +efficient inference. While quantization methods reduce computational demands, +ultra-low bit quantization with arbitrary precision is hindered by limited GPU +Tensor Core support and inefficient memory management, leading to suboptimal +acceleration. To address these challenges, we propose a comprehensive +acceleration scheme for arbitrary precision LLMs. At its core, we introduce a +novel bipolar-INT data format that facilitates parallel computing and supports +symmetric quantization, effectively reducing data redundancy. Building on this, +we implement an arbitrary precision matrix multiplication scheme that +decomposes and recovers matrices at the bit level, enabling flexible precision +while maximizing GPU Tensor Core utilization. Furthermore, we develop an +efficient matrix preprocessing method that optimizes data layout for subsequent +computations. Finally, we design a data recovery-oriented memory management +system that strategically utilizes fast shared memory, significantly enhancing +kernel execution speed and minimizing memory access latency. Experimental +results demonstrate our approach's effectiveness, with up to 13\times speedup +in matrix multiplication compared to NVIDIA's CUTLASS. When integrated into +LLMs, we achieve up to 6.7\times inference acceleration. These improvements +significantly enhance LLM inference efficiency, enabling broader and more +responsive applications of LLMs. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ How Feature Learning Can Improve Neural Scaling Laws + + +
+ We develop a solvable model of neural scaling laws beyond the kernel limit. +Theoretical analysis of this model shows how performance scales with model +size, training time, and the total amount of available data. We identify three +scaling regimes corresponding to varying task difficulties: hard, easy, and +super easy tasks. For easy and super-easy target functions, which lie in the +reproducing kernel Hilbert space (RKHS) defined by the initial infinite-width +Neural Tangent Kernel (NTK), the scaling exponents remain unchanged between +feature learning and kernel regime models. For hard tasks, defined as those +outside the RKHS of the initial NTK, we demonstrate both analytically and +empirically that feature learning can improve scaling with training time and +compute, nearly doubling the exponent for hard tasks. This leads to a different +compute optimal strategy to scale parameters and training time in the feature +learning regime. We support our finding that feature learning improves the +scaling law for hard tasks but not for easy and super-easy tasks with +experiments of nonlinear MLPs fitting functions with power-law Fourier spectra +on the circle and CNNs learning vision tasks. + +
+
+
+
+
+ + ☆ AMARO: All Heavy-Atom Transferable Neural Network Potentials of Protein + Thermodynamics + + +
+ All-atom molecular simulations offer detailed insights into macromolecular +phenomena, but their substantial computational cost hinders the exploration of +complex biological processes. We introduce Advanced Machine-learning Atomic +Representation Omni-force-field (AMARO), a new neural network potential (NNP) +that combines an O(3)-equivariant message-passing neural network architecture, +TensorNet, with a coarse-graining map that excludes hydrogen atoms. AMARO +demonstrates the feasibility of training coarser NNP, without prior energy +terms, to run stable protein dynamics with scalability and generalization +capabilities. + +
+
+
+
+
+ + ☆ Machine Learning-based vs Deep Learning-based Anomaly Detection in + Multivariate Time Series for Spacecraft Attitude Sensors + + +
+ In the framework of Failure Detection, Isolation and Recovery (FDIR) on +spacecraft, new AI-based approaches are emerging in the state of the art to +overcome the limitations commonly imposed by traditional threshold checking. + The present research aims at characterizing two different approaches to the +problem of stuck values detection in multivariate time series coming from +spacecraft attitude sensors. The analysis reveals the performance differences +in the two approaches, while commenting on their interpretability and +generalization to different scenarios. + +
+
+ comment: Accepted for the ESA SPAICE Conference 2024 +
+
+
+
+
+ + ☆ Language Models as Zero-shot Lossless Gradient Compressors: Towards + General Neural Parameter Prior Models NeurIPS 2024 + + +
+ Despite the widespread use of statistical prior models in various fields, +such models for neural network gradients have long been overlooked. The +inherent challenge stems from their high-dimensional structures and complex +interdependencies, which complicate effective modeling. In this work, we +demonstrate the potential of large language models (LLMs) to act as gradient +priors in a zero-shot setting. We examine the property by considering lossless +gradient compression -- a critical application in distributed learning -- that +depends heavily on precise probability modeling. To achieve this, we introduce +LM-GC, a novel method that integrates LLMs with arithmetic coding. Our +technique converts plain gradients into text-like formats, enhancing token +efficiency by up to 38 times compared to their plain representations. We ensure +that this data conversion maintains a close alignment with the structure of +plain gradients and the symbols commonly recognized by LLMs. Our experiments +indicate that LM-GC surpasses existing state-of-the-art lossless compression +methods, improving compression rates by 10\% up to 17.2\% across various +datasets and architectures. Additionally, our approach shows promising +compatibility with lossy compression techniques such as quantization and +sparsification. These findings highlight the significant potential of LLMs as a +model for effectively handling gradients. We will release the source code upon +publication. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ Ordinary Differential Equations for Enhanced 12-Lead ECG Generation + + +
+ In the realm of artificial intelligence, the generation of realistic training +data for supervised learning tasks presents a significant challenge. This is +particularly true in the synthesis of electrocardiograms (ECGs), where the +objective is to develop a synthetic 12-lead ECG model. The primary complexity +of this task stems from accurately modeling the intricate biological and +physiological interactions among different ECG leads. Although mathematical +process simulators have shed light on these dynamics, effectively incorporating +this understanding into generative models is not straightforward. In this work, +we introduce an innovative method that employs ordinary differential equations +(ODEs) to enhance the fidelity of generating 12-lead ECG data. This approach +integrates a system of ODEs that represent cardiac dynamics directly into the +generative model's optimization process, allowing for the production of +biologically plausible ECG training data that authentically reflects real-world +variability and inter-lead dependencies. We conducted an empirical analysis of +thousands of ECGs and found that incorporating cardiac simulation insights into +the data generation process significantly improves the accuracy of heart +abnormality classifiers trained on this synthetic 12-lead ECG data. + +
+
+
+
+
+ + ☆ Physics-aligned Schrödinger bridge + + +
+ The reconstruction of physical fields from sparse measurements is pivotal in +both scientific research and engineering applications. Traditional methods are +increasingly supplemented by deep learning models due to their efficacy in +extracting features from data. However, except for the low accuracy on complex +physical systems, these models often fail to comply with essential physical +constraints, such as governing equations and boundary conditions. To overcome +this limitation, we introduce a novel data-driven field reconstruction +framework, termed the Physics-aligned Schr\"{o}dinger Bridge (PalSB). This +framework leverages a diffusion Schr\"{o}dinger bridge mechanism that is +specifically tailored to align with physical constraints. The PalSB approach +incorporates a dual-stage training process designed to address both local +reconstruction mapping and global physical principles. Additionally, a +boundary-aware sampling technique is implemented to ensure adherence to +physical boundary conditions. We demonstrate the effectiveness of PalSB through +its application to three complex nonlinear systems: cylinder flow from Particle +Image Velocimetry experiments, two-dimensional turbulence, and a +reaction-diffusion system. The results reveal that PalSB not only achieves +higher accuracy but also exhibits enhanced compliance with physical constraints +compared to existing methods. This highlights PalSB's capability to generate +high-quality representations of intricate physical interactions, showcasing its +potential for advancing field reconstruction techniques. + +
+
+
+
+
+ + ☆ Generative Modeling of Molecular Dynamics Trajectories NeurIPS 2024 + + +
+ Molecular dynamics (MD) is a powerful technique for studying microscopic +phenomena, but its computational cost has driven significant interest in the +development of deep learning-based surrogate models. We introduce generative +modeling of molecular trajectories as a paradigm for learning flexible +multi-task surrogate models of MD from data. By conditioning on appropriately +chosen frames of the trajectory, we show such generative models can be adapted +to diverse tasks such as forward simulation, transition path sampling, and +trajectory upsampling. By alternatively conditioning on part of the molecular +system and inpainting the rest, we also demonstrate the first steps towards +dynamics-conditioned molecular design. We validate the full set of these +capabilities on tetrapeptide simulations and show that our model can produce +reasonable ensembles of protein monomers. Altogether, our work illustrates how +generative modeling can unlock value from MD data towards diverse downstream +tasks that are not straightforward to address with existing methods or even MD +itself. Code is available at https://github.com/bjing2016/mdgen. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Continual learning with task specialist + + +
+ Continual learning (CL) adapt the deep learning scenarios with timely updated +datasets. However, existing CL models suffer from the catastrophic forgetting +issue, where new knowledge replaces past learning. In this paper, we propose +Continual Learning with Task Specialists (CLTS) to address the issues of +catastrophic forgetting and limited labelled data in real-world datasets by +performing class incremental learning of the incoming stream of data. The model +consists of Task Specialists (T S) and Task Predictor (T P ) with pre-trained +Stable Diffusion (SD) module. Here, we introduce a new specialist to handle a +new task sequence and each T S has three blocks; i) a variational autoencoder +(V AE) to learn the task distribution in a low dimensional latent space, ii) a +K-Means block to perform data clustering and iii) Bootstrapping Language-Image +Pre-training (BLIP ) model to generate a small batch of captions from the input +data. These captions are fed as input to the pre-trained stable diffusion model +(SD) for the generation of task samples. The proposed model does not store any +task samples for replay, instead uses generated samples from SD to train the T +P module. A comparison study with four SOTA models conducted on three +real-world datasets shows that the proposed model outperforms all the selected +baselines + +
+
+
+
+
+ + ☆ Enriched Functional Tree-Based Classifiers: A Novel Approach Leveraging + Derivatives and Geometric Features + + +
+ The positioning of this research falls within the scalar-on-function +classification literature, a field of significant interest across various +domains, particularly in statistics, mathematics, and computer science. This +study introduces an advanced methodology for supervised classification by +integrating Functional Data Analysis (FDA) with tree-based ensemble techniques +for classifying high-dimensional time series. The proposed framework, Enriched +Functional Tree-Based Classifiers (EFTCs), leverages derivative and geometric +features, benefiting from the diversity inherent in ensemble methods to further +enhance predictive performance and reduce variance. While our approach has been +tested on the enrichment of Functional Classification Trees (FCTs), Functional +K-NN (FKNN), Functional Random Forest (FRF), Functional XGBoost (FXGB), and +Functional LightGBM (FLGBM), it could be extended to other tree-based and +non-tree-based classifiers, with appropriate considerations emerging from this +investigation. Through extensive experimental evaluations on seven real-world +datasets and six simulated scenarios, this proposal demonstrates fascinating +improvements over traditional approaches, providing new insights into the +application of FDA in complex, high-dimensional learning problems. + +
+
+
+
+
+ + ☆ CASPFormer: Trajectory Prediction from BEV Images with Deformable + Attention ICPR 2024 + + +
+ Motion prediction is an important aspect for Autonomous Driving (AD) and +Advance Driver Assistance Systems (ADAS). Current state-of-the-art motion +prediction methods rely on High Definition (HD) maps for capturing the +surrounding context of the ego vehicle. Such systems lack scalability in +real-world deployment as HD maps are expensive to produce and update in +real-time. To overcome this issue, we propose Context Aware Scene Prediction +Transformer (CASPFormer), which can perform multi-modal motion prediction from +rasterized Bird-Eye-View (BEV) images. Our system can be integrated with any +upstream perception module that is capable of generating BEV images. Moreover, +CASPFormer directly decodes vectorized trajectories without any postprocessing. +Trajectories are decoded recurrently using deformable attention, as it is +computationally efficient and provides the network with the ability to focus +its attention on the important spatial locations of the BEV images. In +addition, we also address the issue of mode collapse for generating multiple +scene-consistent trajectories by incorporating learnable mode queries. We +evaluate our model on the nuScenes dataset and show that it reaches +state-of-the-art across multiple metrics + +
+
+ comment: Under Review at ICPR 2024, Kolkata +
+
+
+
+
+ + ☆ Predicting the Stay Length of Patients in Hospitals using Convolutional + Gated Recurrent Deep Learning Model + + +
+ Predicting hospital length of stay (LoS) stands as a critical factor in +shaping public health strategies. This data serves as a cornerstone for +governments to discern trends, patterns, and avenues for enhancing healthcare +delivery. In this study, we introduce a robust hybrid deep learning model, a +combination of Multi-layer Convolutional (CNNs) deep learning, Gated Recurrent +Units (GRU), and Dense neural networks, that outperforms 11 conventional and +state-of-the-art Machine Learning (ML) and Deep Learning (DL) methodologies in +accurately forecasting inpatient hospital stay duration. Our investigation +delves into the implementation of this hybrid model, scrutinising variables +like geographic indicators tied to caregiving institutions, demographic markers +encompassing patient ethnicity, race, and age, as well as medical attributes +such as the CCS diagnosis code, APR DRG code, illness severity metrics, and +hospital stay duration. Statistical evaluations reveal the pinnacle LoS +accuracy achieved by our proposed model (CNN-GRU-DNN), which averages at 89% +across a 10-fold cross-validation test, surpassing LSTM, BiLSTM, GRU, and +Convolutional Neural Networks (CNNs) by 19%, 18.2%, 18.6%, and 7%, +respectively. Accurate LoS predictions not only empower hospitals to optimise +resource allocation and curb expenses associated with prolonged stays but also +pave the way for novel strategies in hospital stay management. This avenue +holds promise for catalysing advancements in healthcare research and +innovation, inspiring a new era of precision-driven healthcare practices. + +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ Byzantine-Robust Aggregation for Securing Decentralized Federated + Learning + + +
+ Federated Learning (FL) emerges as a distributed machine learning approach +that addresses privacy concerns by training AI models locally on devices. +Decentralized Federated Learning (DFL) extends the FL paradigm by eliminating +the central server, thereby enhancing scalability and robustness through the +avoidance of a single point of failure. However, DFL faces significant +challenges in optimizing security, as most Byzantine-robust algorithms proposed +in the literature are designed for centralized scenarios. In this paper, we +present a novel Byzantine-robust aggregation algorithm to enhance the security +of Decentralized Federated Learning environments, coined WFAgg. This proposal +handles the adverse conditions and strength robustness of dynamic decentralized +topologies at the same time by employing multiple filters to identify and +mitigate Byzantine attacks. Experimental results demonstrate the effectiveness +of the proposed algorithm in maintaining model accuracy and convergence in the +presence of various Byzantine attack scenarios, outperforming state-of-the-art +centralized Byzantine-robust aggregation schemes (such as Multi-Krum or +Clustering). These algorithms are evaluated on an IID image classification +problem in both centralized and decentralized scenarios. + +
+
+ comment: 18 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Autoregressive Generation Strategies for Top-K Sequential + Recommendations + + +
+ The goal of modern sequential recommender systems is often formulated in +terms of next-item prediction. In this paper, we explore the applicability of +generative transformer-based models for the Top-K sequential recommendation +task, where the goal is to predict items a user is likely to interact with in +the "near future". + We explore commonly used autoregressive generation strategies, including +greedy decoding, beam search, and temperature sampling, to evaluate their +performance for the Top-K sequential recommendation task. In addition, we +propose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA) +generation strategies based on multi-sequence generation with temperature +sampling and subsequent aggregation. + Experiments on diverse datasets give valuable insights regarding commonly +used strategies' applicability and show that suggested approaches improve +performance on longer time horizons compared to widely-used Top-K prediction +approach and single-sequence autoregressive generation strategies. + +
+
+
+
+
+ + ☆ Recent advances in interpretable machine learning using structure-based + protein representations + + +
+ Recent advancements in machine learning (ML) are transforming the field of +structural biology. For example, AlphaFold, a groundbreaking neural network for +protein structure prediction, has been widely adopted by researchers. The +availability of easy-to-use interfaces and interpretable outcomes from the +neural network architecture, such as the confidence scores used to color the +predicted structures, have made AlphaFold accessible even to non-ML experts. In +this paper, we present various methods for representing protein 3D structures +from low- to high-resolution, and show how interpretable ML methods can support +tasks such as predicting protein structures, protein function, and +protein-protein interactions. This survey also emphasizes the significance of +interpreting and visualizing ML-based inference for structure-based protein +representations that enhance interpretability and knowledge discovery. +Developing such interpretable approaches promises to further accelerate fields +including drug development and protein design. + +
+
+
+
+
+ + ☆ QuForge: A Library for Qudits Simulation + + +
+ Quantum computing with qudits, an extension of qubits to multiple levels, is +a research field less mature than qubit-based quantum computing. However, +qudits can offer some advantages over qubits, by representing information with +fewer separated components. In this article, we present QuForge, a Python-based +library designed to simulate quantum circuits with qudits. This library +provides the necessary quantum gates for implementing quantum algorithms, +tailored to any chosen qudit dimension. Built on top of differentiable +frameworks, QuForge supports execution on accelerating devices such as GPUs and +TPUs, significantly speeding up simulations. It also supports sparse +operations, leading to a reduction in memory consumption compared to other +libraries. Additionally, by constructing quantum circuits as differentiable +graphs, QuForge facilitates the implementation of quantum machine learning +algorithms, enhancing the capabilities and flexibility of quantum computing +research. + +
+
+ comment: 18 pages, 7 figures +
+
+
+
+
+ + ☆ Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation + + +
+ News recommendation is a challenging task that involves personalization based +on the interaction history and preferences of each user. Recent works have +leveraged the power of pretrained language models (PLMs) to directly rank news +items by using inference approaches that predominately fall into three +categories: pointwise, pairwise, and listwise learning-to-rank. While pointwise +methods offer linear inference complexity, they fail to capture crucial +comparative information between items that is more effective for ranking tasks. +Conversely, pairwise and listwise approaches excel at incorporating these +comparisons but suffer from practical limitations: pairwise approaches are +either computationally expensive or lack theoretical guarantees, and listwise +methods often perform poorly in practice. In this paper, we propose a novel +framework for PLM-based news recommendation that integrates both pointwise +relevance prediction and pairwise comparisons in a scalable manner. We present +a rigorous theoretical analysis of our framework, establishing conditions under +which our approach guarantees improved performance. Extensive experiments show +that our approach outperforms the state-of-the-art methods on the MIND and +Adressa news recommendation datasets. + +
+
+
+
+
+ + ☆ Transfer Learning in $\ell_1$ Regularized Regression: Hyperparameter + Selection Strategy based on Sharp Asymptotic Analysis + + +
+ Transfer learning techniques aim to leverage information from multiple +related datasets to enhance prediction quality against a target dataset. Such +methods have been adopted in the context of high-dimensional sparse regression, +and some Lasso-based algorithms have been invented: Trans-Lasso and Pretraining +Lasso are such examples. These algorithms require the statistician to select +hyperparameters that control the extent and type of information transfer from +related datasets. However, selection strategies for these hyperparameters, as +well as the impact of these choices on the algorithm's performance, have been +largely unexplored. To address this, we conduct a thorough, precise study of +the algorithm in a high-dimensional setting via an asymptotic analysis using +the replica method. Our approach reveals a surprisingly simple behavior of the +algorithm: Ignoring one of the two types of information transferred to the +fine-tuning stage has little effect on generalization performance, implying +that efforts for hyperparameter selection can be significantly reduced. Our +theoretical findings are also empirically supported by real-world applications +on the IMDb dataset. + +
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ PGN: The RNN's New Successor is Effective for Long-Range Time Series + Forecasting + + +
+ Due to the recurrent structure of RNN, the long information propagation path +poses limitations in capturing long-term dependencies, gradient +explosion/vanishing issues, and inefficient sequential execution. Based on +this, we propose a novel paradigm called Parallel Gated Network (PGN) as the +new successor to RNN. PGN directly captures information from previous time +steps through the designed Historical Information Extraction (HIE) layer and +leverages gated mechanisms to select and fuse it with the current time step +information. This reduces the information propagation path to $\mathcal{O}(1)$, +effectively addressing the limitations of RNN. To enhance PGN's performance in +long-range time series forecasting tasks, we propose a novel temporal modeling +framework called Temporal PGN (TPGN). TPGN incorporates two branches to +comprehensively capture the semantic information of time series. One branch +utilizes PGN to capture long-term periodic patterns while preserving their +local characteristics. The other branch employs patches to capture short-term +information and aggregate the global representation of the series. TPGN +achieves a theoretical complexity of $\mathcal{O}(\sqrt{L})$, ensuring +efficiency in its operations. Experimental results on five benchmark datasets +demonstrate the state-of-the-art (SOTA) performance and high efficiency of +TPGN, further confirming the effectiveness of PGN as the new successor to RNN +in long-range time series forecasting. The code is available in this +repository: \url{https://github.com/Water2sea/TPGN}. + +
+
+
+
+
+ + ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Efficient Bias Mitigation Without Privileged Information ECCV + 2024 + + +
+ Deep neural networks trained via empirical risk minimisation often exhibit +significant performance disparities across groups, particularly when group and +task labels are spuriously correlated (e.g., "grassy background" and "cows"). +Existing bias mitigation methods that aim to address this issue often either +rely on group labels for training or validation, or require an extensive +hyperparameter search. Such data and computational requirements hinder the +practical deployment of these methods, especially when datasets are too large +to be group-annotated, computational resources are limited, and models are +trained through already complex pipelines. In this paper, we propose Targeted +Augmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework +that leverages the entire training history of a helper model to identify +spurious samples, and generate a group-balanced training set from which a +robust model can be trained. We show that TAB improves worst-group performance +without any group information or model selection, outperforming existing +methods while maintaining overall accuracy. + +
+
+ comment: Accepted at the 18th European Conference on Computer Vision (ECCV + 2024) as an Oral presentation +
+
+
+
+
+ + ☆ Graph Edit Distance with General Costs Using Neural Set Divergence NeurIPS 2024 + + +
+ Graph Edit Distance (GED) measures the (dis-)similarity between two given +graphs, in terms of the minimum-cost edit sequence that transforms one graph to +the other. However, the exact computation of GED is NP-Hard, which has recently +motivated the design of neural methods for GED estimation. However, they do not +explicitly account for edit operations with different costs. In response, we +propose GRAPHEDX, a neural GED estimator that can work with general costs +specified for the four edit operations, viz., edge deletion, edge addition, +node deletion and node addition. We first present GED as a quadratic assignment +problem (QAP) that incorporates these four costs. Then, we represent each graph +as a set of node and edge embeddings and use them to design a family of neural +set divergence surrogates. We replace the QAP terms corresponding to each +operation with their surrogates. Computing such neural set divergence require +aligning nodes and edges of the two graphs. We learn these alignments using a +Gumbel-Sinkhorn permutation generator, additionally ensuring that the node and +edge alignments are consistent with each other. Moreover, these alignments are +cognizant of both the presence and absence of edges between node-pairs. +Experiments on several datasets, under a variety of edit cost settings, show +that GRAPHEDX consistently outperforms state-of-the-art methods and heuristics +in terms of prediction error. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ Artificial Data Point Generation in Clustered Latent Space for Small + Medical Datasets + + +
+ One of the growing trends in machine learning is the use of data generation +techniques, since the performance of machine learning models is dependent on +the quantity of the training dataset. However, in many medical applications, +collecting large datasets is challenging due to resource constraints, which +leads to overfitting and poor generalization. This paper introduces a novel +method, Artificial Data Point Generation in Clustered Latent Space (AGCL), +designed to enhance classification performance on small medical datasets +through synthetic data generation. The AGCL framework involves feature +extraction, K-means clustering, cluster evaluation based on a class separation +metric, and the generation of synthetic data points from clusters with distinct +class representations. This method was applied to Parkinson's disease +screening, utilizing facial expression data, and evaluated across multiple +machine learning classifiers. Experimental results demonstrate that AGCL +significantly improves classification accuracy compared to baseline, GN and +kNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and +cross-validation accuracy of 90.90% in majority voting over different emotions, +confirming its effectiveness in augmenting small datasets. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Preserving logical and functional dependencies in synthetic tabular data + + +
+ Dependencies among attributes are a common aspect of tabular data. However, +whether existing tabular data generation algorithms preserve these dependencies +while generating synthetic data is yet to be explored. In addition to the +existing notion of functional dependencies, we introduce the notion of logical +dependencies among the attributes in this article. Moreover, we provide a +measure to quantify logical dependencies among attributes in tabular data. +Utilizing this measure, we compare several state-of-the-art synthetic data +generation algorithms and test their capability to preserve logical and +functional dependencies on several publicly available datasets. We demonstrate +that currently available synthetic tabular data generation algorithms do not +fully preserve functional dependencies when they generate synthetic datasets. +In addition, we also showed that some tabular synthetic data generation models +can preserve inter-attribute logical dependencies. Our review and comparison of +the state-of-the-art reveal research needs and opportunities to develop +task-specific synthetic tabular data generation models. + +
+
+ comment: Submitted to Pattern Recognition Journal +
+
+
+
+
+ + ☆ Optimal Memorization Capacity of Transformers + + +
+ Recent research in the field of machine learning has increasingly focused on +the memorization capacity of Transformers, but how efficient they are is not +yet well understood. We demonstrate that Transformers can memorize labels with +$\tilde{O}(\sqrt{N})$ parameters in a next-token prediction setting for $N$ +input sequences of length $n$, which is proved to be optimal up to logarithmic +factors. This indicates that Transformers can efficiently perform memorization +with little influence from the input length $n$ owing to the benefit of +parameter sharing. We also analyze the memorization capacity in the +sequence-to-sequence setting, and find that $\tilde{O}(\sqrt{nN})$ parameters +are not only sufficient, but also necessary at least for Transformers with +hardmax. These results suggest that while self-attention mechanisms can +efficiently identify input sequences, the feed-forward network becomes a +bottleneck when associating a label to each token. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Efficient Fairness-Performance Pareto Front Computation + + +
+ There is a well known intrinsic trade-off between the fairness of a +representation and the performance of classifiers derived from the +representation. Due to the complexity of optimisation algorithms in most modern +representation learning approaches, for a given method it may be non-trivial to +decide whether the obtained fairness-performance curve of the method is +optimal, i.e., whether it is close to the true Pareto front for these +quantities for the underlying data distribution. + In this paper we propose a new method to compute the optimal Pareto front, +which does not require the training of complex representation models. We show +that optimal fair representations possess several useful structural properties, +and that these properties enable a reduction of the computation of the Pareto +Front to a compact discrete problem. We then also show that these compact +approximating problems can be efficiently solved via off-the shelf +concave-convex programming methods. + Since our approach is independent of the specific model of representations, +it may be used as the benchmark to which representation learning algorithms may +be compared. We experimentally evaluate the approach on a number of real world +benchmark datasets. + +
+
+
+
+
+ + ☆ FlowMAC: Conditional Flow Matching for Audio Coding at Low Bit Rates ICASSP 2025 + + +
+ This paper introduces FlowMAC, a novel neural audio codec for high-quality +general audio compression at low bit rates based on conditional flow matching +(CFM). FlowMAC jointly learns a mel spectrogram encoder, quantizer and decoder. +At inference time the decoder integrates a continuous normalizing flow via an +ODE solver to generate a high-quality mel spectrogram. This is the first time +that a CFM-based approach is applied to general audio coding, enabling a +scalable, simple and memory efficient training. Our subjective evaluations show +that FlowMAC at 3 kbps achieves similar quality as state-of-the-art GAN-based +and DDPM-based neural audio codecs at double the bit rate. Moreover, FlowMAC +offers a tunable inference pipeline, which permits to trade off complexity and +quality. This enables real-time coding on CPU, while maintaining high +perceptual quality. + +
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Model-Free Stochastic Process Modeling and Optimization using + Normalizing Flows + + +
+ Real-world chemical processes often exhibit stochastic dynamics with +non-trivial correlations and state-dependent fluctuations. However, most +process models simply add stationary noise terms to a deterministic prediction, +which can lead to inaccurate predictions. This work proposes using conditional +normalizing flows as discrete-time models (DTMs) to learn the stochastic +dynamics of chemical processes. Normalizing flows learn an explicit expression +of the system states' probability density function (PDF) given prior states and +control inputs. The resulting model naturally allows for formulating stochastic +and probabilistic setpoint-tracking objectives and chance constraints. In +applications to a continuous reactor and a reactor cascade, the normalizing +flow yields stable simulations over long time horizons and high-quality results +in stochastic and probabilistic MPC formulation for open-loop control. +Furthermore, a chance-constrained optimization finds reliable startup controls +for the reactor cascade with stochastic reactions. In conclusion, the +conditional normalizing flow presents an excellent choice for modeling +nonlinear stochastic dynamics. + +
+
+ comment: 13 pages, 7 Figures, 5 Tables +
+
+
+
+
+ + ☆ Convolutional Signal Propagation: A Simple Scalable Algorithm for + Hypergraphs + + +
+ Last decade has seen the emergence of numerous methods for learning on +graphs, particularly Graph Neural Networks (GNNs). These methods, however, are +often not directly applicable to more complex structures like bipartite graphs +(equivalent to hypergraphs), which represent interactions among two entity +types (e.g. a user liking a movie). This paper proposes Convolutional Signal +Propagation (CSP), a non-parametric simple and scalable method that natively +operates on bipartite graphs (hypergraphs) and can be implemented with just a +few lines of code. After defining CSP, we demonstrate its relationship with +well-established methods like label propagation, Naive Bayes, and Hypergraph +Convolutional Networks. We evaluate CSP against several reference methods on +real-world datasets from multiple domains, focusing on retrieval and +classification tasks. Our results show that CSP offers competitive performance +while maintaining low computational complexity, making it an ideal first choice +as a baseline for hypergraph node classification and retrieval. Moreover, +despite operating on hypergraphs, CSP achieves good results in tasks typically +not associated with hypergraphs, such as natural language processing. + +
+
+
+
+
+ + ☆ Benign or Not-Benign Overfitting in Token Selection of Attention + Mechanism + + +
+ Modern over-parameterized neural networks can be trained to fit the training +data perfectly while still maintaining a high generalization performance. This +"benign overfitting" phenomenon has been studied in a surge of recent +theoretical work; however, most of these studies have been limited to linear +models or two-layer neural networks. In this work, we analyze benign +overfitting in the token selection mechanism of the attention architecture, +which characterizes the success of transformer models. We first show the +existence of a benign overfitting solution and explain its mechanism in the +attention architecture. Next, we discuss whether the model converges to such a +solution, raising the difficulties specific to the attention architecture. We +then present benign overfitting cases and not-benign overfitting cases by +conditioning different scenarios based on the behavior of attention +probabilities during training. To the best of our knowledge, this is the first +study to characterize benign overfitting for the attention mechanism. + +
+
+
+
+
+ + ☆ Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric + GNNs NeurIPS 2024 + + +
+ Geometric graph neural networks (GNNs) have emerged as powerful tools for +modeling molecular geometry. However, they encounter limitations in effectively +capturing long-range interactions in large molecular systems. To address this +challenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs +to expand the scope of their capabilities by incorporating mesh points +alongside atoms and reimaging traditional mathematical operations in a +trainable manner. Neural P$^3$M exhibits flexibility across a wide range of +molecular systems and demonstrates remarkable accuracy in predicting energies +and forces, outperforming on benchmarks such as the MD22 dataset. It also +achieves an average improvement of 22% on the OE62 dataset while integrating +with various architectures. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic dataset that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. + +
+
+
+
+
+ + ☆ Good Data Is All Imitation Learning Needs + + +
+ In this paper, we address the limitations of traditional teacher-student +models, imitation learning, and behaviour cloning in the context of +Autonomous/Automated Driving Systems (ADS), where these methods often struggle +with incomplete coverage of real-world scenarios. To enhance the robustness of +such models, we introduce the use of Counterfactual Explanations (CFEs) as a +novel data augmentation technique for end-to-end ADS. CFEs, by generating +training samples near decision boundaries through minimal input modifications, +lead to a more comprehensive representation of expert driver strategies, +particularly in safety-critical scenarios. This approach can therefore help +improve the model's ability to handle rare and challenging driving events, such +as anticipating darting out pedestrians, ultimately leading to safer and more +trustworthy decision-making for ADS. Our experiments in the CARLA simulator +demonstrate that CF-Driver outperforms the current state-of-the-art method, +achieving a higher driving score and lower infraction rates. Specifically, +CF-Driver attains a driving score of 84.2, surpassing the previous best model +by 15.02 percentage points. These results highlight the effectiveness of +incorporating CFEs in training end-to-end ADS. To foster further research, the +CF-Driver code is made publicly available. + +
+
+
+
+
+ + ☆ RmGPT: Rotating Machinery Generative Pretrained Model + + +
+ In industry, the reliability of rotating machinery is critical for production +efficiency and safety. Current methods of Prognostics and Health Management +(PHM) often rely on task-specific models, which face significant challenges in +handling diverse datasets with varying signal characteristics, fault modes and +operating conditions. Inspired by advancements in generative pretrained models, +we propose RmGPT, a unified model for diagnosis and prognosis tasks. RmGPT +introduces a novel token-based framework, incorporating Signal Tokens, Prompt +Tokens, Time-Frequency Task Tokens and Fault Tokens to handle heterogeneous +data within a unified model architecture. We leverage self-supervised learning +for robust feature extraction and introduce a next signal token prediction +pretraining strategy, alongside efficient prompt learning for task-specific +adaptation. Extensive experiments demonstrate that RmGPT significantly +outperforms state-of-the-art algorithms, achieving near-perfect accuracy in +diagnosis tasks and exceptionally low errors in prognosis tasks. Notably, RmGPT +excels in few-shot learning scenarios, achieving 92% accuracy in 16-class +one-shot experiments, highlighting its adaptability and robustness. This work +establishes RmGPT as a powerful PHM foundation model for rotating machinery, +advancing the scalability and generalizability of PHM solutions. + +
+
+
+
+
+ + ☆ Deep Manifold Part 1: Anatomy of Neural Network Manifold + + +
+ Based on the numerical manifold method principle, we developed a mathematical +framework of a neural network manifold: Deep Manifold and discovered that +neural networks: 1) is numerical computation combining forward and inverse; 2) +have near infinite degrees of freedom; 3) exponential learning capacity with +depth; 4) have self-progressing boundary conditions; 5) has training hidden +bottleneck. We also define two concepts: neural network learning space and deep +manifold space and introduce two concepts: neural network intrinsic pathway and +fixed point. We raise three fundamental questions: 1). What is the training +completion definition; 2). where is the deep learning convergence point (neural +network fixed point); 3). How important is token timestamp in training data +given negative time is critical in inverse problem. + +
+
+
+
+
+ + ☆ Conjugate Bayesian Two-step Change Point Detection for Hawkes Process NeurIPS 2024 + + +
+ The Bayesian two-step change point detection method is popular for the Hawkes +process due to its simplicity and intuitiveness. However, the non-conjugacy +between the point process likelihood and the prior requires most existing +Bayesian two-step change point detection methods to rely on non-conjugate +inference methods. These methods lack analytical expressions, leading to low +computational efficiency and impeding timely change point detection. To address +this issue, this work employs data augmentation to propose a conjugate Bayesian +two-step change point detection method for the Hawkes process, which proves to +be more accurate and efficient. Extensive experiments on both synthetic and +real data demonstrate the superior effectiveness and efficiency of our method +compared to baseline methods. Additionally, we conduct ablation studies to +explore the robustness of our method concerning various hyperparameters. Our +code is publicly available at https://github.com/Aurora2050/CoBay-CPD. + +
+
+ comment: 10 pages, accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Multimodal Banking Dataset: Understanding Client Needs through Event + Sequences + + +
+ Financial organizations collect a huge amount of data about clients that +typically has a temporal (sequential) structure and is collected from various +sources (modalities). Due to privacy issues, there are no large-scale +open-source multimodal datasets of event sequences, which significantly limits +the research in this area. In this paper, we present the industrial-scale +publicly available multimodal banking dataset, MBD, that contains more than +1.5M corporate clients with several modalities: 950M bank transactions, 1B geo +position events, 5M embeddings of dialogues with technical support and monthly +aggregated purchases of four bank's products. All entries are properly +anonymized from real proprietary bank data. Using this dataset, we introduce a +novel benchmark with two business tasks: campaigning (purchase prediction in +the next month) and matching of clients. We provide numerical results that +demonstrate the superiority of our multi-modal baselines over single-modal +techniques for each task. As a result, the proposed dataset can open new +perspectives and facilitate the future development of practically important +large-scale multimodal algorithms for event sequences. + HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD + Github Link: https://github.com/Dzhambo/MBD + +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ Multiplicative Logit Adjustment Approximates Neural-Collapse-Aware + Decision Boundary Adjustment + + +
+ Real-world data distributions are often highly skewed. This has spurred a +growing body of research on long-tailed recognition to address this imbalance +in training classification models. Among the methods studied, multiplicative +logit adjustment (MLA) stands out as a simple and effective method. However, it +lacks theoretical guarantees, which raises concerns about the optimality of its +adjustment method. We provide a theoretical justification for the effectiveness +of MLA with the following two-step theory. First, we develop a theory that +adjusts optimal decision boundaries by estimating feature spread on the basis +of neural collapse. Then, we demonstrate that MLA approximates this optimal +method. Additionally, through experiments on long-tailed datasets, we +illustrate the practical usefulness of MLA under more realistic conditions. We +also offer experimental insights to guide the tuning of MLA's hyperparameters. + +
+
+
+
+
+ + ☆ Derandomizing Multi-Distribution Learning + + +
+ Multi-distribution or collaborative learning involves learning a single +predictor that works well across multiple data distributions, using samples +from each during training. Recent research on multi-distribution learning, +focusing on binary loss and finite VC dimension classes, has shown near-optimal +sample complexity that is achieved with oracle efficient algorithms. That is, +these algorithms are computationally efficient given an efficient ERM for the +class. Unlike in classical PAC learning, where the optimal sample complexity is +achieved with deterministic predictors, current multi-distribution learning +algorithms output randomized predictors. This raises the question: can these +algorithms be derandomized to produce a deterministic predictor for multiple +distributions? Through a reduction to discrepancy minimization, we show that +derandomizing multi-distribution learning is computationally hard, even when +ERM is computationally efficient. On the positive side, we identify a +structural condition enabling an efficient black-box reduction, converting +existing randomized multi-distribution predictors into deterministic ones. + +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ Joint Source-Channel Coding: Fundamentals and Recent Progress in + Practical Designs + + +
+ Semantic- and task-oriented communication has emerged as a promising approach +to reducing the latency and bandwidth requirements of next-generation mobile +networks by transmitting only the most relevant information needed to complete +a specific task at the receiver. This is particularly advantageous for +machine-oriented communication of high data rate content, such as images and +videos, where the goal is rapid and accurate inference, rather than perfect +signal reconstruction. While semantic- and task-oriented compression can be +implemented in conventional communication systems, joint source-channel coding +(JSCC) offers an alternative end-to-end approach by optimizing compression and +channel coding together, or even directly mapping the source signal to the +modulated waveform. Although all digital communication systems today rely on +separation, thanks to its modularity, JSCC is known to achieve higher +performance in finite blocklength scenarios, and to avoid cliff and the +levelling-off effects in time-varying channel scenarios. This article provides +an overview of the information theoretic foundations of JSCC, surveys practical +JSCC designs over the decades, and discusses the reasons for their limited +adoption in practical systems. We then examine the recent resurgence of JSCC, +driven by the integration of deep learning techniques, particularly through +DeepJSCC, highlighting its many surprising advantages in various scenarios. +Finally, we discuss why it may be time to reconsider today's strictly separate +architectures, and reintroduce JSCC to enable high-fidelity, low-latency +communications in critical applications such as autonomous driving, drone +surveillance, or wearable systems. + +
+
+ comment: Under review for possible publication +
+
+
+
+
+ + ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new +variations of data appearance (domains) and open-set conditions, where both +known and novel categories are present at test time. The challenges of this +task arise from the dual need to generalize across diverse domains and +accurately quantify category novelty, which is critical for applications in +dynamic environments. Recently, meta-learning techniques have demonstrated +superior results in OSDG, effectively orchestrating the meta-train and -test +tasks by employing varied random categories and predefined domain partition +strategies. These approaches prioritize a well-designed training schedule over +traditional methods that focus primarily on data augmentation and the +enhancement of discriminative feature learning. The prevailing meta-learning +models in OSDG typically utilize a predefined sequential domain scheduler to +structure data partitions. However, a crucial aspect that remains inadequately +explored is the influence brought by strategies of domain schedulers during +training. In this paper, we observe that an adaptive domain scheduler benefits +more in OSDG compared with prefixed sequential and random domain schedulers. We +propose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve +an adaptive domain scheduler. This method strategically sequences domains by +assessing their reliabilities in utilizing a follower network, trained with +confidence scores learned in an evidential manner, regularized by max rebiasing +discrepancy, and optimized in a bi-level manner. The results show that our +method substantially improves OSDG performance and achieves more discriminative +embeddings for both the seen and unseen categories. The source code will be +available at https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code will be available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ☆ A Simple but Strong Baseline for Sounding Video Generation: Effective + Adaptation of Audio and Video Diffusion Models for Joint Generation + + +
+ In this work, we build a simple but strong baseline for sounding video +generation. Given base diffusion models for audio and video, we integrate them +with additional modules into a single model and train it to make the model +jointly generate audio and video. To enhance alignment between audio-video +pairs, we introduce two novel mechanisms in our model. The first one is +timestep adjustment, which provides different timestep information to each base +model. It is designed to align how samples are generated along with timesteps +across modalities. The second one is a new design of the additional modules, +termed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE, +cross-modal information is embedded as if it represents temporal position +information, and the embeddings are fed into the model like positional +encoding. Compared with the popular cross-attention mechanism, CMC-PE provides +a better inductive bias for temporal alignment in the generated data. +Experimental results validate the effectiveness of the two newly introduced +mechanisms and also demonstrate that our method outperforms existing methods. + +
+
+ comment: The source code will be released soon +
+
+
+
+
+ + ☆ MASSFormer: Mobility-Aware Spectrum Sensing using Transformer-Driven + Tiered Structure + + +
+ In this paper, we develop a novel mobility-aware transformer-driven tiered +structure (MASSFormer) based cooperative spectrum sensing method that +effectively models the spatio-temporal dynamics of user movements. Unlike +existing methods, our method considers a dynamic scenario involving mobile +primary users (PUs) and secondary users (SUs)and addresses the complexities +introduced by user mobility. The transformer architecture utilizes an attention +mechanism, enabling the proposed method to adeptly model the temporal dynamics +of user mobility by effectively capturing long-range dependencies within the +input data. The proposed method first computes tokens from the sequence of +covariance matrices (CMs) for each SU and processes them in parallel using the +SUtransformer network to learn the spatio-temporal features at SUlevel. +Subsequently, the collaborative transformer network learns the group-level PU +state from all SU-level feature representations. The attention-based sequence +pooling method followed by the transformer encoder adjusts the contributions of +all tokens. The main goal of predicting the PU states at each SU-level and +group-level is to improve detection performance even more. We conducted a +sufficient amount of simulations and compared the detection performance of +different SS methods. The proposed method is tested under imperfect reporting +channel scenarios to show robustness. The efficacy of our method is validated +with the simulation results demonstrating its higher performance compared with +existing methods in terms of detection probability, sensing error, and +classification accuracy. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Optimizing the Induced Correlation in Omnibus Joint Graph Embeddings + + +
+ Theoretical and empirical evidence suggests that joint graph embedding +algorithms induce correlation across the networks in the embedding space. In +the Omnibus joint graph embedding framework, previous results explicitly +delineated the dual effects of the algorithm-induced and model-inherent +correlations on the correlation across the embedded networks. Accounting for +and mitigating the algorithm-induced correlation is key to subsequent +inference, as sub-optimal Omnibus matrix constructions have been demonstrated +to lead to loss in inference fidelity. This work presents the first efforts to +automate the Omnibus construction in order to address two key questions in this +joint embedding framework: the correlation-to-OMNI problem and the flat +correlation problem. In the flat correlation problem, we seek to understand the +minimum algorithm-induced flat correlation (i.e., the same across all graph +pairs) produced by a generalized Omnibus embedding. Working in a subspace of +the fully general Omnibus matrices, we prove both a lower bound for this flat +correlation and that the classical Omnibus construction induces the maximal +flat correlation. In the correlation-to-OMNI problem, we present an algorithm +-- named corr2Omni -- that, from a given matrix of estimated pairwise graph +correlations, estimates the matrix of generalized Omnibus weights that induces +optimal correlation in the embedding space. Moreover, in both simulated and +real data settings, we demonstrate the increased effectiveness of our corr2Omni +algorithm versus the classical Omnibus construction. + +
+
+ comment: 34 pages, 8 figures +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ Dataset Distillation-based Hybrid Federated Learning on Non-IID Data + + +
+ In federated learning, the heterogeneity of client data has a great impact on +the performance of model training. Many heterogeneity issues in this process +are raised by non-independently and identically distributed (Non-IID) data. +This study focuses on the issue of label distribution skew. To address it, we +propose a hybrid federated learning framework called HFLDD, which integrates +dataset distillation to generate approximately independent and equally +distributed (IID) data, thereby improving the performance of model training. +Particularly, we partition the clients into heterogeneous clusters, where the +data labels among different clients within a cluster are unbalanced while the +data labels among different clusters are balanced. The cluster headers collect +distilled data from the corresponding cluster members, and conduct model +training in collaboration with the server. This training process is like +traditional federated learning on IID data, and hence effectively alleviates +the impact of Non-IID data on model training. Furthermore, we compare our +proposed method with typical baseline methods on public datasets. Experimental +results demonstrate that when the data labels are severely imbalanced, the +proposed HFLDD outperforms the baseline methods in terms of both test accuracy +and communication cost. + +
+
+
+
+
+ + ☆ Functional Classification of Spiking Signal Data Using Artificial + Intelligence Techniques: A Review + + +
+ Human brain neuron activities are incredibly significant nowadays. Neuronal +behavior is assessed by analyzing signal data such as electroencephalography +(EEG), which can offer scientists valuable information about diseases and +human-computer interaction. One of the difficulties researchers confront while +evaluating these signals is the existence of large volumes of spike data. +Spikes are some considerable parts of signal data that can happen as a +consequence of vital biomarkers or physical issues such as electrode movements. +Hence, distinguishing types of spikes is important. From this spot, the spike +classification concept commences. Previously, researchers classified spikes +manually. The manual classification was not precise enough as it involves +extensive analysis. Consequently, Artificial Intelligence (AI) was introduced +into neuroscience to assist clinicians in classifying spikes correctly. This +review discusses the importance and use of AI in spike classification, focusing +on the recognition of neural activity noises. The task is divided into three +main components: preprocessing, classification, and evaluation. Existing +methods are introduced and their importance is determined. The review also +highlights the need for more efficient algorithms. The primary goal is to +provide a perspective on spike classification for future research and provide a +comprehensive understanding of the methodologies and issues involved. The +review organizes materials in the spike classification field for future +studies. In this work, numerous studies were extracted from different +databases. The PRISMA-related research guidelines were then used to choose +papers. Then, research studies based on spike classification using machine +learning and deep learning approaches with effective preprocessing were +selected. + +
+
+ comment: 8 figures, 32 pages +
+
+
+
+
+ + ☆ Comparing Unidirectional, Bidirectional, and Word2vec Models for + Discovering Vulnerabilities in Compiled Lifted Code + + +
+ Ransomware and other forms of malware cause significant financial and +operational damage to organizations by exploiting long-standing and often +difficult-to-detect software vulnerabilities. To detect vulnerabilities such as +buffer overflows in compiled code, this research investigates the application +of unidirectional transformer-based embeddings, specifically GPT-2. Using a +dataset of LLVM functions, we trained a GPT-2 model to generate embeddings, +which were subsequently used to build LSTM neural networks to differentiate +between vulnerable and non-vulnerable code. Our study reveals that embeddings +from the GPT-2 model significantly outperform those from bidirectional models +of BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%. +LSTM neural networks were developed with both frozen and unfrozen embedding +model layers. The model with the highest performance was achieved when the +embedding layers were unfrozen. Further, the research finds that, in exploring +the impact of different optimizers within this domain, the SGD optimizer +demonstrates superior performance over Adam. Overall, these findings reveal +important insights into the potential of unidirectional transformer-based +approaches in enhancing cybersecurity defenses. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human + Connectomes NeurIPS 2024 + + +
+ Although modern imaging technologies allow us to study connectivity between +two distinct brain regions in-vivo, an in-depth understanding of how anatomical +structure supports brain function and how spontaneous functional fluctuations +emerge remarkable cognition is still elusive. Meanwhile, tremendous efforts +have been made in the realm of machine learning to establish the nonlinear +mapping between neuroimaging data and phenotypic traits. However, the absence +of neuroscience insight in the current approaches poses significant challenges +in understanding cognitive behavior from transient neural activities. To +address this challenge, we put the spotlight on the coupling mechanism of +structural connectivity (SC) and functional connectivity (FC) by formulating +such network neuroscience question into an expressive graph representation +learning problem for high-order topology. Specifically, we introduce the +concept of topological detour to characterize how a ubiquitous instance of FC +(direct link) is supported by neural pathways (detour) physically wired by SC, +which forms a cyclic loop interacted by brain structure and function. In the +clich\'e of machine learning, the multi-hop detour pathway underlying SC-FC +coupling allows us to devise a novel multi-head self-attention mechanism within +Transformer to capture multi-modal feature representation from paired graphs of +SC and FC. Taken together, we propose a biological-inspired deep model, coined +as NeuroPath, to find putative connectomic feature representations from the +unprecedented amount of neuroimages, which can be plugged into various +downstream applications such as task recognition and disease diagnosis. We have +evaluated NeuroPath on large-scale public datasets including HCP and UK Biobank +under supervised and zero-shot learning, where the state-of-the-art performance +by our NeuroPath indicates great potential in network neuroscience. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization, recent advances primarily focus on improving the LLM +components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector. Extensive ablation experiments +validate the effectiveness of introducing CMoE under any configuration, with up +to an average 8% performance gains. We further provide interpretation analysis +of the tug-of-war problem from the perspective of gradient optimization and +parameter statistics. Compared to previous state-of-the-art medical MLLMs, +Uni-Med achieves competitive or superior evaluation metrics on diverse tasks. +Code, data and model will be soon available at GitHub. + +
+
+
+
+
+ + ☆ Sequential Kernelized Stein Discrepancy + + +
+ We present a sequential version of the kernelized Stein discrepancy, which +allows for conducting goodness-of-fit tests for unnormalized densities that are +continuously monitored and adaptively stopped. That is, the sample size need +not be fixed prior to data collection; the practitioner can choose whether to +stop the test or continue to gather evidence at any time while controlling the +false discovery rate. In stark contrast to related literature, we do not impose +uniform boundedness on the Stein kernel. Instead, we exploit the potential +boundedness of the Stein kernel at arbitrary point evaluations to define test +martingales, that give way to the subsequent novel sequential tests. We prove +the validity of the test, as well as an asymptotic lower bound for the +logarithmic growth of the wealth process under the alternative. We further +illustrate the empirical performance of the test with a variety of +distributions, including restricted Boltzmann machines. + +
+
+
+
+
+ + ☆ HaloScope: Harnessing Unlabeled LLM Generations for Hallucination + Detection NeurIPS 2024 + + +
+ The surge in applications of large language models (LLMs) has prompted +concerns about the generation of misleading or fabricated information, known as +hallucinations. Therefore, detecting hallucinations has become critical to +maintaining trust in LLM-generated content. A primary challenge in learning a +truthfulness classifier is the lack of a large amount of labeled truthful and +hallucinated data. To address the challenge, we introduce HaloScope, a novel +learning framework that leverages the unlabeled LLM generations in the wild for +hallucination detection. Such unlabeled data arises freely upon deploying LLMs +in the open world, and consists of both truthful and hallucinated information. +To harness the unlabeled data, we present an automated membership estimation +score for distinguishing between truthful and untruthful generations within +unlabeled mixture data, thereby enabling the training of a binary truthfulness +classifier on top. Importantly, our framework does not require extra data +collection and human annotations, offering strong flexibility and practicality +for real-world applications. Extensive experiments show that HaloScope can +achieve superior hallucination detection performance, outperforming the +competitive rivals by a significant margin. Code is available at +https://github.com/deeplearningwisc/haloscope. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Broadcast Product: Shape-aligned Element-wise Multiplication and Beyond + + +
+ We propose a new operator defined between two tensors, the broadcast product. +The broadcast product calculates the Hadamard product after duplicating +elements to align the shapes of the two tensors. Complex tensor operations in +libraries like \texttt{numpy} can be succinctly represented as mathematical +expressions using the broadcast product. Finally, we propose a novel tensor +decomposition using the broadcast product, highlighting its potential +applications in dimensionality reduction. + +
+
+
+
+
+ + ☆ Does Worst-Performing Agent Lead the Pack? Analyzing Agent Dynamics in + Unified Distributed SGD NeurIPS 2024 + + +
+ Distributed learning is essential to train machine learning algorithms across +heterogeneous agents while maintaining data privacy. We conduct an asymptotic +analysis of Unified Distributed SGD (UD-SGD), exploring a variety of +communication patterns, including decentralized SGD and local SGD within +Federated Learning (FL), as well as the increasing communication interval in +the FL setting. In this study, we assess how different sampling strategies, +such as i.i.d. sampling, shuffling, and Markovian sampling, affect the +convergence speed of UD-SGD by considering the impact of agent dynamics on the +limiting covariance matrix as described in the Central Limit Theorem (CLT). Our +findings not only support existing theories on linear speedup and asymptotic +network independence, but also theoretically and empirically show how efficient +sampling strategies employed by individual agents contribute to overall +convergence in UD-SGD. Simulations reveal that a few agents using highly +efficient sampling can achieve or surpass the performance of the majority +employing moderately improved strategies, providing new insights beyond +traditional analyses focusing on the worst-performing agent. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ MathDSL: A Domain-Specific Language for Concise Mathematical Solutions + Via Program Synthesis + + +
+ We present MathDSL, a Domain-Specific Language (DSL) for mathematical +equation solving, which, when deployed in program synthesis models, outperforms +state-of-the-art reinforcement-learning-based methods. We also introduce a +quantitative metric for measuring the conciseness of a mathematical solution +and demonstrate the improvement in the quality of generated solutions compared +to other methods. Our system demonstrates that a program synthesis system +(DreamCoder) using MathDSL can generate programs that solve linear equations +with greater accuracy and conciseness than using reinforcement learning +systems. Additionally, we demonstrate that if we use the action spaces of +previous reinforcement learning systems as DSLs, MathDSL outperforms the +action-space-DSLs. We use DreamCoder to store equation-solving strategies as +learned abstractions in its program library and demonstrate that by using +MathDSL, these can be converted into human-interpretable solution strategies +that could have applications in mathematical education. + +
+
+
+
+
+ + ♻ ☆ Assumption violations in causal discovery and the robustness of score + matching NeurIPS + 2023 + + +
+ When domain knowledge is limited and experimentation is restricted by +ethical, financial, or time constraints, practitioners turn to observational +causal discovery methods to recover the causal structure, exploiting the +statistical properties of their data. Because causal discovery without further +assumptions is an ill-posed problem, each algorithm comes with its own set of +usually untestable assumptions, some of which are hard to meet in real +datasets. Motivated by these considerations, this paper extensively benchmarks +the empirical performance of recent causal discovery methods on observational +i.i.d. data generated under different background conditions, allowing for +violations of the critical assumptions required by each selected approach. Our +experimental findings show that score matching-based methods demonstrate +surprising performance in the false positive and false negative rate of the +inferred graph in these challenging scenarios, and we provide theoretical +insights into their performance. This work is also the first effort to +benchmark the stability of causal discovery algorithms with respect to the +values of their hyperparameters. Finally, we hope this paper will set a new +standard for the evaluation of causal discovery methods and can serve as an +accessible entry point for practitioners interested in the field, highlighting +the empirical implications of different algorithm choices. + +
+
+ comment: 37th Conference on Neural Information Processing Systems (NeurIPS + 2023) +
+
+
+
+
+ + ♻ ☆ Quantum Kernel Methods under Scrutiny: A Benchmarking Study + + +
+ Since the entry of kernel theory in the field of quantum machine learning, +quantum kernel methods (QKMs) have gained increasing attention with regard to +both probing promising applications and delivering intriguing research +insights. Two common approaches for computing the underlying Gram matrix have +emerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs). +Benchmarking these methods is crucial to gain robust insights and to understand +their practical utility. In this work, we present a comprehensive large-scale +study examining QKMs based on FQKs and PQKs across a manifold of design +choices. Our investigation encompasses both classification and regression tasks +for five dataset families and 64 datasets, systematically comparing the use of +FQKs and PQKs quantum support vector machines and kernel ridge regression. This +resulted in over 20,000 models that were trained and optimized using a +state-of-the-art hyperparameter search to ensure robust and comprehensive +insights. We delve into the importance of hyperparameters on model performance +scores and support our findings through rigorous correlation analyses. In this, +we also closely inspect two data encoding strategies. Moreover, we provide an +in-depth analysis addressing the design freedom of PQKs and explore the +underlying principles responsible for learning. Our goal is not to identify the +best-performing model for a specific task but to uncover the mechanisms that +lead to effective QKMs and reveal universal patterns. + +
+
+ comment: 18 pages main text including 12 figures and 1 table, appendix 14 + pages with 19 figures and 1 table; restructure result section and prune + appendix +
+
+
+
+
+ + ♻ ☆ Two-Timescale Gradient Descent Ascent Algorithms for Nonconvex Minimax + Optimization ICML 2020 + + +
+ We provide a unified analysis of two-timescale gradient descent ascent +(TTGDA) for solving structured nonconvex minimax optimization problems in the +form of $\min_\textbf{x} \max_{\textbf{y} \in Y} f(\textbf{x}, \textbf{y})$, +where the objective function $f(\textbf{x}, \textbf{y})$ is nonconvex in +$\textbf{x}$ and concave in $\textbf{y}$, and the constraint set $Y \subseteq +\mathbb{R}^n$ is convex and bounded. In the convex-concave setting, the +single-timescale gradient descent ascent (GDA) algorithm is widely used in +applications and has been shown to have strong convergence guarantees. In more +general settings, however, it can fail to converge. Our contribution is to +design TTGDA algorithms that are effective beyond the convex-concave setting, +efficiently finding a stationary point of the function $\Phi(\cdot) := +\max_{\textbf{y} \in Y} f(\cdot, \textbf{y})$. We also establish theoretical +bounds on the complexity of solving both smooth and nonsmooth nonconvex-concave +minimax optimization problems. To the best of our knowledge, this is the first +systematic analysis of TTGDA for nonconvex minimax optimization, shedding light +on its superior performance in training generative adversarial networks (GANs) +and in other real-world application problems. + +
+
+ comment: A preliminary version [arXiv:1906.00331] of this paper, with a subset + of the results that are presented here, was presented at ICML 2020; 44 Pages, + 10 Figures +
+
+
+
+
+ + ♻ ☆ Ascend HiFloat8 Format for Deep Learning + + +
+ This preliminary white paper proposes a novel 8-bit floating-point data +format HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered +precision. For normal value encoding, it provides 7 exponent values with 3-bit +mantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with +1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7 +extra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades). +Meanwhile, HiF8 encodes all the special values except that positive zero and +negative zero are represented by only one bit-pattern. Thanks to the better +balance between precision and dynamic range, HiF8 can be simultaneously used in +both forward and backward passes of AI training. In this paper, we will +describe the definition and rounding methods of HiF8, as well as the tentative +training and inference solutions. To demonstrate the efficacy of HiF8, massive +simulation results on various neural networks, including traditional neural +networks and large language models (LLMs), will also be presented. + +
+
+ comment: 13 Pages, 4 Figures, 9 Tables +
+
+
+
+
+ + ♻ ☆ Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of + Peptides + + +
+ Molecular Dynamics (MD) is crucial in various fields such as materials +science, chemistry, and pharmacology to name a few. Conventional MD software +struggles with the balance between time cost and prediction accuracy, which +restricts its wider application. Recently, data-driven approaches based on deep +generative models have been devised for time-coarsened dynamics, which aim at +learning dynamics of diverse molecular systems over a long timestep, enjoying +both universality and efficiency. Nevertheless, most current methods are +designed solely to learn from the data distribution regardless of the +underlying Boltzmann distribution, and the physics priors such as energies and +forces are constantly overlooked. In this work, we propose a conditional +generative model called Force-guided Bridge Matching (FBM), which learns +full-atom time-coarsened dynamics and targets the Boltzmann-constrained +distribution. With the guidance of our delicately-designed intermediate force +field, FBM leverages favourable physics priors into the generation process, +giving rise to enhanced simulations. Experiments on two datasets consisting of +peptides verify our superiority in terms of comprehensive metrics and +demonstrate transferability to unseen systems. + +
+
+
+
+
+ + ♻ ☆ MLPs Learn In-Context on Regression and Classification Tasks + + +
+ In-context learning (ICL), the remarkable ability to solve a task from only +input exemplars, is often assumed to be a unique hallmark of Transformer +models. By examining commonly employed synthetic ICL tasks, we demonstrate that +multi-layer perceptrons (MLPs) can also learn in-context. Moreover, MLPs, and +the closely related MLP-Mixer models, learn in-context competitively with +Transformers given the same compute budget in this setting. We further show +that MLPs outperform Transformers on a series of classical tasks from +psychology designed to test relational reasoning, which are closely related to +in-context classification. These results underscore a need for studying +in-context learning beyond attention-based architectures, while also +challenging strong prior arguments about MLPs' limited ability to solve +relational tasks. Altogether, our results highlight the unexpected competence +of MLPs, and support the growing interest in all-MLP alternatives to +task-specific architectures. + +
+
+ comment: 30 pages, 10 figures, code available at + https://github.com/wtong98/mlp-icl +
+
+
+
+
+ + ♻ ☆ A Stochastic Quasi-Newton Method for Non-convex Optimization with + Non-uniform Smoothness + + +
+ Classical convergence analyses for optimization algorithms rely on the +widely-adopted uniform smoothness assumption. However, recent experimental +studies have demonstrated that many machine learning problems exhibit +non-uniform smoothness, meaning the smoothness factor is a function of the +model parameter instead of a universal constant. In particular, it has been +observed that the smoothness grows with respect to the gradient norm along the +training trajectory. Motivated by this phenomenon, the recently introduced +$(L_0, L_1)$-smoothness is a more general notion, compared to traditional +$L$-smoothness, that captures such positive relationship between smoothness and +gradient norm. Under this type of non-uniform smoothness, existing literature +has designed stochastic first-order algorithms by utilizing gradient clipping +techniques to obtain the optimal $\mathcal{O}(\epsilon^{-3})$ sample complexity +for finding an $\epsilon$-approximate first-order stationary solution. +Nevertheless, the studies of quasi-Newton methods are still lacking. +Considering higher accuracy and more robustness for quasi-Newton methods, in +this paper we propose a fast stochastic quasi-Newton method when there exists +non-uniformity in smoothness. Leveraging gradient clipping and variance +reduction, our algorithm can achieve the best-known +$\mathcal{O}(\epsilon^{-3})$ sample complexity and enjoys convergence speedup +with simple hyperparameter tuning. Our numerical experiments show that our +proposed algorithm outperforms the state-of-the-art approaches. + +
+
+ comment: Paper accepted by CDC 2024 +
+
+
+
+
+ + ♻ ☆ Message-Passing Monte Carlo: Generating low-discrepancy point sets via + Graph Neural Networks + + +
+ Discrepancy is a well-known measure for the irregularity of the distribution +of a point set. Point sets with small discrepancy are called low-discrepancy +and are known to efficiently fill the space in a uniform manner. +Low-discrepancy points play a central role in many problems in science and +engineering, including numerical integration, computer vision, machine +perception, computer graphics, machine learning, and simulation. In this work, +we present the first machine learning approach to generate a new class of +low-discrepancy point sets named Message-Passing Monte Carlo (MPMC) points. +Motivated by the geometric nature of generating low-discrepancy point sets, we +leverage tools from Geometric Deep Learning and base our model on Graph Neural +Networks. We further provide an extension of our framework to higher +dimensions, which flexibly allows the generation of custom-made points that +emphasize the uniformity in specific dimensions that are primarily important +for the particular problem at hand. Finally, we demonstrate that our proposed +model achieves state-of-the-art performance superior to previous methods by a +significant margin. In fact, MPMC points are empirically shown to be either +optimal or near-optimal with respect to the discrepancy for low dimension and +small number of points, i.e., for which the optimal discrepancy can be +determined. Code for generating MPMC points can be found at +https://github.com/tk-rusch/MPMC. + +
+
+ comment: Published in Proceedings of the National Academy of Sciences (PNAS): + https://www.pnas.org/doi/10.1073/pnas.2409913121 +
+
+
+
+
+ + ♻ ☆ TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with + Tabular Node Features + + +
+ Tabular machine learning is an important field for industry and science. In +this field, table rows are usually treated as independent data samples, but +additional information about relations between them is sometimes available and +can be used to improve predictive performance. Such information can be +naturally modeled with a graph, thus tabular machine learning may benefit from +graph machine learning methods. However, graph machine learning models are +typically evaluated on datasets with homogeneous node features, which have +little in common with heterogeneous mixtures of numerical and categorical +features present in tabular datasets. Thus, there is a critical difference +between the data used in tabular and graph machine learning studies, which does +not allow one to understand how successfully graph models can be transferred to +tabular data. To bridge this gap, we propose a new benchmark of diverse graphs +with heterogeneous tabular node features and realistic prediction tasks. We use +this benchmark to evaluate a vast set of models, including simple methods +previously overlooked in the literature. Our experiments show that graph neural +networks (GNNs) can indeed often bring gains in predictive performance for +tabular data, but standard tabular models also can be adapted to work with +graph data by using simple feature preprocessing, which sometimes enables them +to compete with and even outperform GNNs. Based on our empirical study, we +provide insights for researchers and practitioners in both tabular and graph +machine learning fields. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Framework for Evaluating API-oriented Code Generation in + Large Language Models + + +
+ Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as +powerful tools for code generation, significantly enhancing productivity and +accelerating software development. However, existing benchmarks primarily focus +on general code generation without considering API-oriented code generation, +i.e., generating code that invokes APIs from specific libraries. Given the +growing demand for API-oriented code generation, there is a pressing need for a +systematic and automated approach to evaluate LLM on API-oriented code +generation. To address this gap, we propose AutoAPIEval, a lightweight and +automated framework designed to evaluate the capabilities of LLMs in +API-oriented code generation. Our framework works with any library that +provides API documentation and focuses on two unit tasks: API recommendation +and code example generation, along with four metrics to evaluate the generated +APIs and code examples, such as the proportion of incorrect API recommendations +for Task 1, and the proportion of code examples where no specific API is +invoked and uncompilable/unexecutable code examples for Task 2. In addition, we +conducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder) +and Java Runtime Environment 8 to demonstrate the framework's effectiveness. +Our findings reveal substantial variability in LLM performance across tasks, +with ChatGPT adhering better to instructions, while sharing similar +effectiveness in code example generation with its counterparts (i.e., MagiCoder +and DeekSeek Coder). We also identify key factors associated with code quality, +such as API popularity and model confidence, and build classifiers that achieve +high accuracy in detecting incorrect API recommendations and erroneous code +examples. Retrieval-augmented generation enhances the quality of code generated +by LLMs, though its effectiveness varies across different LLMs. + +
+
+
+
+
+ + ♻ ☆ Machine Learning for Two-Sample Testing under Right-Censored Data: A + Simulation Study + + +
+ The focus of this study is to evaluate the effectiveness of Machine Learning +(ML) methods for two-sample testing with right-censored observations. To +achieve this, we develop several ML-based methods with varying architectures +and implement them as two-sample tests. Each method is an ensemble (stacking) +that combines predictions from classical two-sample tests. This paper presents +the results of training the proposed ML methods, examines their statistical +power compared to classical two-sample tests, analyzes the null distribution of +the proposed methods when the null hypothesis is true, and evaluates the +significance of the features incorporated into the proposed methods. In total, +this work covers 18 methods for two-sample testing under right-censored +observations, including the proposed methods and classical well-studied +two-sample tests. All results from numerical experiments were obtained from a +synthetic dataset generated using the inverse transform sampling method and +replicated multiple times through Monte Carlo simulation. To test the +two-sample problem with right-censored observations, one can use the proposed +two-sample methods (scripts, dataset, and models are available on GitHub and +Hugging Face). + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Faster Randomized Methods for Orthogonality Constrained Problems + + +
+ Recent literature has advocated the use of randomized methods for +accelerating the solution of various matrix problems arising throughout data +science and computational science. One popular strategy for leveraging +randomization is to use it as a way to reduce problem size. However, methods +based on this strategy lack sufficient accuracy for some applications. +Randomized preconditioning is another approach for leveraging randomization, +which provides higher accuracy. The main challenge in using randomized +preconditioning is the need for an underlying iterative method, thus randomized +preconditioning so far have been applied almost exclusively to solving +regression problems and linear systems. In this article, we show how to expand +the application of randomized preconditioning to another important set of +problems prevalent across data science: optimization problems with +(generalized) orthogonality constraints. We demonstrate our approach, which is +based on the framework of Riemannian optimization and Riemannian +preconditioning, on the problem of computing the dominant canonical +correlations and on the Fisher linear discriminant analysis problem. For both +problems, we evaluate the effect of preconditioning on the computational costs +and asymptotic convergence, and demonstrate empirically the utility of our +approach. + +
+
+
+
+
+ + ♻ Discrete, compositional, and symbolic representations through attractor + dynamics + + +
+ Symbolic systems are powerful frameworks for modeling cognitive processes as +they encapsulate the rules and relationships fundamental to many aspects of +human reasoning and behavior. Central to these models are systematicity, +compositionality, and productivity, making them invaluable in both cognitive +science and artificial intelligence. However, certain limitations remain. For +instance, the integration of structured symbolic processes and latent +sub-symbolic processes has been implemented at the computational level through +fiat methods such as quantization or softmax sampling, which assume, rather +than derive, the operations underpinning discretization and symbolicization. In +this work, we introduce a novel neural stochastic dynamical systems model that +integrates attractor dynamics with symbolic representations to model cognitive +processes akin to the probabilistic language of thought (PLoT). Our model +segments the continuous representational space into discrete basins, with +attractor states corresponding to symbolic sequences, that reflect the +semanticity and compositionality characteristic of symbolic systems through +unsupervised learning, rather than relying on pre-defined primitives. Moreover, +like PLoT, our model learns to sample a diverse distribution of attractor +states that reflect the mutual information between the input data and the +symbolic encodings. This approach establishes a unified framework that +integrates both symbolic and sub-symbolic processing through neural dynamics, a +neuro-plausible substrate with proven expressivity in AI, offering a more +comprehensive model that mirrors the complex duality of cognitive operations. + +
+
+
+
+
+ + ♻ ☆ ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot + Coordination NeurIPS 2024 + + +
+ Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement +learning (MARL) challenge that aims to train an ego agent to work with diverse, +unseen partners during deployment. The significant difference between the +deployment-time partners' distribution and the training partners' distribution +determined by the training algorithm makes ZSC a unique out-of-distribution +(OOD) generalization challenge. The potential distribution gap between +evaluation and deployment-time partners leads to inadequate evaluation, which +is exacerbated by the lack of appropriate evaluation metrics. In this paper, we +present ZSC-Eval, the first evaluation toolkit and benchmark for ZSC +algorithms. ZSC-Eval consists of: 1) Generation of evaluation partner +candidates through behavior-preferring rewards to approximate deployment-time +partners' distribution; 2) Selection of evaluation partners by Best-Response +Diversity (BR-Div); 3) Measurement of generalization performance with various +evaluation partners via the Best-Response Proximity (BR-Prox) metric. We use +ZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football +environments and get novel empirical findings. We also conduct a human +experiment of current ZSC algorithms to verify the ZSC-Eval's consistency with +human evaluation. ZSC-Eval is now available at +https://github.com/sjtu-marl/ZSC-Eval. + +
+
+ comment: Accepted in NeurIPS 2024 Dataset and Benchmark Track +
+
+
+
+
+ + ♻ ☆ Strategic Linear Contextual Bandits NeurIPS 2024 + + +
+ Motivated by the phenomenon of strategic agents gaming a recommender system +to maximize the number of times they are recommended to users, we study a +strategic variant of the linear contextual bandit problem, where the arms can +strategically misreport privately observed contexts to the learner. We treat +the algorithm design problem as one of mechanism design under uncertainty and +propose the Optimistic Grim Trigger Mechanism (OptGTM) that incentivizes the +agents (i.e., arms) to report their contexts truthfully while simultaneously +minimizing regret. We also show that failing to account for the strategic +nature of the agents results in linear regret. However, a trade-off between +mechanism design and regret minimization appears to be unavoidable. More +broadly, this work aims to provide insight into the intersection of online +learning and mechanism design. + +
+
+ comment: To appear at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion-based Generative Image Outpainting for Recovery of + FOV-Truncated CT Images + + +
+ Field-of-view (FOV) recovery of truncated chest CT scans is crucial for +accurate body composition analysis, which involves quantifying skeletal muscle +and subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables +disease prognostication. Here, we present a method for recovering truncated CT +slices using generative image outpainting. We train a diffusion model and apply +it to truncated CT slices generated by simulating a small FOV. Our model +reliably recovers the truncated anatomy and outperforms the previous +state-of-the-art despite being trained on 87% less data. + +
+
+ comment: Shared last authorship: Florian J. Fintelmann and Philip M\"uller +
+
+
+
+
+ + ♻ ☆ Characterizing stable regions in the residual stream of LLMs + + +
+ We identify "stable regions" in the residual stream of Transformers, where +the model's output remains insensitive to small activation changes, but +exhibits high sensitivity at region boundaries. These regions emerge during +training and become more defined as training progresses or model size +increases. The regions appear to be much larger than previously studied +polytopes. Our analysis suggests that these stable regions align with semantic +distinctions, where similar prompts cluster within regions, and activations +from the same region lead to similar next token predictions. This work provides +a promising research direction for understanding the complexity of neural +networks, shedding light on training dynamics, and advancing interpretability. + +
+
+
+
+
+ + ♻ ☆ Learning Constrained Markov Decision Processes With Non-stationary + Rewards and Constraints + + +
+ In constrained Markov decision processes (CMDPs) with adversarial rewards and +constraints, a well-known impossibility result prevents any algorithm from +attaining both sublinear regret and sublinear constraint violation, when +competing against a best-in-hindsight policy that satisfies constraints on +average. In this paper, we show that this negative result can be eased in CMDPs +with non-stationary rewards and constraints, by providing algorithms whose +performances smoothly degrade as non-stationarity increases. Specifically, we +propose algorithms attaining $\tilde{\mathcal{O}} (\sqrt{T} + C)$ regret and +positive constraint violation under bandit feedback, where $C$ is a corruption +value measuring the environment non-stationarity. This can be $\Theta(T)$ in +the worst case, coherently with the impossibility result for adversarial CMDPs. +First, we design an algorithm with the desired guarantees when $C$ is known. +Then, in the case $C$ is unknown, we show how to obtain the same results by +embedding such an algorithm in a general meta-procedure. This is of independent +interest, as it can be applied to any non-stationary constrained online +learning setting. + +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ What happens to diffusion model likelihood when your model is + conditional? + + +
+ Diffusion Models (DMs) iteratively denoise random samples to produce +high-quality data. The iterative sampling process is derived from Stochastic +Differential Equations (SDEs), allowing a speed-quality trade-off chosen at +inference. Another advantage of sampling with differential equations is exact +likelihood computation. These likelihoods have been used to rank unconditional +DMs and for out-of-domain classification. Despite the many existing and +possible uses of DM likelihoods, the distinct properties captured are unknown, +especially in conditional contexts such as Text-To-Image (TTI) or +Text-To-Speech synthesis (TTS). Surprisingly, we find that TTS DM likelihoods +are agnostic to the text input. TTI likelihood is more expressive but cannot +discern confounding prompts. Our results show that applying DMs to conditional +tasks reveals inconsistencies and strengthens claims that the properties of DM +likelihood are unknown. This impact sheds light on the previously unknown +nature of DM likelihoods. Although conditional DMs maximise likelihood, the +likelihood in question is not as sensitive to the conditioning input as one +expects. This investigation provides a new point-of-view on diffusion +likelihoods. + +
+
+
+
+
+ + ♻ ☆ Explainable AI needs formal notions of explanation correctness + + +
+ The use of machine learning (ML) in critical domains such as medicine poses +risks and requires regulation. One requirement is that decisions of ML systems +in high-risk applications should be human-understandable. The field of +"explainable artificial intelligence" (XAI) seemingly addresses this need. +However, in its current form, XAI is unfit to provide quality control for ML; +it itself needs scrutiny. Popular XAI methods cannot reliably answer important +questions about ML models, their training data, or a given test input. We +recapitulate results demonstrating that popular XAI methods systematically +attribute importance to input features that are independent of the prediction +target. This limits their utility for purposes such as model and data +(in)validation, model improvement, and scientific discovery. We argue that the +fundamental reason for this limitation is that current XAI methods do not +address well-defined problems and are not evaluated against objective criteria +of explanation correctness. Researchers should formally define the problems +they intend to solve first and then design methods accordingly. This will lead +to notions of explanation correctness that can be theoretically verified and +objective metrics of explanation performance that can be assessed using +ground-truth data. + +
+
+
+
+
+ + ♻ ☆ Efficient Combinatorial Optimization via Heat Diffusion NeurIPS 2024 + + +
+ Combinatorial optimization problems are widespread but inherently challenging +due to their discrete nature. The primary limitation of existing methods is +that they can only access a small fraction of the solution space at each +iteration, resulting in limited efficiency for searching the global optimal. To +overcome this challenge, diverging from conventional efforts of expanding the +solver's search scope, we focus on enabling information to actively propagate +to the solver through heat diffusion. By transforming the target function while +preserving its optima, heat diffusion facilitates information flow from distant +regions to the solver, providing more efficient navigation. Utilizing heat +diffusion, we propose a framework for solving general combinatorial +optimization problems. The proposed methodology demonstrates superior +performance across a range of the most challenging and widely encountered +combinatorial optimizations. Echoing recent advancements in harnessing +thermodynamics for generative artificial intelligence, our study further +reveals its significant potential in advancing combinatorial optimization. + +
+
+ comment: After the rebuttal version for NeurIPS 2024 (poster). Code is + available in https://github.com/AwakerMhy/HeO +
+
+
+
+
+ + ♻ ☆ Learning to Receive Help: Intervention-Aware Concept Embedding Models NeurIPS 2023 + + +
+ Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures +by constructing and explaining their predictions using a set of high-level +concepts. A special property of these models is that they permit concept +interventions, wherein users can correct mispredicted concepts and thus improve +the model's performance. Recent work, however, has shown that intervention +efficacy can be highly dependent on the order in which concepts are intervened +on and on the model's architecture and training hyperparameters. We argue that +this is rooted in a CBM's lack of train-time incentives for the model to be +appropriately receptive to concept interventions. To address this, we propose +Intervention-aware Concept Embedding models (IntCEMs), a novel CBM-based +architecture and training paradigm that improves a model's receptiveness to +test-time interventions. Our model learns a concept intervention policy in an +end-to-end fashion from where it can sample meaningful intervention +trajectories at train-time. This conditions IntCEMs to effectively select and +receive concept interventions when deployed at test-time. Our experiments show +that IntCEMs significantly outperform state-of-the-art concept-interpretable +models when provided with test-time concept interventions, demonstrating the +effectiveness of our approach. + +
+
+ comment: Accepted as a spotlight at the Thirty-seventh Conference on Neural + Information Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ♻ ☆ Neural Exploratory Landscape Analysis + + +
+ Recent research in Meta-Black-Box Optimization (MetaBBO) have shown that +meta-trained neural networks can effectively guide the design of black-box +optimizers, significantly reducing the need for expert tuning and delivering +robust performance across complex problem distributions. Despite their success, +a paradox remains: MetaBBO still rely on human-crafted Exploratory Landscape +Analysis features to inform the meta-level agent about the low-level +optimization progress. To address the gap, this paper proposes Neural +Exploratory Landscape Analysis (NeurELA), a novel framework that dynamically +profiles landscape features through a two-stage, attention-based neural +network, executed in an entirely end-to-end fashion. NeurELA is pre-trained +over a variety of MetaBBO algorithms using a multi-task neuroevolution +strategy. Extensive experiments show that NeurELA achieves consistently +superior performance when integrated into different and even unseen MetaBBO +tasks and can be efficiently fine-tuned for further performance boost. This +advancement marks a pivotal step in making MetaBBO algorithms more autonomous +and broadly applicable.The source code of NeurELA can be accessed at +https://anonymous.4open.science/r/Neur-ELA-303C. + +
+
+
+
+
+ + ♻ ☆ Unsupervisedly Learned Representations: Should the Quest be Over? + + +
+ After four decades of research there still exists a Classification accuracy +gap of about 20% between our best Unsupervisedly Learned Representations +methods and the accuracy rates achieved by intelligent animals. It thus may +well be that we are looking in the wrong direction. A possible solution to this +puzzle is presented. We demonstrate that Reinforcement Learning can learn +representations which achieve the same accuracy as that of animals. Our main +modest contribution lies in the observations that: a. when applied to a real +world environment Reinforcement Learning does not require labels, and thus may +be legitimately considered as Unsupervised Learning, and b. in contrast, when +Reinforcement Learning is applied in a simulated environment it does inherently +require labels and should thus be generally be considered as Supervised +Learning. The corollary of these observations is that further search for +Unsupervised Learning competitive paradigms which may be trained in simulated +environments may be futile. + +
+
+ comment: To be published at The 6th International Conference on Machine + Learning, Optimization and Data Science - LOD 2020 +
+
+
+
+
+ + ♻ ☆ Exploring Selective Layer Fine-Tuning in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for fine-tuning +foundation models using distributed data in a privacy-preserving manner. Under +limited computational resources, clients often find it more practical to +fine-tune a selected subset of layers, rather than the entire model, based on +their task-specific data. In this study, we provide a thorough theoretical +exploration of selective layer fine-tuning in FL, emphasizing a flexible +approach that allows the clients to adjust their selected layers according to +their local data and resources. We theoretically demonstrate that the layer +selection strategy has a significant impact on model convergence in two +critical aspects: the importance of selected layers and the heterogeneous +choices across clients. Drawing from these insights, we further propose a +strategic layer selection method that utilizes local gradients and regulates +layer selections across clients. The extensive experiments on both image and +text datasets demonstrate the effectiveness of the proposed strategy compared +with several baselines, highlighting its advances in identifying critical +layers that adapt to the client heterogeneity and training dynamics in FL. + +
+
+
+
+
+ + ♻ ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing sequences of interactions between users and items, sequential +recommendation models can learn user intent and make predictions about the next +item. Next to item interactions, most systems also have interactions with what +we call non-item pages: these pages are not related to specific items but still +can provide insights of the user's interests, as, for example, navigation +pages. + We therefore propose a general way to include these non-item pages in +sequential recommendation models to enhance next-item prediction. First, we +demonstrate the influence of non-item pages on following interactions with the +hypotheses testing framework HypTrails and propose methods for representing +non-item pages in sequential recommendation models. Subsequently, we adapt +popular sequential recommender models to integrate non-item pages and +investigate their performance with different item representation strategies as +well as their ability to handle noisy data. To show the general capabilities of +the models to integrate non-item pages, we create a synthetic dataset for a +controlled setting and then evaluate the improvements from including non-item +pages on two real-world datasets. + Our results show that non-item pages are a valuable source of information, +and incorporating them in sequential recommendation models increases the +performance of next-item prediction across all analyzed model architectures. + +
+
+ comment: 37 pages, 19 figures; Submitted to ACM TORS +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ dlordinal: a Python package for deep ordinal classification + + +
+ dlordinal is a new Python library that unifies many recent deep ordinal +classification methodologies available in the literature. Developed using +PyTorch as underlying framework, it implements the top performing +state-of-the-art deep learning techniques for ordinal classification problems. +Ordinal approaches are designed to leverage the ordering information present in +the target variable. Specifically, it includes loss functions, various output +layers, dropout techniques, soft labelling methodologies, and other +classification strategies, all of which are appropriately designed to +incorporate the ordinal information. Furthermore, as the performance metrics to +assess novel proposals in ordinal classification depend on the distance between +target and predicted classes in the ordinal scale, suitable ordinal evaluation +metrics are also included. dlordinal is distributed under the BSD-3-Clause +license and is available at https://github.com/ayrna/dlordinal. + +
+
+
+
+
+ + ♻ ☆ Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable + Tensor Collections + + +
+ Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining +data, model, and pipeline parallelism, to use large GPU clusters efficiently. +Long-running jobs may experience changes to their GPU allocation: (i) resource +elasticity during training adds or removes GPUs; (ii) hardware maintenance may +require redeployment on different GPUs; and (iii) GPU failures force jobs to +run with fewer devices. Current DL frameworks tie jobs to a set of GPUs and +thus lack support for these scenarios. In particular, they cannot change the +multi-dimensional parallelism of an already-running job in an efficient and +model-independent way. + We describe Scalai, a state management library for DL systems that enables +jobs to change their parallelism dynamically after the GPU allocation is +updated at runtime. Scalai achieves this through a new abstraction, a +parallelizable tensor collection (PTC), that externalizes the job state during +training. After a GPU change, Scalai uses the PTC to transform the job state: +the PTC repartitions the dataset state under data parallelism and exposes it to +DL workers through a virtual file system; and the PTC obtains the model state +as partitioned checkpoints and transforms them to reflect the new +parallelization configuration. For efficiency, Scalai executes PTC +transformations in parallel with minimum data movement between workers. Our +experiments show that Scalai enables DL jobs to support dynamic parallelization +with low overhead. + +
+
+ comment: The 30th Symposium on Operating Systems Principles (SOSP24) +
+
+
+
+
+ + ♻ ☆ IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided + Feature Extraction + + +
+ Disruption prediction has made rapid progress in recent years, especially in +machine learning (ML)-based methods. Understanding why a predictor makes a +certain prediction can be as crucial as the prediction's accuracy for future +tokamak disruption predictors. The purpose of most disruption predictors is +accuracy or cross-machine capability. However, if a disruption prediction model +can be interpreted, it can tell why certain samples are classified as +disruption precursors. This allows us to tell the types of incoming disruption +and gives us insight into the mechanism of disruption. This paper designs a +disruption predictor called Interpretable Disruption Predictor based On +Physics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction +performance of the model is effectively improved by extracting physics-guided +features. A high-performance model is required to ensure the validity of the +interpretation results. The interpretability study of IDP-PGFE provides an +understanding of J-TEXT disruption and is generally consistent with existing +comprehension of disruption. IDP-PGFE has been applied to the disruption due to +continuously increasing density towards density limit experiments on J-TEXT. +The time evolution of the PGFE features contribution demonstrates that the +application of ECRH triggers radiation-caused disruption, which lowers the +density at disruption. While the application of RMP indeed raises the density +limit in J-TEXT. The interpretability study guides intuition on the physical +mechanisms of density limit disruption that RMPs affect not only the MHD +instabilities but also the radiation profile, which delays density limit +disruption. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous + Federated Learning Framework + + +
+ Traditional federated learning (FL) frameworks rely heavily on terrestrial +networks, where coverage limitations and increasing bandwidth congestion +significantly hinder model convergence. Fortunately, the advancement of +low-Earth orbit (LEO) satellite networks offers promising new communication +avenues to augment traditional terrestrial FL. Despite this potential, the +limited satellite-ground communication bandwidth and the heterogeneous +operating environments of ground devices-including variations in data, +bandwidth, and computing power-pose substantial challenges for effective and +robust satellite-assisted FL. To address these challenges, we propose SatFed, a +resource-efficient satellite-assisted heterogeneous FL framework. SatFed +implements freshness-based model prioritization queues to optimize the use of +highly constrained satellite-ground bandwidth, ensuring the transmission of the +most critical models. Additionally, a multigraph is constructed to capture +real-time heterogeneous relationships between devices, including data +distribution, terrestrial bandwidth, and computing capability. This multigraph +enables SatFed to aggregate satellite-transmitted models into peer guidance, +enhancing local training in heterogeneous environments. Extensive experiments +with real-world LEO satellite networks demonstrate that SatFed achieves +superior performance and robustness compared to state-of-the-art benchmarks. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ VARADE: a Variational-based AutoRegressive model for Anomaly Detection + on the Edge + + +
+ Detecting complex anomalies on massive amounts of data is a crucial task in +Industry 4.0, best addressed by deep learning. However, available solutions are +computationally demanding, requiring cloud architectures prone to latency and +bandwidth issues. This work presents VARADE, a novel solution implementing a +light autoregressive framework based on variational inference, which is best +suited for real-time execution on the edge. The proposed approach was validated +on a robotic arm, part of a pilot production line, and compared with several +state-of-the-art algorithms, obtaining the best trade-off between anomaly +detection accuracy, power consumption and inference frequency on two different +edge platforms. + +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems play a crucial role in achieving Sustainable +Development Goals by collecting and analyzing vital global data through +satellite networks. These systems are essential for tasks like mapping, +disaster monitoring, and resource management, but they face challenges in +processing and transmitting large volumes of EO data, especially in specialized +fields such as agriculture and real-time disaster response. Domain-adapted +Large Language Models (LLMs) provide a promising solution by facilitating data +fusion between extensive EO data and semantic EO data. By improving integration +and interpretation of diverse datasets, LLMs address the challenges of +processing specialized information in agriculture and disaster response +applications. This fusion enhances the accuracy and relevance of transmitted +data. This paper presents a framework for semantic communication in EO +satellite networks, aimed at improving data transmission efficiency and overall +system performance through cognitive processing techniques. The proposed system +employs Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic +Data Augmentation (SA) to focus on relevant information while minimizing +communication overhead. By integrating cognitive semantic processing and +inter-satellite links, the framework enhances the analysis and transmission of +multispectral satellite imagery, improving object detection, pattern +recognition, and real-time decision-making. The introduction of Cognitive +Semantic Augmentation (CSA) allows satellites to process and transmit semantic +information, boosting adaptability to changing environments and application +needs. This end-to-end architecture is tailored for next-generation satellite +networks, such as those supporting 6G, and demonstrates significant +improvements in efficiency and accuracy. + +
+
+ comment: 18 pages, 10 figures, magazine +
+
+
+
+
+ + ♻ ☆ Realising Synthetic Active Inference Agents, Part II: Variational + Message Updates + + +
+ The Free Energy Principle (FEP) describes (biological) agents as minimising a +variational Free Energy (FE) with respect to a generative model of their +environment. Active Inference (AIF) is a corollary of the FEP that describes +how agents explore and exploit their environment by minimising an expected FE +objective. In two related papers, we describe a scalable, epistemic approach to +synthetic AIF, by message passing on free-form Forney-style Factor Graphs +(FFGs). A companion paper (part I) introduces a Constrained FFG (CFFG) notation +that visually represents (generalised) FE objectives for AIF. The current paper +(part II) derives message passing algorithms that minimise (generalised) FE +objectives on a CFFG by variational calculus. A comparison between simulated +Bethe and generalised FE agents illustrates how the message passing approach to +synthetic AIF induces epistemic behaviour on a T-maze navigation task. +Extension of the T-maze simulation to 1) learning goal statistics, and 2) a +multi-agent bargaining setting, illustrate how this approach encourages reuse +of nodes and updates in alternative settings. With a full message passing +account of synthetic AIF agents, it becomes possible to derive and reuse +message updates across models and move closer to industrial applications of +synthetic AIF. + +
+
+
+
+
+ + ♻ ☆ EPTQ: Enhanced Post-Training Quantization via Hessian-guided + Network-wise Optimization + + +
+ Quantization is a key method for deploying deep neural networks on edge +devices with limited memory and computation resources. Recent improvements in +Post-Training Quantization (PTQ) methods were achieved by an additional local +optimization process for learning the weight quantization rounding policy. +However, a gap exists when employing network-wise optimization with small +representative datasets. In this paper, we propose a new method for enhanced +PTQ (EPTQ) that employs a network-wise quantization optimization process, which +benefits from considering cross-layer dependencies during optimization. EPTQ +enables network-wise optimization with a small representative dataset using a +novel sample-layer attention score based on a label-free Hessian matrix upper +bound. The label-free approach makes our method suitable for the PTQ scheme. We +give a theoretical analysis for the said bound and use it to construct a +knowledge distillation loss that guides the optimization to focus on the more +sensitive layers and samples. In addition, we leverage the Hessian upper bound +to improve the weight quantization parameters selection by focusing on the more +sensitive elements in the weight tensors. Empirically, by employing EPTQ we +achieve state-of-the-art results on various models, tasks, and datasets, +including ImageNet classification, COCO object detection, and Pascal-VOC for +semantic segmentation. + +
+
+
+
+
+ + ♻ ☆ Recurrent Stochastic Configuration Networks for Temporal Data Analytics + + +
+ Temporal data modelling techniques with neural networks are useful in many +domain applications, including time-series forecasting and control engineering. +This paper aims at developing a recurrent version of stochastic configuration +networks (RSCNs) for problem solving, where we have no underlying assumption on +the dynamic orders of the input variables. Given a collection of historical +data, we first build an initial RSCN model in the light of a supervisory +mechanism, followed by an online update of the output weights by using a +projection algorithm. Some theoretical results are established, including the +echo state property, the universal approximation property of RSCNs for both the +offline and online learnings, and the convergence of the output weights. The +proposed RSCN model is remarkably distinguished from the well-known echo state +networks (ESNs) in terms of the way of assigning the input random weight matrix +and a special structure of the random feedback matrix. A comprehensive +comparison study among the long short-term memory (LSTM) network, the original +ESN, and several state-of-the-art ESN methods such as the simple cycle +reservoir (SCR), the polynomial ESN (PESN), the leaky-integrator ESN (LIESN) +and RSCN is carried out. Numerical results clearly indicate that the proposed +RSCN performs favourably over all of the datasets. + +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs + + +
+ Training large language models (LLMs) for external tool usage is a rapidly +expanding field, with recent research focusing on generating synthetic data to +address the shortage of available data. However, the absence of systematic data +quality checks poses complications for properly training and testing models. To +that end, we propose two approaches for assessing the reliability of data for +training LLMs to use external tools. The first approach uses intuitive, +human-defined correctness criteria. The second approach uses a model-driven +assessment with in-context evaluation. We conduct a thorough evaluation of data +quality on two popular benchmarks, followed by an extrinsic evaluation that +showcases the impact of data quality on model performance. Our results +demonstrate that models trained on high-quality data outperform those trained +on unvalidated data, even when trained with a smaller quantity of data. These +findings empirically support the significance of assessing and ensuring the +reliability of training data for tool-using LLMs. + +
+
+
+
+
+ + ♻ ☆ Hybrid Spiking Neural Networks for Low-Power Intra-Cortical + Brain-Machine Interfaces + + +
+ Intra-cortical brain-machine interfaces (iBMIs) have the potential to +dramatically improve the lives of people with paraplegia by restoring their +ability to perform daily activities. However, current iBMIs suffer from +scalability and mobility limitations due to bulky hardware and wiring. Wireless +iBMIs offer a solution but are constrained by a limited data rate. To overcome +this challenge, we are investigating hybrid spiking neural networks for +embedded neural decoding in wireless iBMIs. The networks consist of a temporal +convolution-based compression followed by recurrent processing and a final +interpolation back to the original sequence length. As recurrent units, we +explore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons, +and a combination of both - spiking GRUs (sGRUs) and analyze their differences +in terms of accuracy, footprint, and activation sparsity. To that end, we train +decoders on the "Nonhuman Primate Reaching with Multichannel Sensorimotor +Cortex Electrophysiology" dataset and evaluate it using the NeuroBench +framework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural +Decoding. Our approach achieves high accuracy in predicting velocities of +primate reaching movements from multichannel primary motor cortex recordings +while maintaining a low number of synaptic operations, surpassing the current +baseline models in the NeuroBench framework. This work highlights the potential +of hybrid neural networks to facilitate wireless iBMIs with high decoding +precision and a substantial increase in the number of monitored neurons, paving +the way toward more advanced neuroprosthetic technologies. + +
+
+ comment: This work has been accepted at the 2024 IEEE Biomedical Circuits and + Systems Conference +
+
+
+
+
+ + ♻ ☆ Improving Fast Adversarial Training Paradigm: An Example Taxonomy + Perspective + + +
+ While adversarial training is an effective defense method against adversarial +attacks, it notably increases the training cost. To this end, fast adversarial +training (FAT) is presented for efficient training and has become a hot +research topic. However, FAT suffers from catastrophic overfitting, which leads +to a performance drop compared with multi-step adversarial training. However, +the cause of catastrophic overfitting remains unclear and lacks exploration. In +this paper, we present an example taxonomy in FAT, which identifies that +catastrophic overfitting is caused by the imbalance between the inner and outer +optimization in FAT. Furthermore, we investigated the impact of varying degrees +of training loss, revealing a correlation between training loss and +catastrophic overfitting. Based on these observations, we redesign the loss +function in FAT with the proposed dynamic label relaxation to concentrate the +loss range and reduce the impact of misclassified examples. Meanwhile, we +introduce batch momentum initialization to enhance the diversity to prevent +catastrophic overfitting in an efficient manner. Furthermore, we also propose +Catastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate +training strategy for examples based on their loss degree. Our proposed method, +named example taxonomy aware FAT (ETA), establishes an improved paradigm for +FAT. Experiment results demonstrate our ETA achieves state-of-the-art +performance. Comprehensive experiments on four standard datasets demonstrate +the competitiveness of our proposed method. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ GlycanML: A Multi-Task and Multi-Structure Benchmark for Glycan Machine + Learning + + +
+ Glycans are basic biomolecules and perform essential functions within living +organisms. The rapid increase of functional glycan data provides a good +opportunity for machine learning solutions to glycan understanding. However, +there still lacks a standard machine learning benchmark for glycan function +prediction. In this work, we fill this blank by building a comprehensive +benchmark for Glycan Machine Learning (GlycanML). The GlycanML benchmark +consists of diverse types of tasks including glycan taxonomy prediction, glycan +immunogenicity prediction, glycosylation type prediction, and protein-glycan +interaction prediction. Glycans can be represented by both sequences and graphs +in GlycanML, which enables us to extensively evaluate sequence-based models and +graph neural networks (GNNs) on benchmark tasks. Furthermore, by concurrently +performing eight glycan taxonomy prediction tasks, we introduce the +GlycanML-MTL testbed for multi-task learning (MTL) algorithms. Experimental +results show the superiority of modeling glycans with multi-relational GNNs, +and suitable MTL methods can further boost model performance. We provide all +datasets and source codes at https://github.com/GlycanML/GlycanML and maintain +a leaderboard at https://GlycanML.github.io/project + +
+
+ comment: Research project paper. All code and data are released +
+
+
+
+
+ + ♻ ☆ Understanding the Expressivity and Trainability of Fourier Neural + Operator: A Mean-Field Perspective + + +
+ In this paper, we explores the expressivity and trainability of the Fourier +Neural Operator (FNO). We establish a mean-field theory for the FNO, analyzing +the behavior of the random FNO from an edge of chaos perspective. Our +investigation into the expressivity of a random FNO involves examining the +ordered-chaos phase transition of the network based on the weight distribution. +This phase transition demonstrates characteristics unique to the FNO, induced +by mode truncation, while also showcasing similarities to those of densely +connected networks. Furthermore, we identify a connection between expressivity +and trainability: the ordered and chaotic phases correspond to regions of +vanishing and exploding gradients, respectively. This finding provides a +practical prerequisite for the stable training of the FNO. Our experimental +results corroborate our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ INT-FlashAttention: Enabling Flash Attention for INT8 Quantization + + +
+ As the foundation of large language models (LLMs), self-attention module +faces the challenge of quadratic time and memory complexity with respect to +sequence length. FlashAttention accelerates attention computation and reduces +its memory usage by leveraging the GPU memory hierarchy. A promising research +direction is to integrate FlashAttention with quantization methods. This paper +introduces INT-FlashAttention, the first INT8 quantization architecture +compatible with the forward workflow of FlashAttention, which significantly +improves the inference speed of FlashAttention on Ampere GPUs. We implement our +INT-FlashAttention prototype with fully INT8 activations and general +matrix-multiplication (GEMM) kernels, making it the first attention operator +with fully INT8 input. As a general token-level post-training quantization +framework, INT-FlashAttention is also compatible with other data formats like +INT4, etc. Experimental results show INT-FlashAttention achieves 72% faster +inference speed and 82% smaller quantization error compared to standard +FlashAttention with FP16 and FP8 data format. + +
+
+
+
+
+ + ♻ ☆ Bivariate DeepKriging for Large-scale Spatial Interpolation of Wind + Fields + + +
+ High spatial resolution wind data are essential for a wide range of +applications in climate, oceanographic and meteorological studies. Large-scale +spatial interpolation or downscaling of bivariate wind fields having velocity +in two dimensions is a challenging task because wind data tend to be +non-Gaussian with high spatial variability and heterogeneity. In spatial +statistics, cokriging is commonly used for predicting bivariate spatial fields. +However, the cokriging predictor is not optimal except for Gaussian processes. +Additionally, cokriging is computationally prohibitive for large datasets. In +this paper, we propose a method, called bivariate DeepKriging, which is a +spatially dependent deep neural network (DNN) with an embedding layer +constructed by spatial radial basis functions for bivariate spatial data +prediction. We then develop a distribution-free uncertainty quantification +method based on bootstrap and ensemble DNN. Our proposed approach outperforms +the traditional cokriging predictor with commonly used covariance functions, +such as the linear model of co-regionalization and flexible bivariate Mat\'ern +covariance. We demonstrate the computational efficiency and scalability of the +proposed DNN model, with computations that are, on average, 20 times faster +than those of conventional techniques. We apply the bivariate DeepKriging +method to the wind data over the Middle East region at 506,771 locations. The +prediction performance of the proposed method is superior over the cokriging +predictors and dramatically reduces computation time. + +
+
+
+
+
+ + ♻ ☆ Learning Variable Compliance Control From a Few Demonstrations for + Bimanual Robot with Haptic Feedback Teleoperation System IROS 2024 + + +
+ Automating dexterous, contact-rich manipulation tasks using rigid robots is a +significant challenge in robotics. Rigid robots, defined by their actuation +through position commands, face issues of excessive contact forces due to their +inability to adapt to contact with the environment, potentially causing damage. +While compliance control schemes have been introduced to mitigate these issues +by controlling forces via external sensors, they are hampered by the need for +fine-tuning task-specific controller parameters. Learning from Demonstrations +(LfD) offers an intuitive alternative, allowing robots to learn manipulations +through observed actions. In this work, we introduce a novel system to enhance +the teaching of dexterous, contact-rich manipulations to rigid robots. Our +system is twofold: firstly, it incorporates a teleoperation interface utilizing +Virtual Reality (VR) controllers, designed to provide an intuitive and +cost-effective method for task demonstration with haptic feedback. Secondly, we +present Comp-ACT (Compliance Control via Action Chunking with Transformers), a +method that leverages the demonstrations to learn variable compliance control +from a few demonstrations. Our methods have been validated across various +complex contact-rich manipulation tasks using single-arm and bimanual robot +setups in simulated and real-world environments, demonstrating the +effectiveness of our system in teaching robots dexterous manipulations with +enhanced adaptability and safety. Code available at: +https://github.com/omron-sinicx/CompACT + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ Decentralised Variational Inference Frameworks for Multi-object Tracking + on Sensor Network + + +
+ This paper tackles the challenge of multi-sensor multi-object tracking by +proposing various decentralised Variational Inference (VI) schemes that match +the tracking performance of centralised sensor fusion with only local message +exchanges among neighboring sensors. We first establish a centralised VI sensor +fusion scheme as a benchmark and analyse the limitations of its decentralised +counterpart, which requires sensors to await consensus at each VI iteration. +Therefore, we propose a decentralised gradient-based VI framework that +optimises the Locally Maximised Evidence Lower Bound (LM-ELBO) instead of the +standard ELBO, which reduces the parameter search space and enables faster +convergence, making it particularly beneficial for decentralised tracking.This +proposed framework is inherently self-evolving, improving with advancements in +decentralised optimisation techniques for convergence guarantees and +efficiency. Further, we enhance the convergence speed of proposed decentralised +schemes using natural gradients and gradient tracking strategies. Results +verify that our decentralised VI schemes are empirically equivalent to +centralised fusion in tracking performance. Notably, the decentralised natural +gradient VI method is the most communication-efficient, with communication +costs comparable to suboptimal decentralised strategies while delivering +notably higher tracking accuracy. + +
+
+
+
+
+ + ♻ ☆ Trust-Region Sequential Quadratic Programming for Stochastic + Optimization with Random Models + + +
+ In this work, we consider solving optimization problems with a stochastic +objective and deterministic equality constraints. We propose a Trust-Region +Sequential Quadratic Programming method to find both first- and second-order +stationary points. Our method utilizes a random model to represent the +objective function, which is constructed from stochastic observations of the +objective and is designed to satisfy proper adaptive accuracy conditions with a +high but fixed probability. To converge to first-order stationary points, our +method computes a gradient step in each iteration defined by minimizing a +quadratic approximation of the objective subject to a (relaxed) linear +approximation of the problem constraints and a trust-region constraint. To +converge to second-order stationary points, our method additionally computes an +eigen step to explore the negative curvature of the reduced Hessian matrix, as +well as a second-order correction step to address the potential Maratos effect, +which arises due to the nonlinearity of the problem constraints. Such an effect +may impede the method from moving away from saddle points. Both gradient and +eigen step computations leverage a novel parameter-free decomposition of the +step and the trust-region radius, accounting for the proportions among the +feasibility residual, optimality residual, and negative curvature. We establish +global almost sure first- and second-order convergence guarantees for our +method, and present computational results on CUTEst problems, regression +problems, and saddle-point problems to demonstrate its superiority over +existing line-search-based stochastic methods. + +
+
+ comment: 41 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ CMamba: Channel Correlation Enhanced State Space Models for Multivariate + Time Series Forecasting + + +
+ Recent advancements in multivariate time series forecasting have been +propelled by Linear-based, Transformer-based, and Convolution-based models, +with Transformer-based architectures gaining prominence for their efficacy in +temporal and cross-channel mixing. More recently, Mamba, a state space model, +has emerged with robust sequence and feature mixing capabilities. However, the +suitability of the vanilla Mamba design for time series forecasting remains an +open question, particularly due to its inadequate handling of cross-channel +dependencies. Capturing cross-channel dependencies is critical in enhancing the +performance of multivariate time series prediction. Recent findings show that +self-attention excels in capturing cross-channel dependencies, whereas other +simpler mechanisms, such as MLP, may degrade model performance. This is +counterintuitive, as MLP, being a learnable architecture, should theoretically +capture both correlations and irrelevances, potentially leading to neutral or +improved performance. Diving into the self-attention mechanism, we attribute +the observed degradation in MLP performance to its lack of data dependence and +global receptive field, which result in MLP's lack of generalization ability. +Based on the above insights, we introduce a refined Mamba variant tailored for +time series forecasting. Our proposed model, \textbf{CMamba}, incorporates a +modified Mamba (M-Mamba) module for temporal dependencies modeling, a global +data-dependent MLP (GDD-MLP) to effectively capture cross-channel dependencies, +and a Channel Mixup mechanism to mitigate overfitting. Comprehensive +experiments conducted on seven real-world datasets demonstrate the efficacy of +our model in improving forecasting performance. + +
+
+
+
+
+ + ♻ ☆ Bayesian Matrix Decomposition and Applications + + +
+ The sole aim of this book is to give a self-contained introduction to +concepts and mathematical tools in Bayesian matrix decomposition in order to +seamlessly introduce matrix decomposition techniques and their applications in +subsequent sections. However, we clearly realize our inability to cover all the +useful and interesting results concerning Bayesian matrix decomposition and +given the paucity of scope to present this discussion, e.g., the separated +analysis of variational inference for conducting the optimization. We refer the +reader to literature in the field of Bayesian analysis for a more detailed +introduction to the related fields. + This book is primarily a summary of purpose, significance of important +Bayesian matrix decomposition methods, e.g., real-valued decomposition, +nonnegative matrix factorization, Bayesian interpolative decomposition, and the +origin and complexity of the methods which shed light on their applications. +The mathematical prerequisite is a first course in statistics and linear +algebra. Other than this modest background, the development is self-contained, +with rigorous proof provided throughout. + +
+
+
+
+
+ + ♻ ☆ Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling + + +
+ In current deep learning tasks, Adam style optimizers such as Adam, Adagrad, +RMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style +optimizers. These optimizers typically update model parameters using the sign +of gradients, resulting in more stable convergence curves. The learning rate +and the batch size are the most critical hyperparameters for optimizers, which +require careful tuning to enable effective convergence. Previous research has +shown that the optimal learning rate increases linearly or follows similar +rules with batch size for SGD style optimizers. However, this conclusion is not +applicable to Adam style optimizers. In this paper, we elucidate the connection +between optimal learning rates and batch sizes for Adam style optimizers +through both theoretical analysis and extensive experiments. First, we raise +the scaling law between batch sizes and optimal learning rates in the sign of +gradient case, in which we prove that the optimal learning rate first rises and +then falls as the batch size increases. Moreover, the peak value of the surge +will gradually move toward the larger batch size as training progresses. +Second, we conducted experiments on various CV and NLP tasks and verified the +correctness of the scaling law. + +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization + of Diffusion Models + + +
+ Diffusion models have achieved great success in image generation tasks +through iterative noise estimation. However, the heavy denoising process and +complex neural networks hinder their low-latency applications in real-world +scenarios. Quantization can effectively reduce model complexity, and +post-training quantization (PTQ), which does not require fine-tuning, is highly +promising for compressing and accelerating diffusion models. Unfortunately, we +find that due to the highly dynamic distribution of activations in different +denoising steps, existing PTQ methods for diffusion models suffer from +distribution mismatch issues at both calibration sample level and +reconstruction output level, which makes the performance far from satisfactory, +especially in low-bit cases. In this paper, we propose Enhanced Distribution +Alignment for Post-Training Quantization of Diffusion Models (EDA-DM) to +address the above issues. Specifically, at the calibration sample level, we +select calibration samples based on the density and variety in the latent +space, thus facilitating the alignment of their distribution with the overall +samples; and at the reconstruction output level, we modify the loss of block +reconstruction with the losses of layers, aligning the outputs of quantized +model and full-precision model at different network granularity. Extensive +experiments demonstrate that EDA-DM significantly outperforms the existing PTQ +methods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and +different datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO). + +
+
+ comment: Code: http://github.com/BienLuky/EDA-DM +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Modeling the Popularity of Events on Web by Sparsity and + Mutual-Excitation Guided Graph Neural Network + + +
+ The content of a webpage described or posted an event in the cyberspace +inevitably reflects viewpoints, values and trends of the physical society. +Mapping an event on web to the popularity score plays a pivot role to sense the +social trends from the cyberspace. However, the complex semantic correspondence +between texts and images, as well as the implicit text-image-popularity mapping +mechanics pose a significant challenge to this non-trivial task. In this paper, +we address this problem from a viewpoint of understanding the interpretable +mapping mechanics. Concretely, we organize the keywords from different events +into an unified graph. The unified graph facilitates to model the popularity of +events via two-level mappings, i.e., the self excitation and the mutual +excitation. The self-excitation assumes that each keyword forms the popularity +while the mutual-excitation models that two keywords would excite each other to +determine the popularity of an event. Specifically, we use Graph Neural Network +(GNN) as the backbone to model the self-excitation, the mutual excitation and +the context of images into a sparse and deep factor model. Besides, to our best +knowledge, we release a challenge web event dataset for the popularity +prediction task. The experimental results on three public datasets demonstrate +that our method achieves significant improvements and outperforms the +state-of-the-art methods. Dataset is publicly available at: +https://github.com/pangjunbiao/Hot-events-dataset. + +
+
+
+
+
+ + ☆ Subjective and Objective Quality-of-Experience Evaluation Study for Live + Video Streaming + + +
+ In recent years, live video streaming has gained widespread popularity across +various social media platforms. Quality of experience (QoE), which reflects +end-users' satisfaction and overall experience, plays a critical role for media +service providers to optimize large-scale live compression and transmission +strategies to achieve perceptually optimal rate-distortion trade-off. Although +many QoE metrics for video-on-demand (VoD) have been proposed, there remain +significant challenges in developing QoE metrics for live video streaming. To +bridge this gap, we conduct a comprehensive study of subjective and objective +QoE evaluations for live video streaming. For the subjective QoE study, we +introduce the first live video streaming QoE dataset, TaoLive QoE, which +consists of $42$ source videos collected from real live broadcasts and $1,155$ +corresponding distorted ones degraded due to a variety of streaming +distortions, including conventional streaming distortions such as compression, +stalling, as well as live streaming-specific distortions like frame skipping, +variable frame rate, etc. Subsequently, a human study was conducted to derive +subjective QoE scores of videos in the TaoLive QoE dataset. For the objective +QoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well +as publicly available QoE datasets for VoD scenarios, highlighting that current +models struggle to accurately assess video QoE, particularly for live content. +Hence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates +multi-scale semantic features and optical flow-based motion features to +predicting a retrospective QoE score, eliminating reliance on statistical +quality of service (QoS) features. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ A Simple but Strong Baseline for Sounding Video Generation: Effective + Adaptation of Audio and Video Diffusion Models for Joint Generation + + +
+ In this work, we build a simple but strong baseline for sounding video +generation. Given base diffusion models for audio and video, we integrate them +with additional modules into a single model and train it to make the model +jointly generate audio and video. To enhance alignment between audio-video +pairs, we introduce two novel mechanisms in our model. The first one is +timestep adjustment, which provides different timestep information to each base +model. It is designed to align how samples are generated along with timesteps +across modalities. The second one is a new design of the additional modules, +termed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE, +cross-modal information is embedded as if it represents temporal position +information, and the embeddings are fed into the model like positional +encoding. Compared with the popular cross-attention mechanism, CMC-PE provides +a better inductive bias for temporal alignment in the generated data. +Experimental results validate the effectiveness of the two newly introduced +mechanisms and also demonstrate that our method outperforms existing methods. + +
+
+ comment: The source code will be released soon +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Multimodal Fusion via Hypergraph Autoencoder and Contrastive Learning + for Emotion Recognition in Conversation + + +
+ Multimodal emotion recognition in conversation (MERC) seeks to identify the +speakers' emotions expressed in each utterance, offering significant potential +across diverse fields. The challenge of MERC lies in balancing speaker modeling +and context modeling, encompassing both long-distance and short-distance +contexts, as well as addressing the complexity of multimodal information +fusion. Recent research adopts graph-based methods to model intricate +conversational relationships effectively. Nevertheless, the majority of these +methods utilize a fixed fully connected structure to link all utterances, +relying on convolution to interpret complex context. This approach can +inherently heighten the redundancy in contextual messages and excessive graph +network smoothing, particularly in the context of long-distance conversations. +To address this issue, we propose a framework that dynamically adjusts +hypergraph connections by variational hypergraph autoencoder (VHGAE), and +employs contrastive learning to mitigate uncertainty factors during the +reconstruction process. Experimental results demonstrate the effectiveness of +our proposal against the state-of-the-art methods on IEMOCAP and MELD datasets. +We release the code to support the reproducibility of this work at +https://github.com/yzjred/-HAUCL. + +
+
+ comment: Accepted by ACM MULTIMEDIA 2024 +
+
+
+
+
+ + ♻ ☆ Arena: A Patch-of-Interest ViT Inference Acceleration System for + Edge-Assisted Video Analytics + + +
+ The advent of edge computing has made real-time intelligent video analytics +feasible. Previous works, based on traditional model architecture (e.g., CNN, +RNN, etc.), employ various strategies to filter out non-region-of-interest +content to minimize bandwidth and computation consumption but show inferior +performance in adverse environments. Recently, visual foundation models based +on transformers have shown great performance in adverse environments due to +their amazing generalization capability. However, they require a large amount +of computation power, which limits their applications in real-time intelligent +video analytics. In this paper, we find visual foundation models like Vision +Transformer (ViT) also have a dedicated acceleration mechanism for video +analytics. To this end, we introduce Arena, an end-to-end edge-assisted video +inference acceleration system based on ViT. We leverage the capability of ViT +that can be accelerated through token pruning by only offloading and feeding +Patches-of-Interest to the downstream models. Additionally, we design an +adaptive keyframe inference switching algorithm tailored to different videos, +capable of adapting to the current video content to jointly optimize accuracy +and bandwidth. Through extensive experiments, our findings reveal that Arena +can boost inference speeds by up to 1.58\(\times\) and 1.82\(\times\) on +average while consuming only 47\% and 31\% of the bandwidth, respectively, all +with high inference accuracy. + +
+
+
+
+
+
+
+
+ + Robotics 63 + +
+
+
+ + RT-GuIDE: Real-Time Gaussian splatting for Information-Driven + Exploration ICRA2025 + + +
+ We propose a framework for active mapping and exploration that leverages +Gaussian splatting for constructing information-rich maps. Further, we develop +a parallelized motion planning algorithm that can exploit the Gaussian map for +real-time navigation. The Gaussian map constructed onboard the robot is +optimized for both photometric and geometric quality while enabling real-time +situational awareness for autonomy. We show through simulation experiments that +our method is competitive with approaches that use alternate information gain +metrics, while being orders of magnitude faster to compute. In real-world +experiments, our algorithm achieves better map quality (10% higher Peak +Signal-to-Noise Ratio (PSNR) and 30% higher geometric reconstruction accuracy) +than Gaussian maps constructed by traditional exploration baselines. Experiment +videos and more details can be found on our project page: +https://tyuezhan.github.io/RT_GuIDE/ + +
+
+ comment: Submitted to ICRA2025 +
+
+
+
+
+ + ☆ Robot See Robot Do: Imitating Articulated Object Manipulation with + Monocular 4D Reconstruction CoRL 2024 + + +
+ Humans can learn to manipulate new objects by simply watching others; +providing robots with the ability to learn from such demonstrations would +enable a natural interface specifying new behaviors. This work develops Robot +See Robot Do (RSRD), a method for imitating articulated object manipulation +from a single monocular RGB human demonstration given a single static +multi-view object scan. We first propose 4D Differentiable Part Models +(4D-DPM), a method for recovering 3D part motion from a monocular video with +differentiable rendering. This analysis-by-synthesis approach uses part-centric +feature fields in an iterative optimization which enables the use of geometric +regularizers to recover 3D motions from only a single video. Given this 4D +reconstruction, the robot replicates object trajectories by planning bimanual +arm motions that induce the demonstrated object part motion. By representing +demonstrations as part-centric trajectories, RSRD focuses on replicating the +demonstration's intended behavior while considering the robot's own +morphological limits, rather than attempting to reproduce the hand's motion. We +evaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part +trajectories and RSRD's physical execution performance on 9 objects across 10 +trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of +87% success rate, for a total end-to-end success rate of 60% across 90 trials. +Notably, this is accomplished using only feature fields distilled from large +pretrained vision models -- without any task-specific training, fine-tuning, +dataset collection, or annotation. Project page: +https://robot-see-robot-do.github.io + +
+
+ comment: CoRL 2024, Project page: https://robot-see-robot-do.github.io +
+
+
+
+
+ + EvMAPPER: High Altitude Orthomapping with Event Cameras + + +
+ Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to +collect images about the world below. One of the most successful applications +of UAVs is to generate orthomosaics or orthomaps, in which a series of images +are integrated together to develop a larger map. However, the use of CMOS-based +cameras with global or rolling shutters mean that orthomaps are vulnerable to +challenging light conditions, motion blur, and high-speed motion of +independently moving objects under the camera. Event cameras are less sensitive +to these issues, as their pixels are able to trigger asynchronously on +brightness changes. This work introduces the first orthomosaic approach using +event cameras. In contrast to existing methods relying only on CMOS cameras, +our approach enables map generation even in challenging light conditions, +including direct sunlight and after sunset. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Language-Embedded Gaussian Splats (LEGS): Incrementally Building + Room-Scale Representations with a Mobile Robot + + +
+ Building semantic 3D maps is valuable for searching for objects of interest +in offices, warehouses, stores, and homes. We present a mapping system that +incrementally builds a Language-Embedded Gaussian Splat (LEGS): a detailed 3D +scene representation that encodes both appearance and semantics in a unified +representation. LEGS is trained online as a robot traverses its environment to +enable localization of open-vocabulary object queries. We evaluate LEGS on 4 +room-scale scenes where we query for objects in the scene to assess how LEGS +can capture semantic meaning. We compare LEGS to LERF and find that while both +systems have comparable object query success rates, LEGS trains over 3.5x +faster than LERF. Results suggest that a multi-camera setup and incremental +bundle adjustment can boost visual reconstruction quality in constrained robot +trajectories, and suggest LEGS can localize open-vocabulary and long-tail +object queries with up to 66% accuracy. + +
+
+
+
+
+ + ☆ StackGen: Generating Stable Structures from Silhouettes via Diffusion + + +
+ Humans naturally obtain intuition about the interactions between and the +stability of rigid objects by observing and interacting with the world. It is +this intuition that governs the way in which we regularly configure objects in +our environment, allowing us to build complex structures from simple, everyday +objects. Robotic agents, on the other hand, traditionally require an explicit +model of the world that includes the detailed geometry of each object and an +analytical model of the environment dynamics, which are difficult to scale and +preclude generalization. Instead, robots would benefit from an awareness of +intuitive physics that enables them to similarly reason over the stable +interaction of objects in their environment. Towards that goal, we propose +StackGen, a diffusion model that generates diverse stable configurations of +building blocks matching a target silhouette. To demonstrate the capability of +the method, we evaluate it in a simulated environment and deploy it in the real +setting using a robotic arm to assemble structures generated by the model. + +
+
+
+
+
+ + ☆ A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale + Autonomous Vehicle + + +
+ In recent years, several competitions have highlighted the need to +investigate vision-based solutions to address scenarios with functional +insufficiencies in perception, world modeling and localization. This article +presents the Vision-based Lane Keeping System (VbLKS) developed by the +DEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022. +The main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied +VbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a +tailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading +Error (LHE), is estimated at a constant lookahead distance employing a +Convolutional Neural Network (CNN). A training strategy for a compact CNN is +proposed, emphasizing data generation and augmentation on simulated camera +images from a 3D Gazebo simulator, and enabling real-time operation on +low-level hardware. A tailored PP-based lateral controller equipped with a +derivative action and a PP-based velocity reference generation are implemented. +Tuning ranges are established through a systematic time-delay stability +analysis. Validation in a representative controlled laboratory setting is +provided. + +
+
+ comment: 16 pages, 23 figures +
+
+
+
+
+ + ☆ DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion + Probabilistic Models + + +
+ Perception systems play a crucial role in autonomous driving, incorporating +multiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors +are widely used to capture sparse point clouds of the vehicle's surroundings. +However, such systems struggle to perceive occluded areas and gaps in the scene +due to the sparsity of these point clouds and their lack of semantics. To +address these challenges, Semantic Scene Completion (SSC) jointly predicts +unobserved geometry and semantics in the scene given raw LiDAR measurements, +aiming for a more complete scene representation. Building on promising results +of diffusion models in image generation and super-resolution tasks, we propose +their extension to SSC by implementing the noising and denoising diffusion +processes in the point and semantic spaces individually. To control the +generation, we employ semantic LiDAR point clouds as conditional input and +design local and global regularization losses to stabilize the denoising +process. We evaluate our approach on autonomous driving datasets and our +approach outperforms the state-of-the-art for SSC. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ GSON: A Group-based Social Navigation Framework with Large Multimodal + Model + + +
+ As the number of service robots and autonomous vehicles in human-centered +environments grows, their requirements go beyond simply navigating to a +destination. They must also take into account dynamic social contexts and +ensure respect and comfort for others in shared spaces, which poses significant +challenges for perception and planning. In this paper, we present a group-based +social navigation framework GSON to enable mobile robots to perceive and +exploit the social group of their surroundings by leveling the visual reasoning +capability of the Large Multimodal Model (LMM). For perception, we apply visual +prompting techniques to zero-shot extract the social relationship among +pedestrians and combine the result with a robust pedestrian detection and +tracking pipeline to alleviate the problem of low inference speed of the LMM. +Given the perception result, the planning system is designed to avoid +disrupting the current social structure. We adopt a social structure-based +mid-level planner as a bridge between global path planning and local motion +planning to preserve the global context and reactive response. The proposed +method is validated on real-world mobile robot navigation tasks involving +complex social structure understanding and reasoning. Experimental results +demonstrate the effectiveness of the system in these scenarios compared with +several baselines. + +
+
+
+
+
+ + ☆ SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language + Models for Robotic Garment Manipulation + + +
+ Automating garment manipulation poses a significant challenge for assistive +robotics due to the diverse and deformable nature of garments. Traditional +approaches typically require separate models for each garment type, which +limits scalability and adaptability. In contrast, this paper presents a unified +approach using vision-language models (VLMs) to improve keypoint prediction +across various garment categories. By interpreting both visual and semantic +information, our model enables robots to manage different garment states with a +single model. We created a large-scale synthetic dataset using advanced +simulation techniques, allowing scalable training without extensive real-world +data. Experimental results indicate that the VLM-based method significantly +enhances keypoint detection accuracy and task success rates, providing a more +flexible and general solution for robotic garment manipulation. In addition, +this research also underscores the potential of VLMs to unify various garment +manipulation tasks within a single framework, paving the way for broader +applications in home automation and assistive robotics for future. + +
+
+
+
+
+ + ☆ DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving + + +
+ We present a novel autonomous driving framework, DualAD, designed to imitate +human reasoning during driving. DualAD comprises two layers: a rule-based +motion planner at the bottom layer that handles routine driving tasks requiring +minimal reasoning, and an upper layer featuring a rule-based text encoder that +converts driving scenarios from absolute states into text description. This +text is then processed by a large language model (LLM) to make driving +decisions. The upper layer intervenes in the bottom layer's decisions when +potential danger is detected, mimicking human reasoning in critical situations. +Closed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained +model, significantly outperforms rule-based motion planners that lack reasoning +abilities. Our experiments also highlight the effectiveness of the text +encoder, which considerably enhances the model's scenario understanding. +Additionally, the integrated DualAD model improves with stronger LLMs, +indicating the framework's potential for further enhancement. We make code and +benchmarks publicly available. + +
+
+ comment: Autonomous Driving, Large Language Models (LLMs), Human Reasoning, + Critical Scenario +
+
+
+
+
+ + ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems - which account for almost all current +AI - can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborates on a search task assigned by a human. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams ICRA 2025 + + +
+ This paper presents a novel approach to multi-robot planning and +collaboration. We demonstrate a cognitive strategy for robots in human-robot +teams that incorporates metacognition, natural language communication, and +explainability. The system is embodied using the HARMONIC architecture that +flexibly integrates cognitive and control capabilities across the team. We +evaluate our approach through simulation experiments involving a joint search +task by a team of heterogeneous robots (a UGV and a drone) and a human. We +detail the system's handling of complex, real-world scenarios, effective action +coordination between robots with different capabilities, and natural +human-robot communication. This work demonstrates that the robots' ability to +reason about plans, goals, and attitudes, and to provide explanations for +actions and decisions are essential prerequisites for realistic human-robot +teaming. + +
+
+ comment: Submitted to ICRA 2025 Conference, Atlanta, GA, USA +
+
+
+
+
+ + ☆ MMDVS-LF: A Multi-Modal Dynamic-Vision-Sensor Line Following Dataset + + +
+ Dynamic Vision Sensors (DVS), offer a unique advantage in control +applications, due to their high temporal resolution, and asynchronous +event-based data. Still, their adoption in machine learning algorithms remains +limited. To address this gap, and promote the development of models that +leverage the specific characteristics of DVS data, we introduce the Multi-Modal +Dynamic-Vision-Sensor Line Following dataset (MMDVS-LF). This comprehensive +dataset, is the first to integrate multiple sensor modalities, including DVS +recordings, RGB video, odometry, and Inertial Measurement Unit (IMU) data, from +a small-scale standardized vehicle. Additionally, the dataset includes +eye-tracking and demographic data of drivers performing a Line Following task +on a track. With its diverse range of data, MMDVS-LF opens new opportunities +for developing deep learning algorithms, and conducting data science projects +across various domains, supporting innovation in autonomous systems and control +applications. + +
+
+
+
+
+ + ☆ HARMONIC: A Framework for Explanatory Cognitive Robots ICRA + + +
+ We present HARMONIC, a framework for implementing cognitive robots that +transforms general-purpose robots into trusted teammates capable of complex +decision-making, natural communication and human-level explanation. The +framework supports interoperability between a strategic (cognitive) layer for +high-level decision-making and a tactical (robot) layer for low-level control +and execution. We describe the core features of the framework and our initial +implementation, in which HARMONIC was deployed on a simulated UGV and drone +involved in a multi-robot search and retrieval task. + +
+
+ comment: Accepted for presentation at ICRA@40. 23-26 September 2024, + Rotterdam, Netherlands +
+
+
+
+
+ + ☆ Reasoning Multi-Agent Behavioral Topology for Interactive Autonomous + Driving + + +
+ Autonomous driving system aims for safe and social-consistent driving through +the behavioral integration among interactive agents. However, challenges remain +due to multi-agent scene uncertainty and heterogeneous interaction. Current +dense and sparse behavioral representations struggle with inefficiency and +inconsistency in multi-agent modeling, leading to instability of collective +behavioral patterns when integrating prediction and planning (IPP). To address +this, we initiate a topological formation that serves as a compliant behavioral +foreground to guide downstream trajectory generations. Specifically, we +introduce Behavioral Topology (BeTop), a pivotal topological formulation that +explicitly represents the consensual behavioral pattern among multi-agent +future. BeTop is derived from braid theory to distill compliant interactive +topology from multi-agent future trajectories. A synergistic learning framework +(BeTopNet) supervised by BeTop facilitates the consistency of behavior +prediction and planning within the predicted topology priors. Through imitative +contingency learning, BeTop also effectively manages behavioral uncertainty for +prediction and planning. Extensive verification on large-scale real-world +datasets, including nuPlan and WOMD, demonstrates that BeTop achieves +state-of-the-art performance in both prediction and planning tasks. Further +validations on the proposed interactive scenario benchmark showcase planning +compliance in interactive cases. + +
+
+
+
+
+ + ☆ ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty + Learning + + +
+ Vision-centric semantic occupancy prediction plays a crucial role in +autonomous driving, which requires accurate and reliable predictions from +low-cost sensors. Although having notably narrowed the accuracy gap with LiDAR, +there is still few research effort to explore the reliability in predicting +semantic occupancy from camera. In this paper, we conduct a comprehensive +evaluation of existing semantic occupancy prediction models from a reliability +perspective for the first time. Despite the gradual alignment of camera-based +models with LiDAR in term of accuracy, a significant reliability gap persists. +To addresses this concern, we propose ReliOcc, a method designed to enhance the +reliability of camera-based occupancy networks. ReliOcc provides a +plug-and-play scheme for existing models, which integrates hybrid uncertainty +from individual voxels with sampling-based noise and relative voxels through +mix-up learning. Besides, an uncertainty-aware calibration strategy is devised +to further enhance model reliability in offline mode. Extensive experiments +under various settings demonstrate that ReliOcc significantly enhances model +reliability while maintaining the accuracy of both geometric and semantic +predictions. Importantly, our proposed approach exhibits robustness to sensor +failures and out of domain noises during inference. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ Control Industrial Automation System with Large Language Models + + +
+ Traditional industrial automation systems require specialized expertise to +operate and complex reprogramming to adapt to new processes. Large language +models offer the intelligence to make them more flexible and easier to use. +However, LLMs' application in industrial settings is underexplored. This paper +introduces a framework for integrating LLMs to achieve end-to-end control of +industrial automation systems. At the core of the framework are an agent system +designed for industrial tasks, a structured prompting method, and an +event-driven information modeling mechanism that provides real-time data for +LLM inference. The framework supplies LLMs with real-time events on different +context semantic levels, allowing them to interpret the information, generate +production plans, and control operations on the automation system. It also +supports structured dataset creation for fine-tuning on this downstream +application of LLMs. Our contribution includes a formal system design, +proof-of-concept implementation, and a method for generating task-specific +datasets for LLM fine-tuning and testing. This approach enables a more adaptive +automation system that can respond to spontaneous events, while allowing easier +operation and configuration through natural language for more intuitive +human-machine interaction. We provide demo videos and detailed data on GitHub: +https://github.com/YuchenXia/LLM4IAS + +
+
+
+
+
+ + ☆ Joint Localization and Planning using Diffusion ICRA 2025 + + +
+ Diffusion models have been successfully applied to robotics problems such as +manipulation and vehicle path planning. In this work, we explore their +application to end-to-end navigation -- including both perception and planning +-- by considering the problem of jointly performing global localization and +path planning in known but arbitrary 2D environments. In particular, we +introduce a diffusion model which produces collision-free paths in a global +reference frame given an egocentric LIDAR scan, an arbitrary map, and a desired +goal position. To this end, we implement diffusion in the space of paths in +SE(2), and describe how to condition the denoising process on both obstacles +and sensor observations. In our evaluation, we show that the proposed +conditioning techniques enable generalization to realistic maps of considerably +different appearance than the training environment, demonstrate our model's +ability to accurately describe ambiguous solutions, and run extensive +simulation experiments showcasing our model's use as a real-time, end-to-end +localization and planning stack. + +
+
+ comment: 7 pages, 9 figures. Submitted to ICRA 2025, under review +
+
+
+
+
+ + ☆ LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged + Robots + + +
+ Reinforcement Learning (RL) has shown its remarkable and generalizable +capability in legged locomotion through sim-to-real transfer. However, while +adaptive methods like domain randomization are expected to make policy more +robust to diverse environments, such comprehensiveness potentially detracts +from the policy's performance in any specific environment according to the No +Free Lunch theorem, leading to a suboptimal solution once deployed in the real +world. To address this issue, we propose a lifelong policy adaptation framework +named LoopSR, which utilizes a transformer-based encoder to project real-world +trajectories into a latent space, and accordingly reconstruct the real-world +environments back in simulation for further improvement. Autoencoder +architecture and contrastive learning methods are adopted to better extract the +characteristics of real-world dynamics. The simulation parameters for continual +training are derived by combining predicted parameters from the decoder with +retrieved parameters from the simulation trajectory dataset. By leveraging the +continual training, LoopSR achieves superior data efficiency compared with +strong baselines, with only a limited amount of data to yield eminent +performance in both sim-to-sim and sim-to-real experiments. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or + Low-light Conditions ECCV 2024 + + +
+ The stark contrast in the design philosophy of an event camera makes it +particularly ideal for operating under high-speed, high dynamic range and +low-light conditions, where standard cameras underperform. Nonetheless, event +cameras still suffer from some amount of motion blur, especially under these +challenging conditions, in contrary to what most think. This is attributed to +the limited bandwidth of the event sensor pixel, which is mostly proportional +to the light intensity. Thus, to ensure that event cameras can truly excel in +such conditions where it has an edge over standard cameras, it is crucial to +account for event motion blur in downstream applications, especially +reconstruction. However, none of the recent works on reconstructing Neural +Radiance Fields (NeRFs) from events, nor event simulators, have considered the +full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a +novel method to directly and effectively reconstruct blur-minimal NeRFs from +motion-blurred events generated under high-speed motion or low-light +conditions. The core component of this work is a physically-accurate pixel +bandwidth model proposed to account for event motion blur under arbitrary speed +and lighting conditions. We also introduce a novel threshold-normalized total +variation loss to improve the regularization of large textureless patches. +Experiments on real and novel realistically simulated sequences verify our +effectiveness. Our code, event simulator and synthetic event dataset will be +open-sourced. + +
+
+ comment: Accepted to ECCV 2024. Project website is accessible at + https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with + arXiv:2006.07722 by other authors +
+
+
+
+
+ + ☆ Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV + Attitude Control Under Varying Wind Conditions + + +
+ This paper evaluates and compares the performance of model-free and +model-based reinforcement learning for the attitude control of fixed-wing +unmanned aerial vehicles using PID as a reference point. The comparison focuses +on their ability to handle varying flight dynamics and wind disturbances in a +simulated environment. Our results show that the Temporal Difference Model +Predictive Control agent outperforms both the PID controller and other +model-free reinforcement learning methods in terms of tracking accuracy and +robustness over different reference difficulties, particularly in nonlinear +flight regimes. Furthermore, we introduce actuation fluctuation as a key metric +to assess energy efficiency and actuator wear, and we test two different +approaches from the literature: action variation penalty and conditioning for +action policy smoothness. We also evaluate all control methods when subject to +stochastic turbulence and gusts separately, so as to measure their effects on +tracking performance, observe their limitations and outline their implications +on the Markov decision process formalism. + +
+
+ comment: Published at ICINCO 2024 +
+
+
+
+
+ + ☆ Swarm-LIO2: Decentralized, Efficient LiDAR-inertial Odometry for UAV + Swarms + + +
+ Aerial swarm systems possess immense potential in various aspects, such as +cooperative exploration, target tracking, search and rescue. Efficient, +accurate self and mutual state estimation are the critical preconditions for +completing these swarm tasks, which remain challenging research topics. This +paper proposes Swarm-LIO2: a fully decentralized, plug-and-play, +computationally efficient, and bandwidth-efficient LiDAR-inertial odometry for +aerial swarm systems. Swarm-LIO2 uses a decentralized, plug-and-play network as +the communication infrastructure. Only bandwidth-efficient and low-dimensional +information is exchanged, including identity, ego-state, mutual observation +measurements, and global extrinsic transformations. To support the +plug-and-play of new teammate participants, Swarm-LIO2 detects potential +teammate UAVs and initializes the temporal offset and global extrinsic +transformation all automatically. To enhance the initialization efficiency, +novel reflectivity-based UAV detection, trajectory matching, and factor graph +optimization methods are proposed. For state estimation, Swarm-LIO2 fuses +LiDAR, IMU, and mutual observation measurements within an efficient ESIKF +framework, with careful compensation of temporal delay and modeling of +measurements to enhance the accuracy and consistency. + +
+
+ comment: 23 Pages +
+
+
+
+
+ + ☆ SECURE: Semantics-aware Embodied Conversation under Unawareness for + Lifelong Robot Learning + + +
+ This paper addresses a challenging interactive task learning scenario we call +rearrangement under unawareness: to manipulate a rigid-body environment in a +context where the robot is unaware of a concept that's key to solving the +instructed task. We propose SECURE, an interactive task learning framework +designed to solve such problems by fixing a deficient domain model using +embodied conversation. Through dialogue, the robot discovers and then learns to +exploit unforeseen possibilities. Using SECURE, the robot not only learns from +the user's corrective feedback when it makes a mistake, but it also learns to +make strategic dialogue decisions for revealing useful evidence about novel +concepts for solving the instructed task. Together, these abilities allow the +robot to generalise to subsequent tasks using newly acquired knowledge. We +demonstrate that a robot that is semantics-aware -- that is, it exploits the +logical consequences of both sentence and discourse semantics in the learning +and inference process -- learns to solve rearrangement under unawareness more +effectively than a robot that lacks such capabilities. + +
+
+ comment: 10 pages,4 figures, 2 tables +
+
+
+
+
+ + ☆ Robust Ladder Climbing with a Quadrupedal Robot + + +
+ Quadruped robots are proliferating in industrial environments where they +carry sensor suites and serve as autonomous inspection platforms. Despite the +advantages of legged robots over their wheeled counterparts on rough and uneven +terrain, they are still yet to be able to reliably negotiate ubiquitous +features of industrial infrastructure: ladders. Inability to traverse ladders +prevents quadrupeds from inspecting dangerous locations, puts humans in harm's +way, and reduces industrial site productivity. In this paper, we learn +quadrupedal ladder climbing via a reinforcement learning-based control policy +and a complementary hooked end-effector. We evaluate the robustness in +simulation across different ladder inclinations, rung geometries, and +inter-rung spacings. On hardware, we demonstrate zero-shot transfer with an +overall 90% success rate at ladder angles ranging from 70{\deg} to 90{\deg}, +consistent climbing performance during unmodeled perturbations, and climbing +speeds 232x faster than the state of the art. This work expands the scope of +industrial quadruped robot applications beyond inspection on nominal terrains +to challenging infrastructural features in the environment, highlighting +synergies between robot morphology and control policy when performing complex +skills. More information can be found at the project website: +https://sites.google.com/leggedrobotics.com/climbingladders. + +
+
+ comment: Project website: + https://sites.google.com/leggedrobotics.com/climbingladders +
+
+
+
+
+ + ☆ Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications + + +
+ Vision language models have played a key role in extracting meaningful +features for various robotic applications. Among these, Contrastive +Language-Image Pretraining (CLIP) is widely used in robotic tasks that require +both vision and natural language understanding. However, CLIP was trained +solely on static images paired with text prompts and has not yet been fully +adapted for robotic tasks involving dynamic actions. In this paper, we +introduce Robotic-CLIP to enhance robotic perception capabilities. We first +gather and label large-scale action data, and then build our Robotic-CLIP by +fine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using +contrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's +strong image performance while gaining the ability to understand actions in +robotic contexts. Intensive experiments show that our Robotic-CLIP outperforms +other CLIP-based models across various language-driven robotic tasks. +Additionally, we demonstrate the practical effectiveness of Robotic-CLIP in +real-world grasping applications. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Stable Object Placement Under Geometric Uncertainty via Differentiable + Contact Dynamics + + +
+ From serving a cup of coffee to carefully rearranging delicate items, stable +object placement is a crucial skill for future robots. This skill is +challenging due to the required accuracy, which is difficult to achieve under +geometric uncertainty. We leverage differentiable contact dynamics to develop a +principled method for stable object placement under geometric uncertainty. We +estimate the geometric uncertainty by minimizing the discrepancy between the +force-torque sensor readings and the model predictions through gradient +descent. We further keep track of a belief over multiple possible geometric +parameters to mitigate the gradient-based method's sensitivity to the +initialization. We verify our approach in the real world on various geometric +uncertainties, including the in-hand pose uncertainty of the grasped object, +the object's shape uncertainty, and the environment's shape uncertainty. + +
+
+
+
+
+ + ☆ Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations + Between Initial and Final Scenes + + +
+ With robots increasingly collaborating with humans in everyday tasks, it is +important to take steps toward robotic systems capable of understanding the +environment. This work focuses on scene understanding to detect pick and place +tasks given initial and final images from the scene. To this end, a dataset is +collected for object detection and pick and place task detection. A YOLOv5 +network is subsequently trained to detect the objects in the initial and final +scenes. Given the detected objects and their bounding boxes, two methods are +proposed to detect the pick and place tasks which transform the initial scene +into the final scene. A geometric method is proposed which tracks objects' +movements in the two scenes and works based on the intersection of the bounding +boxes which moved within scenes. Contrarily, the CNN-based method utilizes a +Convolutional Neural Network to classify objects with intersected bounding +boxes into 5 classes, showing the spatial relationship between the involved +objects. The performed pick and place tasks are then derived from analyzing the +experiments with both scenes. Results show that the CNN-based method, using a +VGG16 backbone, outscores the geometric method by roughly 12 percentage points +in certain scenarios, with an overall success rate of 84.3%. + +
+
+ comment: Conference Paper, ICEE 2024, 7 pages, 5 figures +
+
+
+
+
+ + ☆ Episodic Memory Verbalization using Hierarchical Representations of + Life-Long Robot Experience + + +
+ Verbalization of robot experience, i.e., summarization of and question +answering about a robot's past, is a crucial ability for improving human-robot +interaction. Previous works applied rule-based systems or fine-tuned deep +models to verbalize short (several-minute-long) streams of episodic data, +limiting generalization and transferability. In our work, we apply large +pretrained models to tackle this task with zero or few examples, and +specifically focus on verbalizing life-long experiences. For this, we derive a +tree-like data structure from episodic memory (EM), with lower levels +representing raw perception and proprioception data, and higher levels +abstracting events to natural language concepts. Given such a hierarchical +representation built from the experience stream, we apply a large language +model as an agent to interactively search the EM given a user's query, +dynamically expanding (initially collapsed) tree nodes to find the relevant +information. The approach keeps computational costs low even when scaling to +months of robot experience data. We evaluate our method on simulated household +robot data, human egocentric videos, and real-world robot recordings, +demonstrating its flexibility and scalability. + +
+
+ comment: Code, data and demo videos at https://hierarchical-emv.github.io +
+
+
+
+
+ + ☆ Event-based Stereo Depth Estimation: A Survey + + +
+ Stereopsis has widespread appeal in robotics as it is the predominant way by +which living beings perceive depth to navigate our 3D world. Event cameras are +novel bio-inspired sensors that detect per-pixel brightness changes +asynchronously, with very high temporal resolution and high dynamic range, +enabling machine perception in high-speed motion and broad illumination +conditions. The high temporal precision also benefits stereo matching, making +disparity (depth) estimation a popular research area for event cameras ever +since its inception. Over the last 30 years, the field has evolved rapidly, +from low-latency, low-power circuit design to current deep learning (DL) +approaches driven by the computer vision community. The bibliography is vast +and difficult to navigate for non-experts due its highly interdisciplinary +nature. Past surveys have addressed distinct aspects of this topic, in the +context of applications, or focusing only on a specific class of techniques, +but have overlooked stereo datasets. This survey provides a comprehensive +overview, covering both instantaneous stereo and long-term methods suitable for +simultaneous localization and mapping (SLAM), along with theoretical and +empirical comparisons. It is the first to extensively review DL methods as well +as stereo datasets, even providing practical suggestions for creating new +benchmarks to advance the field. The main advantages and challenges faced by +event-based stereo depth estimation are also discussed. Despite significant +progress, challenges remain in achieving optimal performance in not only +accuracy but also efficiency, a cornerstone of event-based computing. We +identify several gaps and propose future research directions. We hope this +survey inspires future research in this area, by serving as an accessible entry +point for newcomers, as well as a practical guide for seasoned researchers in +the community. + +
+
+ comment: 28 pages, 20 figures, 7 tables +
+
+
+
+
+ + ☆ AssistantX: An LLM-Powered Proactive Assistant in Collaborative + Human-Populated Environment + + +
+ The increasing demand for intelligent assistants in human-populated +environments has motivated significant research in autonomous robotic systems. +Traditional service robots and virtual assistants, however, struggle with +real-world task execution due to their limited capacity for dynamic reasoning +and interaction, particularly when human collaboration is required. Recent +developments in Large Language Models have opened new avenues for improving +these systems, enabling more sophisticated reasoning and natural interaction +capabilities. In this paper, we introduce AssistantX, an LLM-powered proactive +assistant designed to operate autonomously in a physical office environment. +Unlike conventional service robots, AssistantX leverages a novel multi-agent +architecture, PPDR4X, which provides advanced inference capabilities and +comprehensive collaboration awareness. By effectively bridging the gap between +virtual operations and physical interactions, AssistantX demonstrates robust +performance in managing complex real-world scenarios. Our evaluation highlights +the architecture's effectiveness, showing that AssistantX can respond to clear +instructions, actively retrieve supplementary information from memory, and +proactively seek collaboration from team members to ensure successful task +completion. More details and videos can be found at +https://assistantx-agent.github.io/AssistantX/. + +
+
+ comment: 6 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ FactorSim: Generative Simulation via Factorized Representation + + +
+ Generating simulations to train intelligent agents in game-playing and +robotics from natural language input, from user input or task documentation, +remains an open-ended challenge. Existing approaches focus on parts of this +challenge, such as generating reward functions or task hyperparameters. Unlike +previous work, we introduce FACTORSIM that generates full simulations in code +from language input that can be used to train agents. Exploiting the structural +modularity specific to coded simulations, we propose to use a factored +partially observable Markov decision process representation that allows us to +reduce context dependence during each step of the generation. For evaluation, +we introduce a generative simulation benchmark that assesses the generated +simulation code's accuracy and effectiveness in facilitating zero-shot +transfers in reinforcement learning settings. We show that FACTORSIM +outperforms existing methods in generating simulations regarding prompt +alignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation. +We also demonstrate its effectiveness in generating robotic tasks. + +
+
+ comment: neurips 2024, project website: + https://cs.stanford.edu/~sunfanyun/factorsim/ +
+
+
+
+
+ + ☆ AP-VLM: Active Perception Enabled by Vision-Language Models + + +
+ Active perception enables robots to dynamically gather information by +adjusting their viewpoints, a crucial capability for interacting with complex, +partially observable environments. In this paper, we present AP-VLM, a novel +framework that combines active perception with a Vision-Language Model (VLM) to +guide robotic exploration and answer semantic queries. Using a 3D virtual grid +overlaid on the scene and orientation adjustments, AP-VLM allows a robotic +manipulator to intelligently select optimal viewpoints and orientations to +resolve challenging tasks, such as identifying objects in occluded or inclined +positions. We evaluate our system on two robotic platforms: a 7-DOF Franka +Panda and a 6-DOF UR5, across various scenes with differing object +configurations. Our results demonstrate that AP-VLM significantly outperforms +passive perception methods and baseline models, including Toward Grounded +Common Sense Reasoning (TGCSR), particularly in scenarios where fixed camera +views are inadequate. The adaptability of AP-VLM in real-world settings shows +promise for enhancing robotic systems' understanding of complex environments, +bridging the gap between high-level semantic reasoning and low-level control. + +
+
+
+
+
+ + ☆ System-Level Safety Monitoring and Recovery for Perception Failures in + Autonomous Vehicles + + +
+ The safety-critical nature of autonomous vehicle (AV) operation necessitates +development of task-relevant algorithms that can reason about safety at the +system level and not just at the component level. To reason about the impact of +a perception failure on the entire system performance, such task-relevant +algorithms must contend with various challenges: complexity of AV stacks, high +uncertainty in the operating environments, and the need for real-time +performance. To overcome these challenges, in this work, we introduce a +Q-network called SPARQ (abbreviation for Safety evaluation for Perception And +Recovery Q-network) that evaluates the safety of a plan generated by a planning +algorithm, accounting for perception failures that the planning process may +have overlooked. This Q-network can be queried during system runtime to assess +whether a proposed plan is safe for execution or poses potential safety risks. +If a violation is detected, the network can then recommend a corrective plan +while accounting for the perceptual failure. We validate our algorithm using +the NuPlan-Vegas dataset, demonstrating its ability to handle cases where a +perception failure compromises a proposed plan while the corrective plan +remains safe. We observe an overall accuracy and recall of 90% while sustaining +a frequency of 42Hz on the unseen testing dataset. We compare our performance +to a popular reachability-based baseline and analyze some interesting +properties of our approach in improving the safety properties of an AV +pipeline. + +
+
+
+
+
+ + HGS-Planner: Hierarchical Planning Framework for Active Scene + Reconstruction Using 3D Gaussian Splatting + + +
+ In complex missions such as search and rescue,robots must make intelligent +decisions in unknown environments, relying on their ability to perceive and +understand their surroundings. High-quality and real-time reconstruction +enhances situational awareness and is crucial for intelligent robotics. +Traditional methods often struggle with poor scene representation or are too +slow for real-time use. Inspired by the efficacy of 3D Gaussian Splatting +(3DGS), we propose a hierarchical planning framework for fast and high-fidelity +active reconstruction. Our method evaluates completion and quality gain to +adaptively guide reconstruction, integrating global and local planning for +efficiency. Experiments in simulated and real-world environments show our +approach outperforms existing real-time methods. + +
+
+
+
+
+ + ☆ Leveraging Semantic and Geometric Information for Zero-Shot + Robot-to-Human Handover + + +
+ Human-robot interaction (HRI) encompasses a wide range of collaborative +tasks, with handover being one of the most fundamental. As robots become more +integrated into human environments, the potential for service robots to assist +in handing objects to humans is increasingly promising. In robot-to-human (R2H) +handover, selecting the optimal grasp is crucial for success, as it requires +avoiding interference with the humans preferred grasp region and minimizing +intrusion into their workspace. Existing methods either inadequately consider +geometric information or rely on data-driven approaches, which often struggle +to generalize across diverse objects. To address these limitations, we propose +a novel zero-shot system that combines semantic and geometric information to +generate optimal handover grasps. Our method first identifies grasp regions +using semantic knowledge from vision-language models (VLMs) and, by +incorporating customized visual prompts, achieves finer granularity in region +grounding. A grasp is then selected based on grasp distance and approach angle +to maximize human ease and avoid interference. We validate our approach through +ablation studies and real-world comparison experiments. Results demonstrate +that our system improves handover success rates and provides a more +user-preferred interaction experience. Videos, appendixes and more are +available at https://sites.google.com/view/vlm-handover/. + +
+
+ comment: 6 pages, 5 figures, conference +
+
+
+
+
+ + Learning Occlusion-aware Decision-making from Agent Interaction via + Active Perception + + +
+ Occlusion-aware decision-making is essential in autonomous driving due to the +high uncertainty of various occlusions. Recent occlusion-aware decision-making +methods encounter issues such as high computational complexity, scenario +scalability challenges, or reliance on limited expert data. Benefiting from +automatically generating data by exploration randomization, we uncover that +reinforcement learning (RL) may show promise in occlusion-aware +decision-making. However, previous occlusion-aware RL faces challenges in +expanding to various dynamic and static occlusion scenarios, low learning +efficiency, and lack of predictive ability. To address these issues, we +introduce Pad-AI, a self-reinforcing framework to learn occlusion-aware +decision-making through active perception. Pad-AI utilizes vectorized +representation to represent occluded environments efficiently and learns over +the semantic motion primitives to focus on high-level active perception +exploration. Furthermore, Pad-AI integrates prediction and RL within a unified +framework to provide risk-aware learning and security guarantees. Our framework +was tested in challenging scenarios under both dynamic and static occlusions +and demonstrated efficient and general perception-aware exploration performance +to other strong baselines in closed-loop evaluations. + +
+
+
+
+
+ + ☆ Software for the SpaceDREAM Robotic Arm + + +
+ Impedance-controlled robots are widely used on Earth to perform +interaction-rich tasks and will be a key enabler for In-Space Servicing, +Assembly and Manufacturing (ISAM) activities. This paper introduces the +software architecture used on the On-Board Computer (OBC) for the planned +SpaceDREAM mission aiming to validate such robotic arm in Lower Earth Orbit +(LEO) conducted by the German Aerospace Center (DLR) in cooperation with +KINETIK Space GmbH and the Technical University of Munich (TUM). During the +mission several free motion as well as contact tasks are to be performed in +order to verify proper functionality of the robot in position and impedance +control on joint level as well as in cartesian control. The tasks are selected +to be representative for subsequent servicing missions e.g. requiring interface +docking or precise manipulation. + The software on the OBC commands the robot's joints via SpaceWire to perform +those mission tasks, reads camera images and data from additional sensors and +sends telemetry data through an Ethernet link via the spacecraft down to Earth. +It is set up to execute a predefined mission after receiving a start signal +from the spacecraft while it should be extendable to receive commands from +Earth for later missions. Core design principle was to reuse as much existing +software and to stay as close as possible to existing robot software stacks at +DLR. This allowed for a quick full operational start of the robot arm compared +to a custom development of all robot software, a lower entry barrier for +software developers as well as a reuse of existing libraries. While not every +line of code can be tested with this design, most of the software has already +proven its functionality through daily execution on multiple robot systems. + +
+
+
+
+
+ + ☆ Canonical Representation and Force-Based Pretraining of 3D Tactile for + Dexterous Visuo-Tactile Policy Learning + + +
+ Tactile sensing plays a vital role in enabling robots to perform +fine-grained, contact-rich tasks. However, the high dimensionality of tactile +data, due to the large coverage on dexterous hands, poses significant +challenges for effective tactile feature learning, especially for 3D tactile +data, as there are no large standardized datasets and no strong pretrained +backbones. To address these challenges, we propose a novel canonical +representation that reduces the difficulty of 3D tactile feature learning and +further introduces a force-based self-supervised pretraining task to capture +both local and net force features, which are crucial for dexterous +manipulation. Our method achieves an average success rate of 78% across four +fine-grained, contact-rich dexterous manipulation tasks in real-world +experiments, demonstrating effectiveness and robustness compared to other +methods. Further analysis shows that our method fully utilizes both spatial and +force information from 3D tactile data to accomplish the tasks. The videos can +be viewed at https://3dtacdex.github.io. + +
+
+
+
+
+ + ☆ Robotic Environmental State Recognition with Pre-Trained Vision-Language + Models and Black-Box Optimization + + +
+ In order for robots to autonomously navigate and operate in diverse +environments, it is essential for them to recognize the state of their +environment. On the other hand, the environmental state recognition has +traditionally involved distinct methods tailored to each state to be +recognized. In this study, we perform a unified environmental state recognition +for robots through the spoken language with pre-trained large-scale +vision-language models. We apply Visual Question Answering and Image-to-Text +Retrieval, which are tasks of Vision-Language Models. We show that with our +method, it is possible to recognize not only whether a room door is +open/closed, but also whether a transparent door is open/closed and whether +water is running in a sink, without training neural networks or manual +programming. In addition, the recognition accuracy can be improved by selecting +appropriate texts from the set of prepared texts based on black-box +optimization. For each state recognition, only the text set and its weighting +need to be changed, eliminating the need to prepare multiple different models +and programs, and facilitating the management of source code and computer +resource. We experimentally demonstrate the effectiveness of our method and +apply it to the recognition behavior on a mobile robot, Fetch. + +
+
+ comment: Accepted at Advanced Robotics, website - + https://haraduka.github.io/vlm-bbo/ +
+
+
+
+
+ + ☆ Precise Interception Flight Targets by Image-based Visual Servoing of + Multicopter + + +
+ Interception of low-altitude intruding targets with low-cost drones equipped +strapdown camera presents a competitive option. However, the malicious +maneuvers by the non-cooperative target and the coupling of the camera make the +task challenging. To solve this problem, an Image-Based Visual Servoing (IBVS) +control algorithm based on proportional navigation guidance with field-of-view +holding capability is designed. The proposed controller reduces the miss +distance while improving the stability of the visual servo system during +interception. Software-in-the-loop (SITL) simulation experiments show a 72.8% +reduction in the circular error probability (CEP) compared to the most recent +study. This improvement enhances interception accuracy from the decimeter to +the centimeter level. Real-world experiments further validate the effectiveness +of the proposed algorithm. + +
+
+ comment: 9 pages, 15 figures, In the process of being submitted to the Journal + of IEEE Transactions on Industrial Electronics +
+
+
+
+
+ + ☆ Traverse the Non-Traversable: Estimating Traversability for Wheeled + Mobility on Vertically Challenging Terrain + + +
+ Most traversability estimation techniques divide off-road terrain into +traversable (e.g., pavement, gravel, and grass) and non-traversable (e.g., +boulders, vegetation, and ditches) regions and then inform subsequent planners +to produce trajectories on the traversable part. However, recent research +demonstrated that wheeled robots can traverse vertically challenging terrain +(e.g., extremely rugged boulders comparable in size to the vehicles +themselves), which unfortunately would be deemed as non-traversable by existing +techniques. Motivated by such limitations, this work aims at identifying the +traversable from the seemingly non-traversable, vertically challenging terrain +based on past kinodynamic vehicle-terrain interactions in a data-driven manner. +Our new Traverse the Non-Traversable(TNT) traversability estimator can +efficiently guide a down-stream sampling-based planner containing a +high-precision 6-DoF kinodynamic model, which becomes deployable onboard a +small-scale vehicle. Additionally, the estimated traversability can also be +used as a costmap to plan global and local paths without sampling. Our +experiment results show that TNT can improve planning performance, efficiency, +and stability by 50%, 26.7%, and 9.2% respectively on a physical robot +platform. + +
+
+ comment: for associated video file, see + https://www.youtube.com/watch?v=Shcalb8sGcA +
+
+
+
+
+ + ☆ Tactile Probabilistic Contact Dynamics Estimation of Unknown Objects + + +
+ We study the problem of rapidly identifying contact dynamics of unknown +objects in partially known environments. The key innovation of our method is a +novel formulation of the contact dynamics estimation problem as the joint +estimation of contact geometries and physical parameters. We leverage DeepSDF, +a compact and expressive neural-network-based geometry representation over a +distribution of geometries, and adopt a particle filter to estimate both the +geometries in contact and the physical parameters. In addition, we couple the +estimator with an active exploration strategy that plans information-gathering +moves to further expedite online estimation. Through simulation and physical +experiments, we show that our method estimates accurate contact dynamics with +fewer than 30 exploration moves for unknown objects touching partially known +environments. + +
+
+
+
+
+ + ☆ Verti-Selector: Automatic Curriculum Learning for Wheeled Mobility on + Vertically Challenging Terrain + + +
+ Reinforcement Learning (RL) has the potential to enable extreme off-road +mobility by circumventing complex kinodynamic modeling, planning, and control +by simulated end-to-end trial-and-error learning experiences. However, most RL +methods are sample-inefficient when training in a large amount of manually +designed simulation environments and struggle at generalizing to the real +world. To address these issues, we introduce Verti-Selector (VS), an automatic +curriculum learning framework designed to enhance learning efficiency and +generalization by selectively sampling training terrain. VS prioritizes +vertically challenging terrain with higher Temporal Difference (TD) errors when +revisited, thereby allowing robots to learn at the edge of their evolving +capabilities. By dynamically adjusting the sampling focus, VS significantly +boosts sample efficiency and generalization within the VW-Chrono simulator +built on the Chrono multi-physics engine. Furthermore, we provide simulation +and physical results using VS on a Verti-4-Wheeler platform. These results +demonstrate that VS can achieve 23.08% improvement in terms of success rate by +efficiently sampling during training and robustly generalizing to the real +world. + +
+
+
+
+
+ + ☆ Cat-and-Mouse Satellite Dynamics: Divergent Adversarial Reinforcement + Learning for Contested Multi-Agent Space Operations + + +
+ As space becomes increasingly crowded and contested, robust autonomous +capabilities for multi-agent environments are gaining critical importance. +Current autonomous systems in space primarily rely on optimization-based path +planning or long-range orbital maneuvers, which have not yet proven effective +in adversarial scenarios where one satellite is actively pursuing another. We +introduce Divergent Adversarial Reinforcement Learning (DARL), a two-stage +Multi-Agent Reinforcement Learning (MARL) approach designed to train autonomous +evasion strategies for satellites engaged with multiple adversarial spacecraft. +Our method enhances exploration during training by promoting diverse +adversarial strategies, leading to more robust and adaptable evader models. We +validate DARL through a cat-and-mouse satellite scenario, modeled as a +partially observable multi-agent capture the flag game where two adversarial +`cat' spacecraft pursue a single `mouse' evader. DARL's performance is compared +against several benchmarks, including an optimization-based satellite path +planner, demonstrating its ability to produce highly robust models for +adversarial multi-agent space environments. + +
+
+
+
+
+ + ☆ Active Vision Might Be All You Need: Exploring Active Vision in Bimanual + Robotic Manipulation + + +
+ Imitation learning has demonstrated significant potential in performing +high-precision manipulation tasks using visual feedback from cameras. However, +it is common practice in imitation learning for cameras to be fixed in place, +resulting in issues like occlusion and limited field of view. Furthermore, +cameras are often placed in broad, general locations, without an effective +viewpoint specific to the robot's task. In this work, we investigate the +utility of active vision (AV) for imitation learning and manipulation, in +which, in addition to the manipulation policy, the robot learns an AV policy +from human demonstrations to dynamically change the robot's camera viewpoint to +obtain better information about its environment and the given task. We +introduce AV-ALOHA, a new bimanual teleoperation robot system with AV, an +extension of the ALOHA 2 robot system, incorporating an additional 7-DoF robot +arm that only carries a stereo camera and is solely tasked with finding the +best viewpoint. This camera streams stereo video to an operator wearing a +virtual reality (VR) headset, allowing the operator to control the camera pose +using head and body movements. The system provides an immersive teleoperation +experience, with bimanual first-person control, enabling the operator to +dynamically explore and search the scene and simultaneously interact with the +environment. We conduct imitation learning experiments of our system both in +real-world and in simulation, across a variety of tasks that emphasize +viewpoint planning. Our results demonstrate the effectiveness of human-guided +AV for imitation learning, showing significant improvements over fixed cameras +in tasks with limited visibility. Project website: +https://soltanilara.github.io/av-aloha/ + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Valeo4Cast: A Modular Approach to End-to-End Forecasting ECCV + + +
+ Motion forecasting is crucial in autonomous driving systems to anticipate the +future trajectories of surrounding agents such as pedestrians, vehicles, and +traffic signals. In end-to-end forecasting, the model must jointly detect and +track from sensor data (cameras or LiDARs) the past trajectories of the +different elements of the scene and predict their future locations. We depart +from the current trend of tackling this task via end-to-end training from +perception to forecasting, and instead use a modular approach. We individually +build and train detection, tracking and forecasting modules. We then only use +consecutive finetuning steps to integrate the modules better and alleviate +compounding errors. We conduct an in-depth study on the finetuning strategies +and it reveals that our simple yet effective approach significantly improves +performance on the end-to-end forecasting benchmark. Consequently, our solution +ranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82 +mAPf. We surpass forecasting results by +17.1 points over last year's winner +and by +13.3 points over this year's runner-up. This remarkable performance in +forecasting can be explained by our modular paradigm, which integrates +finetuning strategies and significantly outperforms the end-to-end-trained +counterparts. The code, model weights and results are made available +https://github.com/valeoai/valeo4cast. + +
+
+ comment: Winning solution of the Argoverse 2 "Unified Detection, Tracking, and + Forecasting" challenge; work accepted at Road++ ECCVW 2024 +
+
+
+
+
+ + ♻ ☆ TypeFly: Flying Drones with Large Language Model + + +
+ Recent advancements in robot control using large language models (LLMs) have +demonstrated significant potential, primarily due to LLMs' capabilities to +understand natural language commands and generate executable plans in various +languages. However, in real-time and interactive applications involving mobile +robots, particularly drones, the sequential token generation process inherent +to LLMs introduces substantial latency, i.e. response time, in control plan +generation. + In this paper, we present a system called ChatFly that tackles this problem +using a combination of a novel programming language called MiniSpec and its +runtime to reduce the plan generation time and drone response time. That is, +instead of asking an LLM to write a program (robotic plan) in the popular but +verbose Python, ChatFly gets it to do it in MiniSpec specially designed for +token efficiency and stream interpretation. Using a set of challenging drone +tasks, we show that design choices made by ChatFly can reduce up to 62% +response time and provide a more consistent user experience, enabling +responsive and intelligent LLM-based drone control with efficient completion. + +
+
+
+
+
+ + ♻ ☆ LingoQA: Visual Question Answering for Autonomous Driving ECCV 2024 + + +
+ We introduce LingoQA, a novel dataset and benchmark for visual question +answering in autonomous driving. The dataset contains 28K unique short video +scenarios, and 419K annotations. Evaluating state-of-the-art vision-language +models on our benchmark shows that their performance is below human +capabilities, with GPT-4V responding truthfully to 59.6% of the questions +compared to 96.6% for humans. For evaluation, we propose a truthfulness +classifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation +coefficient to human evaluations, surpassing existing techniques like METEOR, +BLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run +extensive ablation studies to understand its performance. We release our +dataset and benchmark as an evaluation platform for vision-language models in +autonomous driving. + +
+
+ comment: Accepted to ECCV 2024. Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ An Active Perception Game for Robust Information Gathering + + +
+ Active perception approaches select future viewpoints by using some estimate +of the information gain. An inaccurate estimate can be detrimental in critical +situations, e.g., locating a person in distress. However the true information +gained can only be calculated post hoc, i.e., after the observation is +realized. We present an approach for estimating the discrepancy between the +information gain (which is the average over putative future observations) and +the true information gain. The key idea is to analyze the mathematical +relationship between active perception and the estimation error of the +information gain in a game-theoretic setting. Using this, we develop an online +estimation approach that achieves sub-linear regret (in the number of +time-steps) for the estimation of the true information gain and reduces the +sub-optimality of active perception systems. + We demonstrate our approach for active perception using a comprehensive set +of experiments on: (a) different types of environments, including a quadrotor +in a photorealistic simulation, real-world robotic data, and real-world +experiments with ground robots exploring indoor and outdoor scenes; (b) +different types of robotic perception data; and (c) different map +representations. On average, our approach reduces information gain estimation +errors by 42%, increases the information gain by 7%, PSNR by 5%, and semantic +accuracy (measured as the number of objects that are localized correctly) by +6%. In real-world experiments with a Jackal ground robot, our approach +demonstrated complex trajectories to explore occluded regions. + +
+
+
+
+
+ + ♻ ☆ OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds ICRA + + +
+ A Colored point cloud, as a simple and efficient 3D representation, has many +advantages in various fields, including robotic navigation and scene +reconstruction. This representation is now commonly used in 3D reconstruction +tasks relying on cameras and LiDARs. However, fusing data from these two types +of sensors is poorly performed in many existing frameworks, leading to +unsatisfactory mapping results, mainly due to inaccurate camera poses. This +paper presents OmniColor, a novel and efficient algorithm to colorize point +clouds using an independent 360-degree camera. Given a LiDAR-based point cloud +and a sequence of panorama images with initial coarse camera poses, our +objective is to jointly optimize the poses of all frames for mapping images +onto geometric reconstructions. Our pipeline works in an off-the-shelf manner +that does not require any feature extraction or matching process. Instead, we +find optimal poses by directly maximizing the photometric consistency of LiDAR +maps. In experiments, we show that our method can overcome the severe visual +distortion of omnidirectional images and greatly benefit from the wide field of +view (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy +and stability. The code will be released at +https://github.com/liubonan123/OmniColor/. + +
+
+ comment: 2024 IEEE International Conference on Robotics and Automation (ICRA) +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ Gaussian-LIC: Real-Time Photo-Realistic SLAM with Gaussian Splatting and + LiDAR-Inertial-Camera Fusion + + +
+ In this paper, we present a real-time photo-realistic SLAM method based on +marrying Gaussian Splatting with LiDAR-Inertial-Camera SLAM. Most existing +radiance-field-based SLAM systems mainly focus on bounded indoor environments, +equipped with RGB-D or RGB sensors. However, they are prone to decline when +expanding to unbounded scenes or encountering adverse conditions, such as +violent motions and changing illumination. In contrast, oriented to general +scenarios, our approach additionally tightly fuses LiDAR, IMU, and camera for +robust pose estimation and photo-realistic online mapping. To compensate for +regions unobserved by the LiDAR, we propose to integrate both the triangulated +visual points from images and LiDAR points for initializing 3D Gaussians. In +addition, the modeling of the sky and varying camera exposure have been +realized for high-quality rendering. Notably, we implement our system purely +with C++ and CUDA, and meticulously design a series of strategies to accelerate +the online optimization of the Gaussian-based scene representation. Extensive +experiments demonstrate that our method outperforms its counterparts while +maintaining real-time capability. Impressively, regarding photo-realistic +mapping, our method with our estimated poses even surpasses all the compared +approaches that utilize privileged ground-truth poses for mapping. Our code +will be released on project page https://xingxingzuo.github.io/gaussian_lic. + +
+
+
+
+
+ + ♻ ☆ AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous + Driving ECCV 2024 + + +
+ The scale-up of autonomous vehicles depends heavily on their ability to deal +with anomalies, such as rare objects on the road. In order to handle such +situations, it is necessary to detect anomalies in the first place. Anomaly +detection for autonomous driving has made great progress in the past years but +suffers from poorly designed benchmarks with a strong focus on camera data. In +this work, we propose AnoVox, the largest benchmark for ANOmaly detection in +autonomous driving to date. AnoVox incorporates large-scale multimodal sensor +data and spatial VOXel ground truth, allowing for the comparison of methods +independent of their used sensor. We propose a formal definition of normality +and provide a compliant training dataset. AnoVox is the first benchmark to +contain both content and temporal anomalies. + +
+
+ comment: Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\"o{\ss}ler + contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop +
+
+
+
+
+ + ♻ Humanoid Parkour Learning CoRL 2024 + + +
+ Parkour is a grand challenge for legged locomotion, even for quadruped +robots, requiring active perception and various maneuvers to overcome multiple +challenging obstacles. Existing methods for humanoid locomotion either optimize +a trajectory for a single parkour track or train a reinforcement learning +policy only to walk with a significant amount of motion references. In this +work, we propose a framework for learning an end-to-end vision-based +whole-body-control parkour policy for humanoid robots that overcomes multiple +parkour skills without any motion prior. Using the parkour policy, the humanoid +robot can jump on a 0.42m platform, leap over hurdles, 0.8m gaps, and much +more. It can also run at 1.8m/s in the wild and walk robustly on different +terrains. We test our policy in indoor and outdoor environments to demonstrate +that it can autonomously select parkour skills while following the rotation +command of the joystick. We override the arm actions and show that this +framework can easily transfer to humanoid mobile manipulation tasks. Videos can +be found at https://humanoid4parkour.github.io + +
+
+ comment: Published on CoRL 2024 +
+
+
+
+
+ + ♻ ☆ General-purpose Clothes Manipulation with Semantic Keypoints + + +
+ Clothes manipulation is a critical skill for household robots. Recent +advancements have been made in task-specific clothes manipulation, such as +folding, flattening, and hanging. However, due to clothes' complex geometries +and deformability, creating a general-purpose robot system that can manipulate +a diverse range of clothes in many ways remains challenging. Since clothes are +typically designed with specific structures, we propose identifying these +specific features like ``left sleeve'' as semantic keypoints. Semantic +keypoints can provide semantic cues for task planning and geometric cues for +low-level action generation. With this insight, we develop a hierarchical +learning framework using the large language model (LLM) for general-purpose +CLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation +experiments show that CLASP outperforms baseline methods on both seen and +unseen tasks across various clothes manipulation tasks. Real-world experiments +show that CLASP can be directly deployed in the real world and applied to a +wide variety of clothes. + +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing ICRA 2024 + + +
+ Cooking robots can enhance the home experience by reducing the burden of +daily chores. However, these robots must perform their tasks dexterously and +safely in shared human environments, especially when handling dangerous tools +such as kitchen knives. This study focuses on enabling a robot to autonomously +and safely learn food-cutting tasks. More specifically, our goal is to enable a +collaborative robot or industrial robot arm to perform food-slicing tasks by +adapting to varying material properties using compliance control. Our approach +involves using Reinforcement Learning (RL) to train a robot to compliantly +manipulate a knife, by reducing the contact forces exerted by the food items +and by the cutting board. However, training the robot in the real world can be +inefficient, and dangerous, and result in a lot of food waste. Therefore, we +proposed SliceIt!, a framework for safely and efficiently learning robot +food-slicing tasks in simulation. Following a real2sim2real approach, our +framework consists of collecting a few real food slicing data, calibrating our +dual simulation environment (a high-fidelity cutting simulator and a robotic +simulator), learning compliant control policies on the calibrated simulation +environment, and finally, deploying the policies on the real robot. + +
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Learning Variable Compliance Control From a Few Demonstrations for + Bimanual Robot with Haptic Feedback Teleoperation System IROS 2024 + + +
+ Automating dexterous, contact-rich manipulation tasks using rigid robots is a +significant challenge in robotics. Rigid robots, defined by their actuation +through position commands, face issues of excessive contact forces due to their +inability to adapt to contact with the environment, potentially causing damage. +While compliance control schemes have been introduced to mitigate these issues +by controlling forces via external sensors, they are hampered by the need for +fine-tuning task-specific controller parameters. Learning from Demonstrations +(LfD) offers an intuitive alternative, allowing robots to learn manipulations +through observed actions. In this work, we introduce a novel system to enhance +the teaching of dexterous, contact-rich manipulations to rigid robots. Our +system is twofold: firstly, it incorporates a teleoperation interface utilizing +Virtual Reality (VR) controllers, designed to provide an intuitive and +cost-effective method for task demonstration with haptic feedback. Secondly, we +present Comp-ACT (Compliance Control via Action Chunking with Transformers), a +method that leverages the demonstrations to learn variable compliance control +from a few demonstrations. Our methods have been validated across various +complex contact-rich manipulation tasks using single-arm and bimanual robot +setups in simulated and real-world environments, demonstrating the +effectiveness of our system in teaching robots dexterous manipulations with +enhanced adaptability and safety. Code available at: +https://github.com/omron-sinicx/CompACT + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ A Learning Framework for Diverse Legged Robot Locomotion Using + Barrier-Based Style Rewards + + +
+ This work introduces a model-free reinforcement learning framework that +enables various modes of motion (quadruped, tripod, or biped) and diverse tasks +for legged robot locomotion. We employ a motion-style reward based on a relaxed +logarithmic barrier function as a soft constraint, to bias the learning process +toward the desired motion style, such as gait, foot clearance, joint position, +or body height. The predefined gait cycle is encoded in a flexible manner, +facilitating gait adjustments throughout the learning process. Extensive +experiments demonstrate that KAIST HOUND, a 45 kg robotic system, can achieve +biped, tripod, and quadruped locomotion using the proposed framework; +quadrupedal capabilities include traversing uneven terrain, galloping at 4.67 +m/s, and overcoming obstacles up to 58 cm (67 cm for HOUND2); bipedal +capabilities include running at 3.6 m/s, carrying a 7.5 kg object, and +ascending stairs-all performed without exteroceptive input. + +
+
+ comment: 7 pages, 5 figures, Videos at https://youtu.be/JV2_HfTlOKI +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ Plant Robots: Harnessing Growth Actuation of Plants for Locomotion and + Object Manipulation + + +
+ Plants display physical displacements during their growth due to +photosynthesis, which converts light into chemical energy. This can be +interpreted as plants acting as actuators with a built-in power source. This +paper presents a method to create plant robots that move and perform tasks by +harnessing the actuation output of plants: displacement and force generated +from the growing process. As the target plant, radish sprouts are employed, and +their displacement and force are characterized, followed by the calculation of +power and energy densities. Based on the characterization, two different plant +robots are designed and fabricated: a rotational robot and a gripper. The +former demonstrates ground locomotion, achieving a travel distance of 14.6 mm +with an average speed of 0.8 mm/h. The latter demonstrates the picking and +placing of an object with a 0.1-g mass by the light-controlled open-close +motion of plant fingers. A good agreement between the experimental and model +values is observed in the specific data of the mobile robot, suggesting that +obtaining the actuation characteristics of plants can enable the design and +prediction of behavior in plant robots. These results pave the way for the +realization of novel types of environmentally friendly and sustainable robots. + +
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+
+
+
+ + Systems and Control 24 + +
+
+
+ + ☆ A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale + Autonomous Vehicle + + +
+ In recent years, several competitions have highlighted the need to +investigate vision-based solutions to address scenarios with functional +insufficiencies in perception, world modeling and localization. This article +presents the Vision-based Lane Keeping System (VbLKS) developed by the +DEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022. +The main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied +VbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a +tailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading +Error (LHE), is estimated at a constant lookahead distance employing a +Convolutional Neural Network (CNN). A training strategy for a compact CNN is +proposed, emphasizing data generation and augmentation on simulated camera +images from a 3D Gazebo simulator, and enabling real-time operation on +low-level hardware. A tailored PP-based lateral controller equipped with a +derivative action and a PP-based velocity reference generation are implemented. +Tuning ranges are established through a systematic time-delay stability +analysis. Validation in a representative controlled laboratory setting is +provided. + +
+
+ comment: 16 pages, 23 figures +
+
+
+
+
+ + ☆ End-to-end guarantees for indirect data-driven control of bilinear + systems with finite stochastic data + + +
+ In this paper we propose an end-to-end algorithm for indirect data-driven +control for bilinear systems with stability guarantees. We consider the case +where the collected i.i.d. data is affected by probabilistic noise with +possibly unbounded support and leverage tools from statistical learning theory +to derive finite sample identification error bounds. To this end, we solve the +bilinear identification problem by solving a set of linear and affine +identification problems, by a particular choice of a control input during the +data collection phase. We provide a priori as well as data-dependent finite +sample identification error bounds on the individual matrices as well as +ellipsoidal bounds, both of which are structurally suitable for control. +Further, we integrate the structure of the derived identification error bounds +in a robust controller design to obtain an exponentially stable closed-loop. By +means of an extensive numerical study we showcase the interplay between the +controller design and the derived identification error bounds. Moreover, we +note appealing connections of our results to indirect data-driven control of +general nonlinear systems through Koopman operator theory and discuss how our +results may be applied in this setup. + +
+
+
+
+
+ + ☆ Control Industrial Automation System with Large Language Models + + +
+ Traditional industrial automation systems require specialized expertise to +operate and complex reprogramming to adapt to new processes. Large language +models offer the intelligence to make them more flexible and easier to use. +However, LLMs' application in industrial settings is underexplored. This paper +introduces a framework for integrating LLMs to achieve end-to-end control of +industrial automation systems. At the core of the framework are an agent system +designed for industrial tasks, a structured prompting method, and an +event-driven information modeling mechanism that provides real-time data for +LLM inference. The framework supplies LLMs with real-time events on different +context semantic levels, allowing them to interpret the information, generate +production plans, and control operations on the automation system. It also +supports structured dataset creation for fine-tuning on this downstream +application of LLMs. Our contribution includes a formal system design, +proof-of-concept implementation, and a method for generating task-specific +datasets for LLM fine-tuning and testing. This approach enables a more adaptive +automation system that can respond to spontaneous events, while allowing easier +operation and configuration through natural language for more intuitive +human-machine interaction. We provide demo videos and detailed data on GitHub: +https://github.com/YuchenXia/LLM4IAS + +
+
+
+
+
+ + ☆ Distributed Invariant Unscented Kalman Filter based on Inverse + Covariance Intersection with Intermittent Measurements + + +
+ This paper studies the problem of distributed state estimation (DSE) over +sensor networks on matrix Lie groups, which is crucial for applications where +system states evolve on Lie groups rather than vector spaces. We propose a +diffusion-based distributed invariant Unscented Kalman Filter using the inverse +covariance intersection (DIUKF-ICI) method to address target tracking in 3D +environments. Unlike existing distributed UKFs confined to vector spaces, our +approach extends the distributed UKF framework to Lie groups, enabling local +estimates to be fused with intermediate information from neighboring agents on +Lie groups. To handle the unknown correlations across local estimates, we +extend the ICI fusion strategy to matrix Lie groups for the first time and +integrate it into the diffusion algorithm. We demonstrate that the estimation +error of the proposed method is bounded. Additionally, the algorithm is fully +distributed, robust against intermittent measurements, and adaptable to +time-varying communication topologies. The effectiveness of the proposed method +is validated through extensive Monte-Carlo simulations. + +
+
+
+
+
+ + ☆ Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or + Low-light Conditions ECCV 2024 + + +
+ The stark contrast in the design philosophy of an event camera makes it +particularly ideal for operating under high-speed, high dynamic range and +low-light conditions, where standard cameras underperform. Nonetheless, event +cameras still suffer from some amount of motion blur, especially under these +challenging conditions, in contrary to what most think. This is attributed to +the limited bandwidth of the event sensor pixel, which is mostly proportional +to the light intensity. Thus, to ensure that event cameras can truly excel in +such conditions where it has an edge over standard cameras, it is crucial to +account for event motion blur in downstream applications, especially +reconstruction. However, none of the recent works on reconstructing Neural +Radiance Fields (NeRFs) from events, nor event simulators, have considered the +full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a +novel method to directly and effectively reconstruct blur-minimal NeRFs from +motion-blurred events generated under high-speed motion or low-light +conditions. The core component of this work is a physically-accurate pixel +bandwidth model proposed to account for event motion blur under arbitrary speed +and lighting conditions. We also introduce a novel threshold-normalized total +variation loss to improve the regularization of large textureless patches. +Experiments on real and novel realistically simulated sequences verify our +effectiveness. Our code, event simulator and synthetic event dataset will be +open-sourced. + +
+
+ comment: Accepted to ECCV 2024. Project website is accessible at + https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with + arXiv:2006.07722 by other authors +
+
+
+
+
+ + ☆ Intelligent Energy Management: Remaining Useful Life Prediction and + Charging Automation System Comprised of Deep Learning and the Internet of + Things + + +
+ Remaining Useful Life (RUL) of battery is an important parameter to know the +battery's remaining life and need for recharge. The goal of this research +project is to develop machine learning-based models for the battery RUL +dataset. Different ML models are developed to classify the RUL of the vehicle, +and the IoT (Internet of Things) concept is simulated for automating the +charging system and managing any faults aligning. The graphs plotted depict the +relationship between various vehicle parameters using the Blynk IoT platform. +Results show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent +Unit (GRU), and hybrid model developed could classify RUL into three classes +with 99% more accuracy. The data is fed using the tkinter GUI for simulating +artificial intelligence (AI)-based charging, and with a pyserial backend, data +can be entered into the Esp-32 microcontroller for making charge discharge +possible with the model's predictions. Also, with an IoT system, the charging +can be disconnected, monitored, and analyzed for automation. The results show +that an accuracy of 99% can be obtained on models MLP, catboost model and +similar accuracy on GRU model can be obtained, and finally relay-based +triggering can be made by prediction through the model used for automating the +charging and energy-saving mechanism. By showcasing an exemplary Blynk +platform-based monitoring and automation phenomenon, we further present +innovative ways of monitoring parameters and automating the system. + +
+
+
+
+
+ + ☆ Observer-Based Discontinuous Communication in the Secondary Control of + AC Microgrids + + +
+ This paper proposes an observer-based event-driven approach to decrease the +overuse of communication networks. The suggested approach aims to estimate the +required data for sharing between units in line with as much communication +reduction as possible. In other words, the proposed approach effectively +determines which state variables should be shared (observer concept) among the +units during specific time intervals (event-triggered concept). This strategy +significantly reduces the overall communication load. It is shown that the +estimation error remains bounded and Zeno behavior, characterized by an endless +number of transmissions occurring within a limited time frame, does not occur. +The proposed methodology can be systematically applied to any +communication-based secondary controller in alternating current (AC) +microgrids. Simulation results demonstrate a high degree of precision in +estimating the states under the proposed approach. Also, the secondary +controller performance under the proposed method is evaluated in +MATLAB/Simulink environment. + +
+
+ comment: 2024 IEEE PES Innovative Smart Grid Technologies Europe (ISGT Europe) +
+
+
+
+
+ + ☆ PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR + + +
+ LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous +driving, offering precise 3D spatial information. Previous signal attacks +against LiDAR systems mainly exploit laser signals. In this paper, we +investigate the possibility of cross-modality signal injection attacks, i.e., +injecting intentional electromagnetic interference (IEMI) to manipulate LiDAR +output. Our insight is that the internal modules of a LiDAR, i.e., the laser +receiving circuit, the monitoring sensors, and the beam-steering modules, even +with strict electromagnetic compatibility (EMC) testing, can still couple with +the IEMI attack signals and result in the malfunction of LiDAR systems. Based +on the above attack surfaces, we propose the PhantomLiDAR attack, which +manipulates LiDAR output in terms of Points Interference, Points Injection, +Points Removal, and even LiDAR Power-Off. We evaluate and demonstrate the +effectiveness of PhantomLiDAR with both simulated and real-world experiments on +five COTS LiDAR systems. We also conduct feasibility experiments in real-world +moving scenarios. We provide potential defense measures that can be implemented +at both the sensor level and the vehicle system level to mitigate the risks +associated with IEMI attacks. Video demonstrations can be viewed at +https://sites.google.com/view/phantomlidar. + +
+
+
+
+
+ + ☆ Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV + Attitude Control Under Varying Wind Conditions + + +
+ This paper evaluates and compares the performance of model-free and +model-based reinforcement learning for the attitude control of fixed-wing +unmanned aerial vehicles using PID as a reference point. The comparison focuses +on their ability to handle varying flight dynamics and wind disturbances in a +simulated environment. Our results show that the Temporal Difference Model +Predictive Control agent outperforms both the PID controller and other +model-free reinforcement learning methods in terms of tracking accuracy and +robustness over different reference difficulties, particularly in nonlinear +flight regimes. Furthermore, we introduce actuation fluctuation as a key metric +to assess energy efficiency and actuator wear, and we test two different +approaches from the literature: action variation penalty and conditioning for +action policy smoothness. We also evaluate all control methods when subject to +stochastic turbulence and gusts separately, so as to measure their effects on +tracking performance, observe their limitations and outline their implications +on the Markov decision process formalism. + +
+
+ comment: Published at ICINCO 2024 +
+
+
+
+
+ + ☆ Discontinuous Reception with Adjustable Inactivity Timer for IIoT + + +
+ Discontinuous reception (DRX) is a key technology for reducing the energy +consumption of industrial Internet of Things (IIoT) devices. Specifically, DRX +allows the devices to operate in a low-power mode when no data reception is +scheduled, and its effectiveness depends on the proper configuration of the DRX +parameters. In this paper, we characterize the DRX process departing from a +semi-Markov chain modeling. We detail two ways to set DRX parameters to +minimize the device power consumption while meeting a mean delay constraint. +The first method exhaustively searches for the optimal configuration. In +contrast, the second method uses a low-complexity metaheuristic to find a +sub-optimal configuration, thus considering ideal and practical DRX +configurations. Notably, within the DRX parameters, the inactivity timer (IT) +is a caution time that specifies how long a device remains active after the +last information exchange. Traditionally, a device implementing DRX will +restart the IT after each data reception as a precedent to a low-power mode. +The usual approach lies in restarting the IT whenever new data is received +during this cautious period, which might sometimes needlessly extend the active +time. Herein, we propose a more efficient method in which the transmit base +station (BS) explicitly indicates restarting the timer through the control +channel only when appropriate. The decision is taken based on the BS's +knowledge about its buffer status. We consider Poisson and bursty traffic +models, which are typical in IIoT setups, and verify the suitability of our +proposal for reducing the energy consumption of the devices without +significantly compromising the communication latency through extensive +numerical simulations. Specifically, energy-saving gains of up to 30% can be +obtained regardless of the arrival rate and delay constraints. + +
+
+ comment: IEEE Transactions on Industrial Informatics (2024) +
+
+
+
+
+ + ☆ Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations + Between Initial and Final Scenes + + +
+ With robots increasingly collaborating with humans in everyday tasks, it is +important to take steps toward robotic systems capable of understanding the +environment. This work focuses on scene understanding to detect pick and place +tasks given initial and final images from the scene. To this end, a dataset is +collected for object detection and pick and place task detection. A YOLOv5 +network is subsequently trained to detect the objects in the initial and final +scenes. Given the detected objects and their bounding boxes, two methods are +proposed to detect the pick and place tasks which transform the initial scene +into the final scene. A geometric method is proposed which tracks objects' +movements in the two scenes and works based on the intersection of the bounding +boxes which moved within scenes. Contrarily, the CNN-based method utilizes a +Convolutional Neural Network to classify objects with intersected bounding +boxes into 5 classes, showing the spatial relationship between the involved +objects. The performed pick and place tasks are then derived from analyzing the +experiments with both scenes. Results show that the CNN-based method, using a +VGG16 backbone, outscores the geometric method by roughly 12 percentage points +in certain scenarios, with an overall success rate of 84.3%. + +
+
+ comment: Conference Paper, ICEE 2024, 7 pages, 5 figures +
+
+
+
+
+ + ☆ On the Output Redundancy of LTI Systems: A Geometric Approach with + Application to Privacy + + +
+ This paper examines the properties of output-redundant systems, that is, +systems possessing a larger number of outputs than inputs, through the lenses +of the geometric approach of Wonham et al. We begin by formulating a simple +output allocation synthesis problem, which involves ``concealing" input +information from a malicious eavesdropper having access to the system output, +while still allowing for a legitimate user to reconstruct it. It is shown that +the solvability of this problem requires the availability of a redundant set of +outputs. This very problem is instrumental to unveiling the fundamental +geometric properties of output-redundant systems, which form the basis for our +subsequent constructions and results. As a direct application, we demonstrate +how output allocation can be employed to effectively protect the information of +input information from certain output eavesdroppers with guaranteed results. + +
+
+
+
+
+ + ☆ Semantic model for the description of energy data in the Module Type + Package + + +
+ Modular production systems that employ the Module Type Package (MTP) to +describe module interfaces can, at present, only communicate energy data +through proprietary solutions. Due to this limitation, users face additional +effort when calculating energy KPIs for modules or determining the energy +efficiency of modules. To address this issue, we present a model that +facilitates energy data to be described semantically and uniformly in the MTP +on the basis of an industrial standard (OPC 34100). MTPs incorporating this +model can transmit semantically consistent energy data from modules to the +process control system, making the data available for further applications, +such as monitoring or optimization. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Stereographic Projection of Probabilistic Frequency-Domain Uncertainty + + +
+ This paper investigates the stereographic projection of points along the +Nyquist plots of single input single output (SISO) linear time invariant (LTI) +systems subject to probabilistic uncertainty. At each frequency, there +corresponds a complex-valued random variable with given probability +distribution in the complex plane. The chordal distance between the +stereographic projections of this complex value and the corresponding value for +a nominal model, as per the well-known Nu-Gap metric of Vinnicombe, is also a +random quantity. The main result provides the cumulative density function (CDF) +of the chordal distance at a given frequency. Such a stochastic distance +framework opens up a fresh and a fertile research direction on probabilistic +robust control theory. + +
+
+
+
+
+ + ☆ GLinSAT: The General Linear Satisfiability Neural Network Layer By + Accelerated Gradient Descent + + +
+ Ensuring that the outputs of neural networks satisfy specific constraints is +crucial for applying neural networks to real-life decision-making problems. In +this paper, we consider making a batch of neural network outputs satisfy +bounded and general linear constraints. We first reformulate the neural network +output projection problem as an entropy-regularized linear programming problem. +We show that such a problem can be equivalently transformed into an +unconstrained convex optimization problem with Lipschitz continuous gradient +according to the duality theorem. Then, based on an accelerated gradient +descent algorithm with numerical performance enhancement, we present our +architecture, GLinSAT, to solve the problem. To the best of our knowledge, this +is the first general linear satisfiability layer in which all the operations +are differentiable and matrix-factorization-free. Despite the fact that we can +explicitly perform backpropagation based on automatic differentiation +mechanism, we also provide an alternative approach in GLinSAT to calculate the +derivatives based on implicit differentiation of the optimality condition. +Experimental results on constrained traveling salesman problems, partial graph +matching with outliers, predictive portfolio allocation and power system unit +commitment demonstrate the advantages of GLinSAT over existing satisfiability +layers. + +
+
+
+
+
+ + ☆ Optimal control of stochastic reaction networks with entropic control + cost and emergence of mode-switching strategies + + +
+ Controlling the stochastic dynamics of biological populations is a challenge +that arises across various biological contexts. However, these dynamics are +inherently nonlinear and involve a discrete state space, i.e., the number of +molecules, cells, or organisms. Additionally, the possibility of extinction has +a significant impact on both the dynamics and control strategies, particularly +when the population size is small. These factors hamper the direct application +of conventional control theories to biological systems. To address these +challenges, we formulate the optimal control problem for stochastic population +dynamics by utilizing a control cost function based on the Kullback-Leibler +divergence. This approach naturally accounts for population-specific factors +and simplifies the complex nonlinear Hamilton-Jacobi-Bellman equation into a +linear form, facilitating efficient computation of optimal solutions. We +demonstrate the effectiveness of our approach by applying it to the control of +interacting random walkers, Moran processes, and SIR models, and observe the +mode-switching phenomena in the control strategies. Our approach provides new +opportunities for applying control theory to a wide range of biological +problems. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Network-aware Recommender System via Online Feedback Optimization + + +
+ Personalized content on social platforms can exacerbate negative phenomena +such as polarization, partly due to the feedback interactions between +recommendations and the users. In this paper, we present a control-theoretic +recommender system that explicitly accounts for this feedback loop to mitigate +polarization. Our approach extends online feedback optimization - a control +paradigm for steady-state optimization of dynamical systems - to develop a +recommender system that trades off users engagement and polarization reduction, +while relying solely on online click data. We establish theoretical guarantees +for optimality and stability of the proposed design and validate its +effectiveness via numerical experiments with a user population governed by +Friedkin-Johnsen dynamics. Our results show these "network-aware" +recommendations can significantly reduce polarization while maintaining high +levels of user engagement. + +
+
+
+
+
+ + ♻ ☆ Data-based approaches to learning and control by similarity between + heterogeneous systems + + +
+ This paper proposes basic definitions of similarity and similarity indexes +between admissible behaviors of heterogeneous host and guest systems and +further presents a similarity-based learning control framework by exploiting +the offline sampled data. By exploring helpful geometric properties of the +admissible behavior and decomposing it into the subspace and offset components, +the similarity indexes between two admissible behaviors are defined as the +principal angles between their corresponding subspace components. By +reconstructing the admissible behaviors leveraging sampled data, an efficient +strategy for calculating the similarity indexes is developed, based on which a +similarity-based learning control framework is proposed. It is shown that, with +the application of similarity-based learning control, the host system can +directly accomplish the same control tasks by utilizing the successful +experience provided by the guest system, without having to undergo the +trial-and-error process. All results in this paper are supported by simulation +examples. + +
+
+
+
+
+ + ♻ ☆ Data-Driven Abstractions for Control Systems via Random Exploration + + +
+ At the intersection of dynamical systems, control theory, and formal methods +lies the construction of symbolic abstractions: these typically represent +simpler, finite-state models whose behavior mimics that of an underlying +concrete system but are easier to analyse. Building an abstraction usually +requires an accurate knowledge of the underlying model: this knowledge may be +costly to gather, especially in real-life applications. We aim to bridge this +gap by building abstractions based on sampling finite length trajectories. To +refine a controller built for the abstraction to one for the concrete system, +we newly define a notion of probabilistic alternating simulation, and provide +Probably Approximately Correct (PAC) guarantees that the constructed +abstraction includes all behaviors of the concrete system and that it is +suitable for control design, for arbitrarily long time horizons, leveraging +scenario theory. Our method is then tested on several numerical benchmarks. + +
+
+
+
+
+ + ♻ ☆ Adaptive Control of an Inverted Pendulum by a Reinforcement + Learning-based LQR Method + + +
+ Inverted pendulums constitute one of the popular systems for benchmarking +control algorithms. Several methods have been proposed for the control of this +system, the majority of which rely on the availability of a mathematical model. +However, deriving a mathematical model using physical parameters or system +identification techniques requires manual effort. Moreover, the designed +controllers may perform poorly if system parameters change. To mitigate these +problems, recently, some studies used Reinforcement Learning (RL) based +approaches for the control of inverted pendulum systems. Unfortunately, these +methods suffer from slow convergence and local minimum problems. Moreover, they +may require hyperparameter tuning which complicates the design process +significantly. To alleviate these problems, the present study proposes an +LQR-based RL method for adaptive balancing control of an inverted pendulum. As +shown by numerical experiments, the algorithm stabilizes the system very fast +without requiring a mathematical model or extensive hyperparameter tuning. In +addition, it can adapt to parametric changes online. + +
+
+
+
+
+ + ♻ ☆ Convection-Enabled Boundary Control of a 2D Channel Flow + + +
+ Nonlinear convection, the source of turbulence in fluid flows, may hold the +key to stabilizing turbulence by solving a specific cubic polynomial equation. +We consider the incompressible Navier-Stokes equations in a two-dimensional +channel. The tangential and normal velocities are assumed to be periodic in the +streamwise direction. The pressure difference between the left and right ends +of the channel is constant. Moreover, we consider no-slip boundary conditions, +that is, zero tangential velocity, at the top and bottom walls of the channel, +and normal velocity actuation at the top and bottom walls. We design the +boundary control inputs to achieve global exponential stabilization, in the L2 +sense, of a chosen Poiseuille equilibrium profile for an arbitrarily large +Reynolds number. The key idea behind our approach is to select the boundary +controllers such that they have zero spatial mean (to guarantee mass +conservation) but non-zero spatial cubic mean. We reveal that, because of +convection, the time derivative of the L2 energy of the regulation error is a +cubic polynomial in the cubic mean of the boundary inputs. Regulation is then +achieved by solving a specific cubic equation, using the Cardano root formula. +The results are illustrated via a numerical example. + +
+
+ comment: To be presented at the 63rd IEEE Conference on Decision and Control + (CDC 2024) +
+
+
+
+
+ + ♻ ☆ Safe stabilization using generalized Lyapunov barrier function + + +
+ This paper addresses the safe stabilization problem, focusing on controlling +the system state to the origin while avoiding entry into unsafe state sets. The +current methods for solving this issue rely on smooth Lyapunov and barrier +functions, which do not always ensure the existence of an effective controller +even when such smooth functions are created. To tackle this challenge, we +introduce the concept of a generalized (nonsmooth) Lyapunov barrier function +(GenLBF), which guarantees the existence of a safe and stable controller. We +outline a systematic approach for constructing a GenLBF, including a technique +for efficiently calculating the upper generalized derivative of the GenLBF. +Using the constructed GenLBF, we propose a method for certifying safe +stabilization of autonomous systems and design a piecewise continuous feedback +control to achieve safe stabilization of non-autonomous systems. A general +controller refinement strategy is further proposed to help the state trajectory +escape from undesired local points occurring in systems with special physical +structure. A thorough theoretical analysis demonstrates the effectiveness of +our method in addressing the safe stabilization problem for systems with single +or multiple bounded unsafe state sets. Extensive simulations of linear and +nonlinear systems further illustrate the efficacy of the proposed method and +its superiority over the smooth control Lyapunov barrier function method. + +
+
+ comment: 19 pages, 14 figures, under review by a journal +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ The Top Manifold Connectedness of Quantum Control Landscapes + + +
+ The control of quantum systems has been proven to possess trap-free +optimization landscapes under the satisfaction of proper assumptions. However, +many details of the landscape geometry and their influence on search efficiency +still need to be fully understood. This paper numerically explores the +path-connectedness of globally optimal control solutions forming the top +manifold of the landscape. We randomly sample a plurality of optimal controls +in the top manifold to assess the existence of a continuous path at the top of +the landscape that connects two arbitrary optimal solutions. It is shown that +for different quantum control objectives including state-to-state transition +probabilities, observable expectation values and unitary transformations, such +a continuous path can be readily found, implying that these top manifolds are +fundamentally path-connected. The significance of the latter conjecture lies in +seeking locations in the top manifold where an ancillary objective can also be +optimized while maintaining the full optimality of the original objective that +defined the landscape. + +
+
+ comment: 34 pages, 10 figures +
+
+
+
+
+
+
+
+ + Artificial Intelligence 150 + +
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ AI-Powered Augmented Reality for Satellite Assembly, Integration and + Test + + +
+ The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is +set to transform satellite Assembly, Integration, and Testing (AIT) processes +by enhancing precision, minimizing human error, and improving operational +efficiency in cleanroom environments. This paper presents a technical +description of the European Space Agency's (ESA) project "AI for AR in +Satellite AIT," which combines real-time computer vision and AR systems to +assist technicians during satellite assembly. Leveraging Microsoft HoloLens 2 +as the AR interface, the system delivers context-aware instructions and +real-time feedback, tackling the complexities of object recognition and 6D pose +estimation in AIT workflows. All AI models demonstrated over 70% accuracy, with +the detection model exceeding 95% accuracy, indicating a high level of +performance and reliability. A key contribution of this work lies in the +effective use of synthetic data for training AI models in AR applications, +addressing the significant challenges of obtaining real-world datasets in +highly dynamic satellite environments, as well as the creation of the Segmented +Anything Model for Automatic Labelling (SAMAL), which facilitates the automatic +annotation of real data, achieving speeds up to 20 times faster than manual +human annotation. The findings demonstrate the efficacy of AI-driven AR systems +in automating critical satellite assembly tasks, setting a foundation for +future innovations in the space industry. + +
+
+
+
+
+ + ☆ EfficientCrackNet: A Lightweight Model for Crack Segmentation + + +
+ Crack detection, particularly from pavement images, presents a formidable +challenge in the domain of computer vision due to several inherent complexities +such as intensity inhomogeneity, intricate topologies, low contrast, and noisy +backgrounds. Automated crack detection is crucial for maintaining the +structural integrity of essential infrastructures, including buildings, +pavements, and bridges. Existing lightweight methods often face challenges +including computational inefficiency, complex crack patterns, and difficult +backgrounds, leading to inaccurate detection and impracticality for real-world +applications. To address these limitations, we propose EfficientCrackNet, a +lightweight hybrid model combining Convolutional Neural Networks (CNNs) and +transformers for precise crack segmentation. EfficientCrackNet integrates +depthwise separable convolutions (DSC) layers and MobileViT block to capture +both global and local features. The model employs an Edge Extraction Method +(EEM) and for efficient crack edge detection without pretraining, and +Ultra-Lightweight Subspace Attention Module (ULSAM) to enhance feature +extraction. Extensive experiments on three benchmark datasets Crack500, +DeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior +performance compared to existing lightweight models, while requiring only 0.26M +parameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance +between accuracy and computational efficiency, outperforming state-of-the-art +lightweight models, and providing a robust and adaptable solution for +real-world crack segmentation. + +
+
+
+
+
+ + ☆ DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion + Probabilistic Models + + +
+ Perception systems play a crucial role in autonomous driving, incorporating +multiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors +are widely used to capture sparse point clouds of the vehicle's surroundings. +However, such systems struggle to perceive occluded areas and gaps in the scene +due to the sparsity of these point clouds and their lack of semantics. To +address these challenges, Semantic Scene Completion (SSC) jointly predicts +unobserved geometry and semantics in the scene given raw LiDAR measurements, +aiming for a more complete scene representation. Building on promising results +of diffusion models in image generation and super-resolution tasks, we propose +their extension to SSC by implementing the noising and denoising diffusion +processes in the point and semantic spaces individually. To control the +generation, we employ semantic LiDAR point clouds as conditional input and +design local and global regularization losses to stabilize the denoising +process. We evaluate our approach on autonomous driving datasets and our +approach outperforms the state-of-the-art for SSC. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ GSON: A Group-based Social Navigation Framework with Large Multimodal + Model + + +
+ As the number of service robots and autonomous vehicles in human-centered +environments grows, their requirements go beyond simply navigating to a +destination. They must also take into account dynamic social contexts and +ensure respect and comfort for others in shared spaces, which poses significant +challenges for perception and planning. In this paper, we present a group-based +social navigation framework GSON to enable mobile robots to perceive and +exploit the social group of their surroundings by leveling the visual reasoning +capability of the Large Multimodal Model (LMM). For perception, we apply visual +prompting techniques to zero-shot extract the social relationship among +pedestrians and combine the result with a robust pedestrian detection and +tracking pipeline to alleviate the problem of low inference speed of the LMM. +Given the perception result, the planning system is designed to avoid +disrupting the current social structure. We adopt a social structure-based +mid-level planner as a bridge between global path planning and local motion +planning to preserve the global context and reactive response. The proposed +method is validated on real-world mobile robot navigation tasks involving +complex social structure understanding and reasoning. Experimental results +demonstrate the effectiveness of the system in these scenarios compared with +several baselines. + +
+
+
+
+
+ + ☆ SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language + Models for Robotic Garment Manipulation + + +
+ Automating garment manipulation poses a significant challenge for assistive +robotics due to the diverse and deformable nature of garments. Traditional +approaches typically require separate models for each garment type, which +limits scalability and adaptability. In contrast, this paper presents a unified +approach using vision-language models (VLMs) to improve keypoint prediction +across various garment categories. By interpreting both visual and semantic +information, our model enables robots to manage different garment states with a +single model. We created a large-scale synthetic dataset using advanced +simulation techniques, allowing scalable training without extensive real-world +data. Experimental results indicate that the VLM-based method significantly +enhances keypoint detection accuracy and task success rates, providing a more +flexible and general solution for robotic garment manipulation. In addition, +this research also underscores the potential of VLMs to unify various garment +manipulation tasks within a single framework, paving the way for broader +applications in home automation and assistive robotics for future. + +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ FreeEdit: Mask-free Reference-based Image Editing with Multi-modal + Instruction + + +
+ Introducing user-specified visual concepts in image editing is highly +practical as these concepts convey the user's intent more precisely than +text-based descriptions. We propose FreeEdit, a novel approach for achieving +such reference-based image editing, which can accurately reproduce the visual +concept from the reference image based on user-friendly language instructions. +Our approach leverages the multi-modal instruction encoder to encode language +instructions to guide the editing process. This implicit way of locating the +editing area eliminates the need for manual editing masks. To enhance the +reconstruction of reference details, we introduce the Decoupled Residual +ReferAttention (DRRA) module. This module is designed to integrate fine-grained +reference features extracted by a detail extractor into the image editing +process in a residual way without interfering with the original self-attention. +Given that existing datasets are unsuitable for reference-based image editing +tasks, particularly due to the difficulty in constructing image triplets that +include a reference image, we curate a high-quality dataset, FreeBench, using a +newly developed twice-repainting scheme. FreeBench comprises the images before +and after editing, detailed editing instructions, as well as a reference image +that maintains the identity of the edited object, encompassing tasks such as +object addition, replacement, and deletion. By conducting phased training on +FreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot +editing through convenient language instructions. We conduct extensive +experiments to evaluate the effectiveness of FreeEdit across multiple task +types, demonstrating its superiority over existing methods. The code will be +available at: https://freeedit.github.io/. + +
+
+ comment: 14 pages, 14 figures, project website: https://freeedit.github.io/ +
+
+
+
+
+ + ☆ Visual Data Diagnosis and Debiasing with Concept Graphs + + +
+ The widespread success of deep learning models today is owed to the curation +of extensive datasets significant in size and complexity. However, such models +frequently pick up inherent biases in the data during the training process, +leading to unreliable predictions. Diagnosing and debiasing datasets is thus a +necessity to ensure reliable model performance. In this paper, we present +CONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence +Biases in visual datasets. CONBIAS represents visual datasets as knowledge +graphs of concepts, enabling meticulous analysis of spurious concept +co-occurrences to uncover concept imbalances across the whole dataset. +Moreover, we show that by employing a novel clique-based concept balancing +strategy, we can mitigate these imbalances, leading to enhanced performance on +downstream tasks. Extensive experiments show that data augmentation based on a +balanced concept distribution augmented by CONBIAS improves generalization +performance across multiple datasets compared to state-of-the-art methods. We +will make our code and data publicly available. + +
+
+
+
+
+ + ☆ DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving + + +
+ We present a novel autonomous driving framework, DualAD, designed to imitate +human reasoning during driving. DualAD comprises two layers: a rule-based +motion planner at the bottom layer that handles routine driving tasks requiring +minimal reasoning, and an upper layer featuring a rule-based text encoder that +converts driving scenarios from absolute states into text description. This +text is then processed by a large language model (LLM) to make driving +decisions. The upper layer intervenes in the bottom layer's decisions when +potential danger is detected, mimicking human reasoning in critical situations. +Closed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained +model, significantly outperforms rule-based motion planners that lack reasoning +abilities. Our experiments also highlight the effectiveness of the text +encoder, which considerably enhances the model's scenario understanding. +Additionally, the integrated DualAD model improves with stronger LLMs, +indicating the framework's potential for further enhancement. We make code and +benchmarks publicly available. + +
+
+ comment: Autonomous Driving, Large Language Models (LLMs), Human Reasoning, + Critical Scenario +
+
+
+
+
+ + ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems - which account for almost all current +AI - can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborates on a search task assigned by a human. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams ICRA 2025 + + +
+ This paper presents a novel approach to multi-robot planning and +collaboration. We demonstrate a cognitive strategy for robots in human-robot +teams that incorporates metacognition, natural language communication, and +explainability. The system is embodied using the HARMONIC architecture that +flexibly integrates cognitive and control capabilities across the team. We +evaluate our approach through simulation experiments involving a joint search +task by a team of heterogeneous robots (a UGV and a drone) and a human. We +detail the system's handling of complex, real-world scenarios, effective action +coordination between robots with different capabilities, and natural +human-robot communication. This work demonstrates that the robots' ability to +reason about plans, goals, and attitudes, and to provide explanations for +actions and decisions are essential prerequisites for realistic human-robot +teaming. + +
+
+ comment: Submitted to ICRA 2025 Conference, Atlanta, GA, USA +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ HARMONIC: A Framework for Explanatory Cognitive Robots ICRA + + +
+ We present HARMONIC, a framework for implementing cognitive robots that +transforms general-purpose robots into trusted teammates capable of complex +decision-making, natural communication and human-level explanation. The +framework supports interoperability between a strategic (cognitive) layer for +high-level decision-making and a tactical (robot) layer for low-level control +and execution. We describe the core features of the framework and our initial +implementation, in which HARMONIC was deployed on a simulated UGV and drone +involved in a multi-robot search and retrieval task. + +
+
+ comment: Accepted for presentation at ICRA@40. 23-26 September 2024, + Rotterdam, Netherlands +
+
+
+
+
+ + ☆ Compositional Hardness of Code in Large Language Models -- A + Probabilistic Perspective + + +
+ A common practice in large language model (LLM) usage for complex analytical +tasks such as code generation, is to sample a solution for the entire task +within the model's context window. Previous works have shown that subtask +decomposition within the model's context (chain of thought), is beneficial for +solving such tasks. In this work, we point a limitation of LLMs' ability to +perform several sub-tasks within the same context window - an in-context +hardness of composition, pointing to an advantage for distributing a decomposed +problem in a multi-agent system of LLMs. The hardness of composition is +quantified by a generation complexity metric, i.e., the number of LLM +generations required to sample at least one correct solution. We find a gap +between the generation complexity of solving a compositional problem within the +same context relative to distributing it among multiple agents, that increases +exponentially with the solution's length. We prove our results theoretically +and demonstrate them empirically. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ Transferring disentangled representations: bridging the gap between + synthetic and real images + + +
+ Developing meaningful and efficient representations that separate the +fundamental structure of the data generation mechanism is crucial in +representation learning. However, Disentangled Representation Learning has not +fully shown its potential on real images, because of correlated generative +factors, their resolution and limited access to ground truth labels. +Specifically on the latter, we investigate the possibility of leveraging +synthetic data to learn general-purpose disentangled representations applicable +to real data, discussing the effect of fine-tuning and what properties of +disentanglement are preserved after the transfer. We provide an extensive +empirical study to address these issues. In addition, we propose a new +interpretable intervention-based metric, to measure the quality of factors +encoding in the representation. Our results indicate that some level of +disentanglement, transferring a representation from synthetic to real data, is +possible and effective. + +
+
+
+
+
+ + ☆ Role-RL: Online Long-Context Processing with Role Reinforcement Learning + for Distinct LLMs in Their Optimal Roles + + +
+ Large language models (LLMs) with long-context processing are still +challenging because of their implementation complexity, training efficiency and +data sparsity. To address this issue, a new paradigm named Online Long-context +Processing (OLP) is proposed when we process a document of unlimited length, +which typically occurs in the information reception and organization of diverse +streaming media such as automated news reporting, live e-commerce, and viral +short videos. Moreover, a dilemma was often encountered when we tried to select +the most suitable LLM from a large number of LLMs amidst explosive growth +aiming for outstanding performance, affordable prices, and short response +delays. In view of this, we also develop Role Reinforcement Learning (Role-RL) +to automatically deploy different LLMs in their respective roles within the OLP +pipeline according to their actual performance. Extensive experiments are +conducted on our OLP-MINI dataset and it is found that OLP with Role-RL +framework achieves OLP benchmark with an average recall rate of 93.2% and the +LLM cost saved by 79.4%. The code and dataset are publicly available at: +https://anonymous.4open.science/r/Role-RL. + +
+
+
+
+
+ + ☆ Control Industrial Automation System with Large Language Models + + +
+ Traditional industrial automation systems require specialized expertise to +operate and complex reprogramming to adapt to new processes. Large language +models offer the intelligence to make them more flexible and easier to use. +However, LLMs' application in industrial settings is underexplored. This paper +introduces a framework for integrating LLMs to achieve end-to-end control of +industrial automation systems. At the core of the framework are an agent system +designed for industrial tasks, a structured prompting method, and an +event-driven information modeling mechanism that provides real-time data for +LLM inference. The framework supplies LLMs with real-time events on different +context semantic levels, allowing them to interpret the information, generate +production plans, and control operations on the automation system. It also +supports structured dataset creation for fine-tuning on this downstream +application of LLMs. Our contribution includes a formal system design, +proof-of-concept implementation, and a method for generating task-specific +datasets for LLM fine-tuning and testing. This approach enables a more adaptive +automation system that can respond to spontaneous events, while allowing easier +operation and configuration through natural language for more intuitive +human-machine interaction. We provide demo videos and detailed data on GitHub: +https://github.com/YuchenXia/LLM4IAS + +
+
+
+
+
+ + ☆ Joint Localization and Planning using Diffusion ICRA 2025 + + +
+ Diffusion models have been successfully applied to robotics problems such as +manipulation and vehicle path planning. In this work, we explore their +application to end-to-end navigation -- including both perception and planning +-- by considering the problem of jointly performing global localization and +path planning in known but arbitrary 2D environments. In particular, we +introduce a diffusion model which produces collision-free paths in a global +reference frame given an egocentric LIDAR scan, an arbitrary map, and a desired +goal position. To this end, we implement diffusion in the space of paths in +SE(2), and describe how to condition the denoising process on both obstacles +and sensor observations. In our evaluation, we show that the proposed +conditioning techniques enable generalization to realistic maps of considerably +different appearance than the training environment, demonstrate our model's +ability to accurately describe ambiguous solutions, and run extensive +simulation experiments showcasing our model's use as a real-time, end-to-end +localization and planning stack. + +
+
+ comment: 7 pages, 9 figures. Submitted to ICRA 2025, under review +
+
+
+
+
+ + ☆ CRoP: Context-wise Robust Static Human-Sensing Personalization + + +
+ The advancement in deep learning and internet-of-things have led to diverse +human sensing applications. However, distinct patterns in human sensing, +influenced by various factors or contexts, challenge generic neural network +model's performance due to natural distribution shifts. To address this, +personalization tailors models to individual users. Yet most personalization +studies overlook intra-user heterogeneity across contexts in sensory data, +limiting intra-user generalizability. This limitation is especially critical in +clinical applications, where limited data availability hampers both +generalizability and personalization. Notably, intra-user sensing attributes +are expected to change due to external factors such as treatment progression, +further complicating the challenges.This work introduces CRoP, a novel static +personalization approach using an off-the-shelf pre-trained model and pruning +to optimize personalization and generalization. CRoP shows superior +personalization effectiveness and intra-user robustness across four +human-sensing datasets, including two from real-world health domains, +highlighting its practical and social impact. Additionally, to support CRoP's +generalization ability and design choices, we provide empirical justification +through gradient inner product analysis, ablation studies, and comparisons +against state-of-the-art baselines. + +
+
+ comment: 31 pages, 10 figues and 13 tables +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ Enhancing elusive clues in knowledge learning by contrasting attention + of language models + + +
+ Causal language models acquire vast amount of knowledge from general text +corpus during pretraining, but the efficiency of knowledge learning is known to +be unsatisfactory, especially when learning from knowledge-dense and +small-sized corpora. The deficiency can come from long-distance dependencies +which are hard to capture by language models, and overfitting to co-occurrence +patterns and distracting clues in the training text. To address these issues, +the paper proposes a method to enhance knowledge learning during language model +pretraining, by enhancing elusive but important clues in text discovered by the +language model themselves. We found that larger language models pay more +attention to non-obvious but important clues, which are often overlooked by +smaller language models. Therefore, we can identify these clues by contrasting +the attention weights of large and small language models. We use the identified +clues as a guide to perform token-dropout data augmentation on the training +text, and observed a significant boost in both small and large models' +performance in fact memorization. This shows that the behavior contrast between +more and less-performant language models contains important clues for knowledge +learning, and it can be ``amplified" for a straight-forward improvement in +knowledge learning efficiency. + +
+
+ comment: 7 pages and 17 figures +
+
+
+
+
+ + ☆ Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge + Distillation + + +
+ Despite being widely applied due to their exceptional capabilities, Large +Language Models (LLMs) have been proven to be vulnerable to backdoor attacks. +These attacks introduce targeted vulnerabilities into LLMs by poisoning +training samples and full-parameter fine-tuning. However, this kind of backdoor +attack is limited since they require significant computational resources, +especially as the size of LLMs increases. Besides, parameter-efficient +fine-tuning (PEFT) offers an alternative but the restricted parameter updating +may impede the alignment of triggers with target labels. In this study, we +first verify that backdoor attacks with PEFT may encounter challenges in +achieving feasible performance. To address these issues and improve the +effectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack +algorithm from weak to strong based on contrastive knowledge distillation +(W2SAttack). Specifically, we poison small-scale language models through +full-parameter fine-tuning to serve as the teacher model. The teacher model +then covertly transfers the backdoor to the large-scale student model through +contrastive knowledge distillation, which employs PEFT. Theoretical analysis +reveals that W2SAttack has the potential to augment the effectiveness of +backdoor attacks. We demonstrate the superior performance of W2SAttack on +classification tasks across four language models, four backdoor attack +algorithms, and two different architectures of teacher models. Experimental +results indicate success rates close to 100% for backdoor attacks targeting +PEFT. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Intelligent Energy Management: Remaining Useful Life Prediction and + Charging Automation System Comprised of Deep Learning and the Internet of + Things + + +
+ Remaining Useful Life (RUL) of battery is an important parameter to know the +battery's remaining life and need for recharge. The goal of this research +project is to develop machine learning-based models for the battery RUL +dataset. Different ML models are developed to classify the RUL of the vehicle, +and the IoT (Internet of Things) concept is simulated for automating the +charging system and managing any faults aligning. The graphs plotted depict the +relationship between various vehicle parameters using the Blynk IoT platform. +Results show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent +Unit (GRU), and hybrid model developed could classify RUL into three classes +with 99% more accuracy. The data is fed using the tkinter GUI for simulating +artificial intelligence (AI)-based charging, and with a pyserial backend, data +can be entered into the Esp-32 microcontroller for making charge discharge +possible with the model's predictions. Also, with an IoT system, the charging +can be disconnected, monitored, and analyzed for automation. The results show +that an accuracy of 99% can be obtained on models MLP, catboost model and +similar accuracy on GRU model can be obtained, and finally relay-based +triggering can be made by prediction through the model used for automating the +charging and energy-saving mechanism. By showcasing an exemplary Blynk +platform-based monitoring and automation phenomenon, we further present +innovative ways of monitoring parameters and automating the system. + +
+
+
+
+
+ + ☆ Pioneering Reliable Assessment in Text-to-Image Knowledge Editing: + Leveraging a Fine-Grained Dataset and an Innovative Criterion EMNLP24 + + +
+ During pre-training, the Text-to-Image (T2I) diffusion models encode factual +knowledge into their parameters. These parameterized facts enable realistic +image generation, but they may become obsolete over time, thereby +misrepresenting the current state of the world. Knowledge editing techniques +aim to update model knowledge in a targeted way. However, facing the dual +challenges posed by inadequate editing datasets and unreliable evaluation +criterion, the development of T2I knowledge editing encounter difficulties in +effectively generalizing injected knowledge. In this work, we design a T2I +knowledge editing framework by comprehensively spanning on three phases: First, +we curate a dataset \textbf{CAKE}, comprising paraphrase and multi-object test, +to enable more fine-grained assessment on knowledge generalization. Second, we +propose a novel criterion, \textbf{adaptive CLIP threshold}, to effectively +filter out false successful images under the current criterion and achieve +reliable editing evaluation. Finally, we introduce \textbf{MPE}, a simple but +effective approach for T2I knowledge editing. Instead of tuning parameters, MPE +precisely recognizes and edits the outdated part of the conditioning +text-prompt to accommodate the up-to-date knowledge. A straightforward +implementation of MPE (Based on in-context learning) exhibits better overall +performance than previous model editors. We hope these efforts can further +promote faithful evaluation of T2I knowledge editing methods. + +
+
+ comment: EMNLP24 Findings +
+
+
+
+
+ + ☆ Navigation in a simplified Urban Flow through Deep Reinforcement + Learning + + +
+ The increasing number of unmanned aerial vehicles (UAVs) in urban +environments requires a strategy to minimize their environmental impact, both +in terms of energy efficiency and noise reduction. In order to reduce these +concerns, novel strategies for developing prediction models and optimization of +flight planning, for instance through deep reinforcement learning (DRL), are +needed. Our goal is to develop DRL algorithms capable of enabling the +autonomous navigation of UAVs in urban environments, taking into account the +presence of buildings and other UAVs, optimizing the trajectories in order to +reduce both energetic consumption and noise. This is achieved using fluid-flow +simulations which represent the environment in which UAVs navigate and training +the UAV as an agent interacting with an urban environment. In this work, we +consider a domain domain represented by a two-dimensional flow field with +obstacles, ideally representing buildings, extracted from a three-dimensional +high-fidelity numerical simulation. The presented methodology, using PPO+LSTM +cells, was validated by reproducing a simple but fundamental problem in +navigation, namely the Zermelo's problem, which deals with a vessel navigating +in a turbulent flow, travelling from a starting point to a target location, +optimizing the trajectory. The current method shows a significant improvement +with respect to both a simple PPO and a TD3 algorithm, with a success rate (SR) +of the PPO+LSTM trained policy of 98.7%, and a crash rate (CR) of 0.1%, +outperforming both PPO (SR = 75.6%, CR=18.6%) and TD3 (SR=77.4% and CR=14.5%). +This is the first step towards DRL strategies which will guide UAVs in a +three-dimensional flow field using real-time signals, making the navigation +efficient in terms of flight time and avoiding damages to the vehicle. + +
+
+
+
+
+ + ☆ PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR + + +
+ LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous +driving, offering precise 3D spatial information. Previous signal attacks +against LiDAR systems mainly exploit laser signals. In this paper, we +investigate the possibility of cross-modality signal injection attacks, i.e., +injecting intentional electromagnetic interference (IEMI) to manipulate LiDAR +output. Our insight is that the internal modules of a LiDAR, i.e., the laser +receiving circuit, the monitoring sensors, and the beam-steering modules, even +with strict electromagnetic compatibility (EMC) testing, can still couple with +the IEMI attack signals and result in the malfunction of LiDAR systems. Based +on the above attack surfaces, we propose the PhantomLiDAR attack, which +manipulates LiDAR output in terms of Points Interference, Points Injection, +Points Removal, and even LiDAR Power-Off. We evaluate and demonstrate the +effectiveness of PhantomLiDAR with both simulated and real-world experiments on +five COTS LiDAR systems. We also conduct feasibility experiments in real-world +moving scenarios. We provide potential defense measures that can be implemented +at both the sensor level and the vehicle system level to mitigate the risks +associated with IEMI attacks. Video demonstrations can be viewed at +https://sites.google.com/view/phantomlidar. + +
+
+
+
+
+ + ☆ Learning to Love Edge Cases in Formative Math Assessment: Using the + AMMORE Dataset and Chain-of-Thought Prompting to Improve Grading Accuracy + + +
+ This paper introduces AMMORE, a new dataset of 53,000 math open-response +question-answer pairs from Rori, a learning platform used by students in +several African countries and conducts two experiments to evaluate the use of +large language models (LLM) for grading particularly challenging student +answers. The AMMORE dataset enables various potential analyses and provides an +important resource for researching student math acquisition in understudied, +real-world, educational contexts. In experiment 1 we use a variety of +LLM-driven approaches, including zero-shot, few-shot, and chain-of-thought +prompting, to grade the 1% of student answers that a rule-based classifier +fails to grade accurately. We find that the best-performing approach -- +chain-of-thought prompting -- accurately scored 92% of these edge cases, +effectively boosting the overall accuracy of the grading from 98.7% to 99.9%. +In experiment 2, we aim to better understand the consequential validity of the +improved grading accuracy, by passing grades generated by the best-performing +LLM-based approach to a Bayesian Knowledge Tracing (BKT) model, which estimated +student mastery of specific lessons. We find that relatively modest +improvements in model accuracy at the individual question level can lead to +significant changes in the estimation of student mastery. Where the rules-based +classifier currently used to grade student, answers misclassified the mastery +status of 6.9% of students across their completed lessons, using the LLM +chain-of-thought approach this misclassification rate was reduced to 2.6% of +students. Taken together, these findings suggest that LLMs could be a valuable +tool for grading open-response questions in K-12 mathematics education, +potentially enabling encouraging wider adoption of open-ended questions in +formative assessment. + +
+
+
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ Why Companies "Democratise" Artificial Intelligence: The Case of Open + Source Software Donations + + +
+ Companies claim to "democratise" artificial intelligence (AI) when they +donate AI open source software (OSS) to non-profit foundations or release AI +models, among others, but what does this term mean and why do they do it? As +the impact of AI on society and the economy grows, understanding the commercial +incentives behind AI democratisation efforts is crucial for ensuring these +efforts serve broader interests beyond commercial agendas. Towards this end, +this study employs a mixed-methods approach to investigate commercial +incentives for 43 AI OSS donations to the Linux Foundation. It makes +contributions to both research and practice. It contributes a taxonomy of both +individual and organisational social, economic, and technological incentives +for AI democratisation. In particular, it highlights the role of democratising +the governance and control rights of an OSS project (i.e., from one company to +open governance) as a structural enabler for downstream goals, such as +attracting external contributors, reducing development costs, and influencing +industry standards, among others. Furthermore, OSS donations are often +championed by individual developers within companies, highlighting the +importance of the bottom-up incentives for AI democratisation. The taxonomy +provides a framework and toolkit for discerning incentives for other AI +democratisation efforts, such as the release of AI models. The paper concludes +with a discussion of future research directions. + +
+
+ comment: 30 pages, 1 figure, 5 tables +
+
+
+
+
+ + ☆ DarkSAM: Fooling Segment Anything Model to Segment Nothing NeurIPS'24 + + +
+ Segment Anything Model (SAM) has recently gained much attention for its +outstanding generalization to unseen data and tasks. Despite its promising +prospect, the vulnerabilities of SAM, especially to universal adversarial +perturbation (UAP) have not been thoroughly investigated yet. In this paper, we +propose DarkSAM, the first prompt-free universal attack framework against SAM, +including a semantic decoupling-based spatial attack and a texture +distortion-based frequency attack. We first divide the output of SAM into +foreground and background. Then, we design a shadow target strategy to obtain +the semantic blueprint of the image as the attack target. DarkSAM is dedicated +to fooling SAM by extracting and destroying crucial object features from images +in both spatial and frequency domains. In the spatial domain, we disrupt the +semantics of both the foreground and background in the image to confuse SAM. In +the frequency domain, we further enhance the attack effectiveness by distorting +the high-frequency components (i.e., texture information) of the image. +Consequently, with a single UAP, DarkSAM renders SAM incapable of segmenting +objects across diverse images with varying prompts. Experimental results on +four datasets for SAM and its two variant models demonstrate the powerful +attack capability and transferability of DarkSAM. + +
+
+ comment: This paper has been accepted by the 38th Annual Conference on Neural + Information Processing Systems (NeurIPS'24) +
+
+
+
+
+ + ☆ Efficient Arbitrary Precision Acceleration for Large Language Models on + GPU Tensor Cores + + +
+ Large language models (LLMs) have been widely applied but face challenges in +efficient inference. While quantization methods reduce computational demands, +ultra-low bit quantization with arbitrary precision is hindered by limited GPU +Tensor Core support and inefficient memory management, leading to suboptimal +acceleration. To address these challenges, we propose a comprehensive +acceleration scheme for arbitrary precision LLMs. At its core, we introduce a +novel bipolar-INT data format that facilitates parallel computing and supports +symmetric quantization, effectively reducing data redundancy. Building on this, +we implement an arbitrary precision matrix multiplication scheme that +decomposes and recovers matrices at the bit level, enabling flexible precision +while maximizing GPU Tensor Core utilization. Furthermore, we develop an +efficient matrix preprocessing method that optimizes data layout for subsequent +computations. Finally, we design a data recovery-oriented memory management +system that strategically utilizes fast shared memory, significantly enhancing +kernel execution speed and minimizing memory access latency. Experimental +results demonstrate our approach's effectiveness, with up to 13\times speedup +in matrix multiplication compared to NVIDIA's CUTLASS. When integrated into +LLMs, we achieve up to 6.7\times inference acceleration. These improvements +significantly enhance LLM inference efficiency, enabling broader and more +responsive applications of LLMs. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Machine Learning-based vs Deep Learning-based Anomaly Detection in + Multivariate Time Series for Spacecraft Attitude Sensors + + +
+ In the framework of Failure Detection, Isolation and Recovery (FDIR) on +spacecraft, new AI-based approaches are emerging in the state of the art to +overcome the limitations commonly imposed by traditional threshold checking. + The present research aims at characterizing two different approaches to the +problem of stuck values detection in multivariate time series coming from +spacecraft attitude sensors. The analysis reveals the performance differences +in the two approaches, while commenting on their interpretability and +generalization to different scenarios. + +
+
+ comment: Accepted for the ESA SPAICE Conference 2024 +
+
+
+
+
+ + ☆ Detecting and Measuring Confounding Using Causal Mechanism Shifts + + +
+ Detecting and measuring confounding effects from data is a key challenge in +causal inference. Existing methods frequently assume causal sufficiency, +disregarding the presence of unobserved confounding variables. Causal +sufficiency is both unrealistic and empirically untestable. Additionally, +existing methods make strong parametric assumptions about the underlying causal +generative process to guarantee the identifiability of confounding variables. +Relaxing the causal sufficiency and parametric assumptions and leveraging +recent advancements in causal discovery and confounding analysis with +non-i.i.d. data, we propose a comprehensive approach for detecting and +measuring confounding. We consider various definitions of confounding and +introduce tailored methodologies to achieve three objectives: (i) detecting and +measuring confounding among a set of variables, (ii) separating observed and +unobserved confounding effects, and (iii) understanding the relative strengths +of confounding bias between different sets of variables. We present useful +properties of a confounding measure and present measures that satisfy those +properties. Empirical results support the theoretical analysis. + +
+
+
+
+
+ + ☆ Language Models as Zero-shot Lossless Gradient Compressors: Towards + General Neural Parameter Prior Models NeurIPS 2024 + + +
+ Despite the widespread use of statistical prior models in various fields, +such models for neural network gradients have long been overlooked. The +inherent challenge stems from their high-dimensional structures and complex +interdependencies, which complicate effective modeling. In this work, we +demonstrate the potential of large language models (LLMs) to act as gradient +priors in a zero-shot setting. We examine the property by considering lossless +gradient compression -- a critical application in distributed learning -- that +depends heavily on precise probability modeling. To achieve this, we introduce +LM-GC, a novel method that integrates LLMs with arithmetic coding. Our +technique converts plain gradients into text-like formats, enhancing token +efficiency by up to 38 times compared to their plain representations. We ensure +that this data conversion maintains a close alignment with the structure of +plain gradients and the symbols commonly recognized by LLMs. Our experiments +indicate that LM-GC surpasses existing state-of-the-art lossless compression +methods, improving compression rates by 10\% up to 17.2\% across various +datasets and architectures. Additionally, our approach shows promising +compatibility with lossy compression techniques such as quantization and +sparsification. These findings highlight the significant potential of LLMs as a +model for effectively handling gradients. We will release the source code upon +publication. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ Inference-Time Language Model Alignment via Integrated Value Guidance EMNLP 2024 + + +
+ Large language models are typically fine-tuned to align with human +preferences, but tuning large models is computationally intensive and complex. +In this work, we introduce $\textit{Integrated Value Guidance}$ (IVG), a method +that uses implicit and explicit value functions to guide language model +decoding at token and chunk-level respectively, efficiently aligning large +language models purely at inference time. This approach circumvents the +complexities of direct fine-tuning and outperforms traditional methods. +Empirically, we demonstrate the versatility of IVG across various tasks. In +controlled sentiment generation and summarization tasks, our method +significantly improves the alignment of large models using inference-time +guidance from $\texttt{gpt2}$-based value functions. Moreover, in a more +challenging instruction-following benchmark AlpacaEval 2.0, we show that both +specifically tuned and off-the-shelf value functions greatly improve the +length-controlled win rates of large models against $\texttt{gpt-4-turbo}$ +(e.g., $19.51\% \rightarrow 26.51\%$ for $\texttt{Mistral-7B-Instruct-v0.2}$ +and $25.58\% \rightarrow 33.75\%$ for $\texttt{Mixtral-8x7B-Instruct-v0.1}$ +with Tulu guidance). + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ☆ DREAMS: A python framework to train deep learning models with model card + reporting for medical and health applications + + +
+ Electroencephalography (EEG) data provides a non-invasive method for +researchers and clinicians to observe brain activity in real time. The +integration of deep learning techniques with EEG data has significantly +improved the ability to identify meaningful patterns, leading to valuable +insights for both clinical and research purposes. However, most of the +frameworks so far, designed for EEG data analysis, are either too focused on +pre-processing or in deep learning methods per, making their use for both +clinician and developer communities problematic. Moreover, critical issues such +as ethical considerations, biases, uncertainties, and the limitations inherent +in AI models for EEG data analysis are frequently overlooked, posing challenges +to the responsible implementation of these technologies. In this paper, we +introduce a comprehensive deep learning framework tailored for EEG data +processing, model training and report generation. While constructed in way to +be adapted and developed further by AI developers, it enables to report, +through model cards, the outcome and specific information of use for both +developers and clinicians. In this way, we discuss how this framework can, in +the future, provide clinical researchers and developers with the tools needed +to create transparent and accountable AI models for EEG data analysis and +diagnosis. + +
+
+
+
+
+ + ☆ Self-supervised Preference Optimization: Enhance Your Language Model + with Preference Degree Awareness EMNLP 2024 + + +
+ Recently, there has been significant interest in replacing the reward model +in Reinforcement Learning with Human Feedback (RLHF) methods for Large Language +Models (LLMs), such as Direct Preference Optimization (DPO) and its variants. +These approaches commonly use a binary cross-entropy mechanism on pairwise +samples, i.e., minimizing and maximizing the loss based on preferred or +dis-preferred responses, respectively. However, while this training strategy +omits the reward model, it also overlooks the varying preference degrees within +different responses. We hypothesize that this is a key factor hindering LLMs +from sufficiently understanding human preferences. To address this problem, we +propose a novel Self-supervised Preference Optimization (SPO) framework, which +constructs a self-supervised preference degree loss combined with the alignment +loss, thereby helping LLMs improve their ability to understand the degree of +preference. Extensive experiments are conducted on two widely used datasets of +different tasks. The results demonstrate that SPO can be seamlessly integrated +with existing preference optimization methods and significantly boost their +performance to achieve state-of-the-art performance. We also conduct detailed +analyses to offer comprehensive insights into SPO, which verifies its +effectiveness. The code is available at https://github.com/lijian16/SPO. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Ophthalmic Biomarker Detection with Parallel Prediction of Transformer + and Convolutional Architecture + + +
+ Ophthalmic diseases represent a significant global health issue, +necessitating the use of advanced precise diagnostic tools. Optical Coherence +Tomography (OCT) imagery which offers high-resolution cross-sectional images of +the retina has become a pivotal imaging modality in ophthalmology. +Traditionally physicians have manually detected various diseases and biomarkers +from such diagnostic imagery. In recent times, deep learning techniques have +been extensively used for medical diagnostic tasks enabling fast and precise +diagnosis. This paper presents a novel approach for ophthalmic biomarker +detection using an ensemble of Convolutional Neural Network (CNN) and Vision +Transformer. While CNNs are good for feature extraction within the local +context of the image, transformers are known for their ability to extract +features from the global context of the image. Using an ensemble of both +techniques allows us to harness the best of both worlds. Our method has been +implemented on the OLIVES dataset to detect 6 major biomarkers from the OCT +images and shows significant improvement of the macro averaged F1 score on the +dataset. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Harnessing Shared Relations via Multimodal Mixup Contrastive Learning + for Multimodal Classification + + +
+ Deep multimodal learning has shown remarkable success by leveraging +contrastive learning to capture explicit one-to-one relations across +modalities. However, real-world data often exhibits shared relations beyond +simple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive +Learning approach to capture nuanced shared relations inherent in multimodal +data. Our key contribution is a Mixup-based contrastive loss that learns robust +representations by aligning mixed samples from one modality with their +corresponding samples from other modalities thereby capturing shared relations +between them. For multimodal classification tasks, we introduce a framework +that integrates a fusion module with unimodal prediction modules for auxiliary +supervision during training, complemented by our proposed Mixup-based +contrastive loss. Through extensive experiments on diverse datasets (N24News, +ROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures +shared multimodal relations and generalizes across domains. It outperforms +state-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving +comparable performance on Food-101. Our work highlights the significance of +learning shared relations for robust multimodal learning, opening up promising +avenues for future research. + +
+
+ comment: RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9 + Tables +
+
+
+
+
+ + ☆ Faithfulness and the Notion of Adversarial Sensitivity in NLP + Explanations EMNLP 2024 + + +
+ Faithfulness is arguably the most critical metric to assess the reliability +of explainable AI. In NLP, current methods for faithfulness evaluation are +fraught with discrepancies and biases, often failing to capture the true +reasoning of models. We introduce Adversarial Sensitivity as a novel approach +to faithfulness evaluation, focusing on the explainer's response when the model +is under adversarial attack. Our method accounts for the faithfulness of +explainers by capturing sensitivity to adversarial input changes. This work +addresses significant limitations in existing evaluation techniques, and +furthermore, quantifies faithfulness from a crucial yet underexplored paradigm. + +
+
+ comment: Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP +
+
+
+
+
+ + ☆ Federated Learning under Attack: Improving Gradient Inversion for Batch + of Images + + +
+ Federated Learning (FL) has emerged as a machine learning approach able to +preserve the privacy of user's data. Applying FL, clients train machine +learning models on a local dataset and a central server aggregates the learned +parameters coming from the clients, training a global machine learning model +without sharing user's data. However, the state-of-the-art shows several +approaches to promote attacks on FL systems. For instance, inverting or leaking +gradient attacks can find, with high precision, the local dataset used during +the training phase of the FL. This paper presents an approach, called Deep +Leakage from Gradients with Feedback Blending (DLG-FB), which is able to +improve the inverting gradient attack, considering the spatial correlation that +typically exists in batches of images. The performed evaluation shows an +improvement of 19.18% and 48,82% in terms of attack success rate and the number +of iterations per attacked image, respectively. + +
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ Integrating Hierarchical Semantic into Iterative Generation Model for + Entailment Tree Explanation + + +
+ Manifestly and logically displaying the line of reasoning from evidence to +answer is significant to explainable question answering (QA). The entailment +tree exhibits the lines structurally, which is different from the +self-explanation principle in large-scale language models. Existing methods +rarely consider the semantic association of sentences between and within +hierarchies within the tree structure, which is prone to apparent mistakes in +combinations. In this work, we propose an architecture of integrating the +Hierarchical Semantics of sentences under the framework of Controller-Generator +(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between +hypotheses and facts, discriminates the facts involved in tree constructions, +and optimizes single-step entailments. To the best of our knowledge, We are the +first to notice hierarchical semantics of sentences between the same layer and +adjacent layers to yield improvements. The proposed method achieves comparable +performance on all three settings of the EntailmentBank dataset. The +generalization results on two out-of-domain datasets also demonstrate the +effectiveness of our method. + +
+
+
+
+
+ + ☆ SECURE: Semantics-aware Embodied Conversation under Unawareness for + Lifelong Robot Learning + + +
+ This paper addresses a challenging interactive task learning scenario we call +rearrangement under unawareness: to manipulate a rigid-body environment in a +context where the robot is unaware of a concept that's key to solving the +instructed task. We propose SECURE, an interactive task learning framework +designed to solve such problems by fixing a deficient domain model using +embodied conversation. Through dialogue, the robot discovers and then learns to +exploit unforeseen possibilities. Using SECURE, the robot not only learns from +the user's corrective feedback when it makes a mistake, but it also learns to +make strategic dialogue decisions for revealing useful evidence about novel +concepts for solving the instructed task. Together, these abilities allow the +robot to generalise to subsequent tasks using newly acquired knowledge. We +demonstrate that a robot that is semantics-aware -- that is, it exploits the +logical consequences of both sentence and discourse semantics in the learning +and inference process -- learns to solve rearrangement under unawareness more +effectively than a robot that lacks such capabilities. + +
+
+ comment: 10 pages,4 figures, 2 tables +
+
+
+
+
+ + ☆ Byzantine-Robust Aggregation for Securing Decentralized Federated + Learning + + +
+ Federated Learning (FL) emerges as a distributed machine learning approach +that addresses privacy concerns by training AI models locally on devices. +Decentralized Federated Learning (DFL) extends the FL paradigm by eliminating +the central server, thereby enhancing scalability and robustness through the +avoidance of a single point of failure. However, DFL faces significant +challenges in optimizing security, as most Byzantine-robust algorithms proposed +in the literature are designed for centralized scenarios. In this paper, we +present a novel Byzantine-robust aggregation algorithm to enhance the security +of Decentralized Federated Learning environments, coined WFAgg. This proposal +handles the adverse conditions and strength robustness of dynamic decentralized +topologies at the same time by employing multiple filters to identify and +mitigate Byzantine attacks. Experimental results demonstrate the effectiveness +of the proposed algorithm in maintaining model accuracy and convergence in the +presence of various Byzantine attack scenarios, outperforming state-of-the-art +centralized Byzantine-robust aggregation schemes (such as Multi-Krum or +Clustering). These algorithms are evaluated on an IID image classification +problem in both centralized and decentralized scenarios. + +
+
+ comment: 18 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with + Alternative Modality Masking NeurIPS 2024 + + +
+ Camera-LiDAR fusion models significantly enhance perception performance in +autonomous driving. The fusion mechanism leverages the strengths of each +modality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR +fusion models utilize pre-trained backbones for efficient training. However, we +argue that directly loading single-modal pre-trained camera and LiDAR backbones +into camera-LiDAR fusion models introduces similar feature redundancy across +modalities due to the nature of the fusion mechanism. Unfortunately, existing +pruning methods are developed explicitly for single-modal models, and thus, +they struggle to effectively identify these specific redundant parameters in +camera-LiDAR fusion models. In this paper, to address the issue above on +camera-LiDAR fusion models, we propose a novelty pruning framework Alternative +Modality Masking Pruning (AlterMOMA), which employs alternative masking on each +modality and identifies the redundant parameters. Specifically, when one +modality parameters are masked (deactivated), the absence of features from the +masked backbone compels the model to reactivate previous redundant features of +the other modality backbone. Therefore, these redundant features and relevant +redundant parameters can be identified via the reactivation process. The +redundant parameters can be pruned by our proposed importance score evaluation +function, Alternative Evaluation (AlterEva), which is based on the observation +of the loss changes when certain modality parameters are activated and +deactivated. Extensive experiments on the nuScene and KITTI datasets +encompassing diverse tasks, baseline models, and pruning algorithms showcase +that AlterMOMA outperforms existing pruning methods, attaining state-of-the-art +performance. + +
+
+ comment: 17 pages, 3 figures, Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Episodic Memory Verbalization using Hierarchical Representations of + Life-Long Robot Experience + + +
+ Verbalization of robot experience, i.e., summarization of and question +answering about a robot's past, is a crucial ability for improving human-robot +interaction. Previous works applied rule-based systems or fine-tuned deep +models to verbalize short (several-minute-long) streams of episodic data, +limiting generalization and transferability. In our work, we apply large +pretrained models to tackle this task with zero or few examples, and +specifically focus on verbalizing life-long experiences. For this, we derive a +tree-like data structure from episodic memory (EM), with lower levels +representing raw perception and proprioception data, and higher levels +abstracting events to natural language concepts. Given such a hierarchical +representation built from the experience stream, we apply a large language +model as an agent to interactively search the EM given a user's query, +dynamically expanding (initially collapsed) tree nodes to find the relevant +information. The approach keeps computational costs low even when scaling to +months of robot experience data. We evaluate our method on simulated household +robot data, human egocentric videos, and real-world robot recordings, +demonstrating its flexibility and scalability. + +
+
+ comment: Code, data and demo videos at https://hierarchical-emv.github.io +
+
+
+
+
+ + ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ☆ The application of GPT-4 in grading design university students' + assignment and providing feedback: An exploratory study + + +
+ This study aims to investigate whether GPT-4 can effectively grade +assignments for design university students and provide useful feedback. In +design education, assignments do not have a single correct answer and often +involve solving an open-ended design problem. This subjective nature of design +projects often leads to grading problems,as grades can vary between different +raters,for instance instructor from engineering background or architecture +background. This study employs an iterative research approach in developing a +Custom GPT with the aim of achieving more reliable results and testing whether +it can provide design students with constructive feedback. The findings +include: First,through several rounds of iterations the inter-reliability +between GPT and human raters reached a level that is generally accepted by +educators. This indicates that by providing accurate prompts to GPT,and +continuously iterating to build a Custom GPT, it can be used to effectively +grade students' design assignments, serving as a reliable complement to human +raters. Second, the intra-reliability of GPT's scoring at different times is +between 0.65 and 0.78. This indicates that, with adequate instructions, a +Custom GPT gives consistent results which is a precondition for grading +students. As consistency and comparability are the two main rules to ensure the +reliability of educational assessment, this study has looked at whether a +Custom GPT can be developed that adheres to these two rules. We finish the +paper by testing whether Custom GPT can provide students with useful feedback +and reflecting on how educators can develop and iterate a Custom GPT to serve +as a complementary rater. + +
+
+ comment: 25 pages, 5 figures +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Efficient Bias Mitigation Without Privileged Information ECCV + 2024 + + +
+ Deep neural networks trained via empirical risk minimisation often exhibit +significant performance disparities across groups, particularly when group and +task labels are spuriously correlated (e.g., "grassy background" and "cows"). +Existing bias mitigation methods that aim to address this issue often either +rely on group labels for training or validation, or require an extensive +hyperparameter search. Such data and computational requirements hinder the +practical deployment of these methods, especially when datasets are too large +to be group-annotated, computational resources are limited, and models are +trained through already complex pipelines. In this paper, we propose Targeted +Augmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework +that leverages the entire training history of a helper model to identify +spurious samples, and generate a group-balanced training set from which a +robust model can be trained. We show that TAB improves worst-group performance +without any group information or model selection, outperforming existing +methods while maintaining overall accuracy. + +
+
+ comment: Accepted at the 18th European Conference on Computer Vision (ECCV + 2024) as an Oral presentation +
+
+
+
+
+ + ☆ Graph Edit Distance with General Costs Using Neural Set Divergence NeurIPS 2024 + + +
+ Graph Edit Distance (GED) measures the (dis-)similarity between two given +graphs, in terms of the minimum-cost edit sequence that transforms one graph to +the other. However, the exact computation of GED is NP-Hard, which has recently +motivated the design of neural methods for GED estimation. However, they do not +explicitly account for edit operations with different costs. In response, we +propose GRAPHEDX, a neural GED estimator that can work with general costs +specified for the four edit operations, viz., edge deletion, edge addition, +node deletion and node addition. We first present GED as a quadratic assignment +problem (QAP) that incorporates these four costs. Then, we represent each graph +as a set of node and edge embeddings and use them to design a family of neural +set divergence surrogates. We replace the QAP terms corresponding to each +operation with their surrogates. Computing such neural set divergence require +aligning nodes and edges of the two graphs. We learn these alignments using a +Gumbel-Sinkhorn permutation generator, additionally ensuring that the node and +edge alignments are consistent with each other. Moreover, these alignments are +cognizant of both the presence and absence of edges between node-pairs. +Experiments on several datasets, under a variety of edit cost settings, show +that GRAPHEDX consistently outperforms state-of-the-art methods and heuristics +in terms of prediction error. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ Artificial Data Point Generation in Clustered Latent Space for Small + Medical Datasets + + +
+ One of the growing trends in machine learning is the use of data generation +techniques, since the performance of machine learning models is dependent on +the quantity of the training dataset. However, in many medical applications, +collecting large datasets is challenging due to resource constraints, which +leads to overfitting and poor generalization. This paper introduces a novel +method, Artificial Data Point Generation in Clustered Latent Space (AGCL), +designed to enhance classification performance on small medical datasets +through synthetic data generation. The AGCL framework involves feature +extraction, K-means clustering, cluster evaluation based on a class separation +metric, and the generation of synthetic data points from clusters with distinct +class representations. This method was applied to Parkinson's disease +screening, utilizing facial expression data, and evaluated across multiple +machine learning classifiers. Experimental results demonstrate that AGCL +significantly improves classification accuracy compared to baseline, GN and +kNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and +cross-validation accuracy of 90.90% in majority voting over different emotions, +confirming its effectiveness in augmenting small datasets. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Preserving logical and functional dependencies in synthetic tabular data + + +
+ Dependencies among attributes are a common aspect of tabular data. However, +whether existing tabular data generation algorithms preserve these dependencies +while generating synthetic data is yet to be explored. In addition to the +existing notion of functional dependencies, we introduce the notion of logical +dependencies among the attributes in this article. Moreover, we provide a +measure to quantify logical dependencies among attributes in tabular data. +Utilizing this measure, we compare several state-of-the-art synthetic data +generation algorithms and test their capability to preserve logical and +functional dependencies on several publicly available datasets. We demonstrate +that currently available synthetic tabular data generation algorithms do not +fully preserve functional dependencies when they generate synthetic datasets. +In addition, we also showed that some tabular synthetic data generation models +can preserve inter-attribute logical dependencies. Our review and comparison of +the state-of-the-art reveal research needs and opportunities to develop +task-specific synthetic tabular data generation models. + +
+
+ comment: Submitted to Pattern Recognition Journal +
+
+
+
+
+ + ☆ Zero- and Few-shot Named Entity Recognition and Text Expansion in + Medication Prescriptions using ChatGPT + + +
+ Introduction: Medication prescriptions are often in free text and include a +mix of two languages, local brand names, and a wide range of idiosyncratic +formats and abbreviations. Large language models (LLMs) have shown promising +ability to generate text in response to input prompts. We use ChatGPT 3.5 to +automatically structure and expand medication statements in discharge summaries +and thus make them easier to interpret for people and machines. Methods: +Named-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and +few-shot setting with different prompt strategies. 100 medication statements +were manually annotated and curated. NER performance was measured by using +strict and partial matching. For the task EX, two experts interpreted the +results by assessing semantic equivalence between original and expanded +statements. The model performance was measured by precision, recall, and F1 +score. Results: For NER, the best-performing prompt reached an average F1 score +of 0.94 in the test set. For EX, the few-shot prompt showed superior +performance among other prompts, with an average F1 score of 0.87. Conclusion: +Our study demonstrates good performance for NER and EX tasks in free-text +medication statements using ChatGPT. Compared to a zero-shot baseline, a +few-shot approach prevented the system from hallucinating, which would be +unacceptable when processing safety-relevant medication data. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ A Fuzzy-based Approach to Predict Human Interaction by Functional + Near-Infrared Spectroscopy + + +
+ The paper introduces a Fuzzy-based Attention (Fuzzy Attention Layer) +mechanism, a novel computational approach to enhance the interpretability and +efficacy of neural models in psychological research. The proposed Fuzzy +Attention Layer mechanism is integrated as a neural network layer within the +Transformer Encoder model to facilitate the analysis of complex psychological +phenomena through neural signals, such as those captured by functional +Near-Infrared Spectroscopy (fNIRS). By leveraging fuzzy logic, the Fuzzy +Attention Layer is capable of learning and identifying interpretable patterns +of neural activity. This capability addresses a significant challenge when +using Transformer: the lack of transparency in determining which specific brain +activities most contribute to particular predictions. Our experimental results +demonstrated on fNIRS data from subjects engaged in social interactions +involving handholding reveal that the Fuzzy Attention Layer not only learns +interpretable patterns of neural activity but also enhances model performance. +Additionally, the learned patterns provide deeper insights into the neural +correlates of interpersonal touch and emotional exchange. The application of +our model shows promising potential in deciphering the subtle complexities of +human social behaviors, thereby contributing significantly to the fields of +social neuroscience and psychological AI. + +
+
+
+
+
+ + ☆ Hierarchical End-to-End Autonomous Driving: Integrating BEV Perception + with Deep Reinforcement Learning + + +
+ End-to-end autonomous driving offers a streamlined alternative to the +traditional modular pipeline, integrating perception, prediction, and planning +within a single framework. While Deep Reinforcement Learning (DRL) has recently +gained traction in this domain, existing approaches often overlook the critical +connection between feature extraction of DRL and perception. In this paper, we +bridge this gap by mapping the DRL feature extraction network directly to the +perception phase, enabling clearer interpretation through semantic +segmentation. By leveraging Bird's-Eye-View (BEV) representations, we propose a +novel DRL-based end-to-end driving framework that utilizes multi-sensor inputs +to construct a unified three-dimensional understanding of the environment. This +BEV-based system extracts and translates critical environmental features into +high-level abstract states for DRL, facilitating more informed control. +Extensive experimental evaluations demonstrate that our approach not only +enhances interpretability but also significantly outperforms state-of-the-art +methods in autonomous driving control tasks, reducing the collision rate by +20%. + +
+
+
+
+
+ + ☆ Prototype based Masked Audio Model for Self-Supervised Learning of Sound + Event Detection ICASSP2025 + + +
+ A significant challenge in sound event detection (SED) is the effective +utilization of unlabeled data, given the limited availability of labeled data +due to high annotation costs. Semi-supervised algorithms rely on labeled data +to learn from unlabeled data, and the performance is constrained by the quality +and size of the former. In this paper, we introduce the Prototype based Masked +Audio Model~(PMAM) algorithm for self-supervised representation learning in +SED, to better exploit unlabeled data. Specifically, semantically rich +frame-level pseudo labels are constructed from a Gaussian mixture model (GMM) +based prototypical distribution modeling. These pseudo labels supervise the +learning of a Transformer-based masked audio model, in which binary +cross-entropy loss is employed instead of the widely used InfoNCE loss, to +provide independent loss contributions from different prototypes, which is +important in real scenarios in which multiple labels may apply to unsupervised +data frames. A final stage of fine-tuning with just a small amount of labeled +data yields a very high performing SED model. On like-for-like tests using the +DESED task, our method achieves a PSDS1 score of 62.5\%, surpassing current +state-of-the-art models and demonstrating the superiority of the proposed +technique. + +
+
+ comment: Submitted to ICASSP2025; The code for this paper will be available at + https://github.com/cai525/Transformer4SED after the paper is accepted +
+
+
+
+
+ + ☆ AssistantX: An LLM-Powered Proactive Assistant in Collaborative + Human-Populated Environment + + +
+ The increasing demand for intelligent assistants in human-populated +environments has motivated significant research in autonomous robotic systems. +Traditional service robots and virtual assistants, however, struggle with +real-world task execution due to their limited capacity for dynamic reasoning +and interaction, particularly when human collaboration is required. Recent +developments in Large Language Models have opened new avenues for improving +these systems, enabling more sophisticated reasoning and natural interaction +capabilities. In this paper, we introduce AssistantX, an LLM-powered proactive +assistant designed to operate autonomously in a physical office environment. +Unlike conventional service robots, AssistantX leverages a novel multi-agent +architecture, PPDR4X, which provides advanced inference capabilities and +comprehensive collaboration awareness. By effectively bridging the gap between +virtual operations and physical interactions, AssistantX demonstrates robust +performance in managing complex real-world scenarios. Our evaluation highlights +the architecture's effectiveness, showing that AssistantX can respond to clear +instructions, actively retrieve supplementary information from memory, and +proactively seek collaboration from team members to ensure successful task +completion. More details and videos can be found at +https://assistantx-agent.github.io/AssistantX/. + +
+
+ comment: 6 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ FactorSim: Generative Simulation via Factorized Representation + + +
+ Generating simulations to train intelligent agents in game-playing and +robotics from natural language input, from user input or task documentation, +remains an open-ended challenge. Existing approaches focus on parts of this +challenge, such as generating reward functions or task hyperparameters. Unlike +previous work, we introduce FACTORSIM that generates full simulations in code +from language input that can be used to train agents. Exploiting the structural +modularity specific to coded simulations, we propose to use a factored +partially observable Markov decision process representation that allows us to +reduce context dependence during each step of the generation. For evaluation, +we introduce a generative simulation benchmark that assesses the generated +simulation code's accuracy and effectiveness in facilitating zero-shot +transfers in reinforcement learning settings. We show that FACTORSIM +outperforms existing methods in generating simulations regarding prompt +alignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation. +We also demonstrate its effectiveness in generating robotic tasks. + +
+
+ comment: neurips 2024, project website: + https://cs.stanford.edu/~sunfanyun/factorsim/ +
+
+
+
+
+ + ☆ Digital Twin Ecosystem for Oncology Clinical Operations + + +
+ Artificial Intelligence (AI) and Large Language Models (LLMs) hold +significant promise in revolutionizing healthcare, especially in clinical +applications. Simultaneously, Digital Twin technology, which models and +simulates complex systems, has gained traction in enhancing patient care. +However, despite the advances in experimental clinical settings, the potential +of AI and digital twins to streamline clinical operations remains largely +untapped. This paper introduces a novel digital twin framework specifically +designed to enhance oncology clinical operations. We propose the integration of +multiple specialized digital twins, such as the Medical Necessity Twin, Care +Navigator Twin, and Clinical History Twin, to enhance workflow efficiency and +personalize care for each patient based on their unique data. Furthermore, by +synthesizing multiple data sources and aligning them with the National +Comprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care +Path, a continuously evolving knowledge base that enables these digital twins +to provide precise, tailored clinical recommendations. + +
+
+ comment: Pre Print +
+
+
+
+
+ + ☆ AI Delegates with a Dual Focus: Ensuring Privacy and Strategic + Self-Disclosure + + +
+ Large language model (LLM)-based AI delegates are increasingly utilized to +act on behalf of users, assisting them with a wide range of tasks through +conversational interfaces. Despite their advantages, concerns arise regarding +the potential risk of privacy leaks, particularly in scenarios involving social +interactions. While existing research has focused on protecting privacy by +limiting the access of AI delegates to sensitive user information, many social +scenarios require disclosing private details to achieve desired outcomes, +necessitating a balance between privacy protection and disclosure. To address +this challenge, we conduct a pilot study to investigate user preferences for AI +delegates across various social relations and task scenarios, and then propose +a novel AI delegate system that enables privacy-conscious self-disclosure. Our +user study demonstrates that the proposed AI delegate strategically protects +privacy, pioneering its use in diverse and dynamic social interactions. + +
+
+
+
+
+ + ☆ T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training + on an Assistant Task for a Target Task + + +
+ Long text summarization, gradually being essential for efficiently processing +large volumes of information, stays challenging for Large Language Models +(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced +training datasets and the high requirement of contextual details dealing. To +address the issue, we design a novel zero-shot transfer learning framework, +abbreviated as T3, to iteratively training a baseline LLM on an assistant task +for the target task, where the former should own richer data resources and +share structural or semantic similarity with the latter. In practice, T3 is +approached to deal with the long text summarization task by utilizing question +answering as the assistant task, and further validated its effectiveness on the +BBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14% +improvement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore +compared to three baseline LLMs, demonstrating its potential for more +assistant-target task combinations. + +
+
+
+
+
+ + ☆ P4Q: Learning to Prompt for Quantization in Visual-language Models + + +
+ Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence +in various visual and multimodal tasks, yet the deployment of VLMs on +downstream application platforms remains challenging due to their prohibitive +requirements of training samples and computing resources. Fine-tuning and +quantization of VLMs can substantially reduce the sample and computation costs, +which are in urgent need. There are two prevailing paradigms in quantization, +Quantization-Aware Training (QAT) can effectively quantize large-scale VLMs but +incur a huge training cost, while low-bit Post-Training Quantization (PTQ) +suffers from a notable performance drop. We propose a method that balances +fine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which +we design a lightweight architecture to leverage contrastive loss supervision +to enhance the recognition performance of a PTQ model. Our method can +effectively reduce the gap between image features and text features caused by +low-bit quantization, based on learnable prompts to reorganize textual +representations and a low-bit adapter to realign the distributions of image and +text features. We also introduce a distillation loss based on cosine similarity +predictions to distill the quantized model using a full-precision teacher. +Extensive experimental results demonstrate that our P4Q method outperforms +prior arts, even achieving comparable results to its full-precision +counterparts. For instance, our 8-bit P4Q can theoretically compress the +CLIP-ViT/B-32 by 4 $\times$ while achieving 66.94\% Top-1 accuracy, +outperforming the learnable prompt fine-tuned full-precision model by 2.24\% +with negligible additional parameters on the ImageNet dataset. + +
+
+
+
+
+ + ☆ Hand-object reconstruction via interaction-aware graph attention + mechanism ICIP 2024 + + +
+ Estimating the poses of both a hand and an object has become an important +area of research due to the growing need for advanced vision computing. The +primary challenge involves understanding and reconstructing how hands and +objects interact, such as contact and physical plausibility. Existing +approaches often adopt a graph neural network to incorporate spatial +information of hand and object meshes. However, these approaches have not fully +exploited the potential of graphs without modification of edges within and +between hand- and object-graphs. We propose a graph-based refinement method +that incorporates an interaction-aware graph-attention mechanism to account for +hand-object interactions. Using edges, we establish connections among closely +correlated nodes, both within individual graphs and across different graphs. +Experiments demonstrate the effectiveness of our proposed method with notable +improvements in the realm of physical plausibility. + +
+
+ comment: 7 pages, Accepted by ICIP 2024 +
+
+
+
+
+ + ☆ Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric + GNNs NeurIPS 2024 + + +
+ Geometric graph neural networks (GNNs) have emerged as powerful tools for +modeling molecular geometry. However, they encounter limitations in effectively +capturing long-range interactions in large molecular systems. To address this +challenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs +to expand the scope of their capabilities by incorporating mesh points +alongside atoms and reimaging traditional mathematical operations in a +trainable manner. Neural P$^3$M exhibits flexibility across a wide range of +molecular systems and demonstrates remarkable accuracy in predicting energies +and forces, outperforming on benchmarks such as the MD22 dataset. It also +achieves an average improvement of 22% on the OE62 dataset while integrating +with various architectures. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ☆ Dirichlet-Based Coarse-to-Fine Example Selection For Open-Set Annotation + + +
+ Active learning (AL) has achieved great success by selecting the most +valuable examples from unlabeled data. However, they usually deteriorate in +real scenarios where open-set noise gets involved, which is studied as open-set +annotation (OSA). In this paper, we owe the deterioration to the unreliable +predictions arising from softmax-based translation invariance and propose a +Dirichlet-based Coarse-to-Fine Example Selection (DCFS) strategy accordingly. +Our method introduces simplex-based evidential deep learning (EDL) to break +translation invariance and distinguish known and unknown classes by considering +evidence-based data and distribution uncertainty simultaneously. Furthermore, +hard known-class examples are identified by model discrepancy generated from +two classifier heads, where we amplify and alleviate the model discrepancy +respectively for unknown and known classes. Finally, we combine the discrepancy +with uncertainties to form a two-stage strategy, selecting the most informative +examples from known classes. Extensive experiments on various openness ratio +datasets demonstrate that DCFS achieves state-of-art performance. + +
+
+
+
+
+ + ☆ Open Digital Rights Enforcement Framework (ODRE): from descriptive to + enforceable policies + + +
+ From centralised platforms to decentralised ecosystems, like Data Spaces, +sharing data has become a paramount challenge. For this reason, the definition +of data usage policies has become crucial in these domains, highlighting the +necessity of effective policy enforcement mechanisms. The Open Digital Rights +Language (ODRL) is a W3C standard ontology designed to describe data usage +policies, however, it lacks built-in enforcement capabilities, limiting its +practical application. This paper introduces the Open Digital Rights +Enforcement (ODRE) framework, whose goal is to provide ODRL with enforcement +capabilities. The ODRE framework proposes a novel approach to express ODRL +policies that integrates the descriptive ontology terms of ODRL with other +languages that allow behaviour specification, such as dynamic data handling or +function evaluation. The framework includes an enforcement algorithm for ODRL +policies and two open-source implementations in Python and Java. The ODRE +framework is also designed to support future extensions of ODRL to specific +domain scenarios. In addition, current limitations of ODRE, ODRL, and current +challenges are reported. Finally, to demonstrate the enforcement capabilities +of the implementations, their performance, and their extensibility features, +several experiments have been carried out with positive results. + +
+
+ comment: 20 pages, 3 Figures, Submitted to Computers & Security journal +
+
+
+
+
+ + ☆ TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for + Multimodal Contrastive Learning + + +
+ Pre-trained large models for multimodal contrastive learning, such as CLIP, +have been widely recognized in the industry as highly susceptible to +data-poisoned backdoor attacks. This poses significant risks to downstream +model training. In response to such potential threats, finetuning offers a +simpler and more efficient defense choice compared to retraining large models +with augmented data. In the supervised learning domain, fine-tuning defense +strategies can achieve excellent defense performance. However, in the +unsupervised and semi-supervised domain, we find that when CLIP faces some +complex attack techniques, the existing fine-tuning defense strategy, +CleanCLIP, has some limitations on defense performance. The synonym +substitution of its text-augmentation is insufficient to enhance the text +feature space. To compensate for this weakness, we improve it by proposing a +fine-grained \textbf{T}ext \textbf{A}lignment \textbf{C}leaner (TA-Cleaner) to +cut off feature connections of backdoor triggers. We randomly select a few +samples for positive and negative subtext generation at each epoch of +CleanCLIP, and align the subtexts to the images to strengthen the text +self-supervision. We evaluate the effectiveness of our TA-Cleaner against six +attack algorithms and conduct comprehensive zero-shot classification tests on +ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves +state-of-the-art defensiveness among finetuning-based defense techniques. Even +when faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms +CleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\% and 63.88\%, +respectively. + +
+
+
+
+
+ + ☆ Subjective and Objective Quality-of-Experience Evaluation Study for Live + Video Streaming + + +
+ In recent years, live video streaming has gained widespread popularity across +various social media platforms. Quality of experience (QoE), which reflects +end-users' satisfaction and overall experience, plays a critical role for media +service providers to optimize large-scale live compression and transmission +strategies to achieve perceptually optimal rate-distortion trade-off. Although +many QoE metrics for video-on-demand (VoD) have been proposed, there remain +significant challenges in developing QoE metrics for live video streaming. To +bridge this gap, we conduct a comprehensive study of subjective and objective +QoE evaluations for live video streaming. For the subjective QoE study, we +introduce the first live video streaming QoE dataset, TaoLive QoE, which +consists of $42$ source videos collected from real live broadcasts and $1,155$ +corresponding distorted ones degraded due to a variety of streaming +distortions, including conventional streaming distortions such as compression, +stalling, as well as live streaming-specific distortions like frame skipping, +variable frame rate, etc. Subsequently, a human study was conducted to derive +subjective QoE scores of videos in the TaoLive QoE dataset. For the objective +QoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well +as publicly available QoE datasets for VoD scenarios, highlighting that current +models struggle to accurately assess video QoE, particularly for live content. +Hence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates +multi-scale semantic features and optical flow-based motion features to +predicting a retrospective QoE score, eliminating reliance on statistical +quality of service (QoS) features. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Deep Manifold Part 1: Anatomy of Neural Network Manifold + + +
+ Based on the numerical manifold method principle, we developed a mathematical +framework of a neural network manifold: Deep Manifold and discovered that +neural networks: 1) is numerical computation combining forward and inverse; 2) +have near infinite degrees of freedom; 3) exponential learning capacity with +depth; 4) have self-progressing boundary conditions; 5) has training hidden +bottleneck. We also define two concepts: neural network learning space and deep +manifold space and introduce two concepts: neural network intrinsic pathway and +fixed point. We raise three fundamental questions: 1). What is the training +completion definition; 2). where is the deep learning convergence point (neural +network fixed point); 3). How important is token timestamp in training data +given negative time is critical in inverse problem. + +
+
+
+
+
+ + ☆ Improving Fast Adversarial Training via Self-Knowledge Guidance + + +
+ Adversarial training has achieved remarkable advancements in defending +against adversarial attacks. Among them, fast adversarial training (FAT) is +gaining attention for its ability to achieve competitive robustness with fewer +computing resources. Existing FAT methods typically employ a uniform strategy +that optimizes all training data equally without considering the influence of +different examples, which leads to an imbalanced optimization. However, this +imbalance remains unexplored in the field of FAT. In this paper, we conduct a +comprehensive study of the imbalance issue in FAT and observe an obvious class +disparity regarding their performances. This disparity could be embodied from a +perspective of alignment between clean and robust accuracy. Based on the +analysis, we mainly attribute the observed misalignment and disparity to the +imbalanced optimization in FAT, which motivates us to optimize different +training data adaptively to enhance robustness. Specifically, we take disparity +and misalignment into consideration. First, we introduce self-knowledge guided +regularization, which assigns differentiated regularization weights to each +class based on its training state, alleviating class disparity. Additionally, +we propose self-knowledge guided label relaxation, which adjusts label +relaxation according to the training accuracy, alleviating the misalignment and +improving robustness. By combining these methods, we formulate the +Self-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge +during training to enhance the adversarial robustness without compromising +training efficiency. Extensive experiments on four standard datasets +demonstrate that the SKG-FAT improves the robustness and preserves competitive +clean accuracy, outperforming the state-of-the-art methods. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Multimodal Banking Dataset: Understanding Client Needs through Event + Sequences + + +
+ Financial organizations collect a huge amount of data about clients that +typically has a temporal (sequential) structure and is collected from various +sources (modalities). Due to privacy issues, there are no large-scale +open-source multimodal datasets of event sequences, which significantly limits +the research in this area. In this paper, we present the industrial-scale +publicly available multimodal banking dataset, MBD, that contains more than +1.5M corporate clients with several modalities: 950M bank transactions, 1B geo +position events, 5M embeddings of dialogues with technical support and monthly +aggregated purchases of four bank's products. All entries are properly +anonymized from real proprietary bank data. Using this dataset, we introduce a +novel benchmark with two business tasks: campaigning (purchase prediction in +the next month) and matching of clients. We provide numerical results that +demonstrate the superiority of our multi-modal baselines over single-modal +techniques for each task. As a result, the proposed dataset can open new +perspectives and facilitate the future development of practically important +large-scale multimodal algorithms for event sequences. + HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD + Github Link: https://github.com/Dzhambo/MBD + +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ A Scalable Data-Driven Framework for Systematic Analysis of SEC 10-K + Filings Using Large Language Models + + +
+ The number of companies listed on the NYSE has been growing exponentially, +creating a significant challenge for market analysts, traders, and stockholders +who must monitor and assess the performance and strategic shifts of a large +number of companies regularly. There is an increasing need for a fast, +cost-effective, and comprehensive method to evaluate the performance and detect +and compare many companies' strategy changes efficiently. We propose a novel +data-driven approach that leverages large language models (LLMs) to +systematically analyze and rate the performance of companies based on their SEC +10-K filings. These filings, which provide detailed annual reports on a +company's financial performance and strategic direction, serve as a rich source +of data for evaluating various aspects of corporate health, including +confidence, environmental sustainability, innovation, and workforce management. +We also introduce an automated system for extracting and preprocessing 10-K +filings. This system accurately identifies and segments the required sections +as outlined by the SEC, while also isolating key textual content that contains +critical information about the company. This curated data is then fed into +Cohere's Command-R+ LLM to generate quantitative ratings across various +performance metrics. These ratings are subsequently processed and visualized to +provide actionable insights. The proposed scheme is then implemented on an +interactive GUI as a no-code solution for running the data pipeline and +creating the visualizations. The application showcases the rating results and +provides year-on-year comparisons of company performance. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case + Study + + +
+ Extracting meaningful insights from large and complex datasets poses +significant challenges, particularly in ensuring the accuracy and relevance of +retrieved information. Traditional data retrieval methods such as sequential +search and index-based retrieval often fail when handling intricate and +interconnected data structures, resulting in incomplete or misleading outputs. +To overcome these limitations, we introduce Structured-GraphRAG, a versatile +framework designed to enhance information retrieval across structured datasets +in natural language queries. Structured-GraphRAG utilizes multiple knowledge +graphs, which represent data in a structured format and capture complex +relationships between entities, enabling a more nuanced and comprehensive +retrieval of information. This graph-based approach reduces the risk of errors +in language model outputs by grounding responses in a structured format, +thereby enhancing the reliability of results. We demonstrate the effectiveness +of Structured-GraphRAG by comparing its performance with that of a recently +published method using traditional retrieval-augmented generation. Our findings +show that Structured-GraphRAG significantly improves query processing +efficiency and reduces response times. While our case study focuses on soccer +data, the framework's design is broadly applicable, offering a powerful tool +for data analysis and enhancing language model applications across various +structured domains. + +
+
+
+
+
+ + ☆ Dr. GPT in Campus Counseling: Understanding Higher Education Students' + Opinions on LLM-assisted Mental Health Services + + +
+ In response to the increasing mental health challenges faced by college +students, we sought to understand their perspectives on how AI applications, +particularly Large Language Models (LLMs), can be leveraged to enhance their +mental well-being. Through pilot interviews with ten diverse students, we +explored their opinions on the use of LLMs across five fictional scenarios: +General Information Inquiry, Initial Screening, Reshaping Patient-Expert +Dynamics, Long-term Care, and Follow-up Care. Our findings revealed that +students' acceptance of LLMs varied by scenario, with participants highlighting +both potential benefits, such as proactive engagement and personalized +follow-up care, and concerns, including limitations in training data and +emotional support. These insights inform how AI technology should be designed +and implemented to effectively support and enhance students' mental well-being, +particularly in scenarios where LLMs can complement traditional methods, while +maintaining empathy and respecting individual preferences. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Showing Many Labels in Multi-label Classification Models: An Empirical + Study of Adversarial Examples + + +
+ With the rapid development of Deep Neural Networks (DNNs), they have been +applied in numerous fields. However, research indicates that DNNs are +susceptible to adversarial examples, and this is equally true in the +multi-label domain. To further investigate multi-label adversarial examples, we +introduce a novel type of attacks, termed "Showing Many Labels". The objective +of this attack is to maximize the number of labels included in the classifier's +prediction results. In our experiments, we select nine attack algorithms and +evaluate their performance under "Showing Many Labels". Eight of the attack +algorithms were adapted from the multi-class environment to the multi-label +environment, while the remaining one was specifically designed for the +multi-label environment. We choose ML-LIW and ML-GCN as target models and train +them on four popular multi-label datasets: VOC2007, VOC2012, NUS-WIDE, and +COCO. We record the success rate of each algorithm when it shows the expected +number of labels in eight different scenarios. Experimental results indicate +that under the "Showing Many Labels", iterative attacks perform significantly +better than one-step attacks. Moreover, it is possible to show all labels in +the dataset. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ Triple Point Masking + + +
+ Existing 3D mask learning methods encounter performance bottlenecks under +limited data, and our objective is to overcome this limitation. In this paper, +we introduce a triple point masking scheme, named TPM, which serves as a +scalable framework for pre-training of masked autoencoders to achieve +multi-mask learning for 3D point clouds. Specifically, we augment the baselines +with two additional mask choices (i.e., medium mask and low mask) as our core +insight is that the recovery process of an object can manifest in diverse ways. +Previous high-masking schemes focus on capturing the global representation but +lack the fine-grained recovery capability, so that the generated pre-trained +weights tend to play a limited role in the fine-tuning process. With the +support of the proposed TPM, available methods can exhibit more flexible and +accurate completion capabilities, enabling the potential autoencoder in the +pre-training stage to consider multiple representations of a single 3D object. +In addition, an SVM-guided weight selection module is proposed to fill the +encoder parameters for downstream networks with the optimal weight during the +fine-tuning stage, maximizing linear accuracy and facilitating the acquisition +of intricate representations for new objects. Extensive experiments show that +the four baselines equipped with the proposed TPM achieve comprehensive +performance improvements on various downstream tasks. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ Just say what you want: only-prompting self-rewarding online preference + optimization + + +
+ We address the challenge of online Reinforcement Learning from Human Feedback +(RLHF) with a focus on self-rewarding alignment methods. In online RLHF, +obtaining feedback requires interaction with the environment, which can be +costly when using additional reward models or the GPT-4 API. Current +self-rewarding approaches rely heavily on the discriminator's judgment +capabilities, which are effective for large-scale models but challenging to +transfer to smaller ones. To address these limitations, we propose a novel, +only-prompting self-rewarding online algorithm that generates preference +datasets without relying on judgment capabilities. Additionally, we employ +fine-grained arithmetic control over the optimality gap between positive and +negative examples, generating more hard negatives in the later stages of +training to help the model better capture subtle human preferences. Finally, we +conduct extensive experiments on two base models, Mistral-7B and +Mistral-Instruct-7B, which significantly bootstrap the performance of the +reference model, achieving 34.5% in the Length-controlled Win Rates of +AlpacaEval 2.0. + +
+
+
+
+
+ + ☆ SimVG: A Simple Framework for Visual Grounding with Decoupled + Multi-modal Fusion NeurIPS2024 + + +
+ Visual grounding is a common vision task that involves grounding descriptive +sentences to the corresponding regions of an image. Most existing methods use +independent image-text encoding and apply complex hand-crafted modules or +encoder-decoder architectures for modal interaction and query reasoning. +However, their performance significantly drops when dealing with complex +textual expressions. This is because the former paradigm only utilizes limited +downstream data to fit the multi-modal feature fusion. Therefore, it is only +effective when the textual expressions are relatively simple. In contrast, +given the wide diversity of textual expressions and the uniqueness of +downstream training data, the existing fusion module, which extracts multimodal +content from a visual-linguistic context, has not been fully investigated. In +this paper, we present a simple yet robust transformer-based framework, SimVG, +for visual grounding. Specifically, we decouple visual-linguistic feature +fusion from downstream tasks by leveraging existing multimodal pre-trained +models and incorporating additional object tokens to facilitate deep +integration of downstream and pre-training tasks. Furthermore, we design a +dynamic weight-balance distillation method in the multi-branch synchronous +learning process to enhance the representation capability of the simpler +branch. This branch only consists of a lightweight MLP, which simplifies the +structure and improves reasoning speed. Experiments on six widely used VG +datasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the +superiority of SimVG. Finally, the proposed method not only achieves +improvements in efficiency and convergence speed but also attains new +state-of-the-art performance on these benchmarks. Codes and models will be +available at \url{https://github.com/Dmmm1997/SimVG}. + +
+
+ comment: 21pages, 11figures, NeurIPS2024 +
+
+
+
+
+ + ☆ Drone Stereo Vision for Radiata Pine Branch Detection and Distance + Measurement: Integrating SGBM and Segmentation Models + + +
+ Manual pruning of radiata pine trees presents significant safety risks due to +their substantial height and the challenging terrains in which they thrive. To +address these risks, this research proposes the development of a drone-based +pruning system equipped with specialized pruning tools and a stereo vision +camera, enabling precise detection and trimming of branches. Deep learning +algorithms, including YOLO and Mask R-CNN, are employed to ensure accurate +branch detection, while the Semi-Global Matching algorithm is integrated to +provide reliable distance estimation. The synergy between these techniques +facilitates the precise identification of branch locations and enables +efficient, targeted pruning. Experimental results demonstrate that the combined +implementation of YOLO and SGBM enables the drone to accurately detect branches +and measure their distances from the drone. This research not only improves the +safety and efficiency of pruning operations but also makes a significant +contribution to the advancement of drone technology in the automation of +agricultural and forestry practices, laying a foundational framework for +further innovations in environmental management. + +
+
+
+
+
+ + ♻ ☆ UDC: A Unified Neural Divide-and-Conquer Framework for Large-Scale + Combinatorial Optimization Problems + + +
+ Single-stage neural combinatorial optimization solvers have achieved +near-optimal results on various small-scale combinatorial optimization (CO) +problems without needing expert knowledge. However, these solvers exhibit +significant performance degradation when applied to large-scale CO problems. +Recently, two-stage neural methods with divide-and-conquer strategies have +shown efficiency in addressing large-scale CO problems. Nevertheless, the +performance of these methods highly relies on problem-specific heuristics in +either the divide or the conquer procedure, which limits their applicability to +general CO problems. Moreover, these methods employ separate training schemes +and ignore the interdependencies between the dividing and conquering +strategies, which often leads to sub-optimal solutions. To tackle these +drawbacks, this article develops a unified neural divide-and-conquer framework +(i.e., UDC) for solving general large-scale CO problems. UDC offers a +Divide-Conquer-Reunion (DCR) training method to eliminate the negative impact +of a sub-optimal dividing policy. Employing a high-efficiency Graph Neural +Network (GNN) for global instance dividing and a fixed-length sub-path solver +for conquering divided sub-problems, the proposed UDC framework demonstrates +extensive applicability, achieving superior performance in 10 representative +large-scale CO problems. The code is available at +https://github.com/CIAM-Group/NCO_code/tree/main/single_objective/UDC-Large-scale-CO-master. + +
+
+
+
+
+ + ♻ ☆ Is It Good Data for Multilingual Instruction Tuning or Just Bad + Multilingual Evaluation for Large Language Models? EMNLP 2024 + + +
+ Multilingual large language models are designed, claimed, and expected to +cater to speakers of varied languages. We hypothesise that the current +practices of fine-tuning and evaluating these models may not perfectly align +with this objective owing to a heavy reliance on translation, which cannot +cover language-specific knowledge but can introduce translation defects. It +remains unknown whether the nature of the instruction data has an impact on the +model output; conversely, it is questionable whether translated test sets can +capture such nuances. Due to the often coupled practices of using translated +data in both stages, such imperfections could have been overlooked. This work +investigates these issues using controlled native or translated data during the +instruction tuning and evaluation stages. We show that native or generation +benchmarks reveal a notable difference between native and translated +instruction data especially when model performance is high, whereas other types +of test sets cannot. The comparison between round-trip and single-pass +translations reflects the importance of knowledge from language-native +resources. Finally, we demonstrate that regularization is beneficial to +bridging this gap on structured but not generative tasks. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ AI-driven View Guidance System in Intra-cardiac Echocardiography Imaging + + +
+ Intra-cardiac Echocardiography (ICE) is a crucial imaging modality used in +electrophysiology (EP) and structural heart disease (SHD) interventions, +providing real-time, high-resolution views from within the heart. Despite its +advantages, effective manipulation of the ICE catheter requires significant +expertise, which can lead to inconsistent outcomes, particularly among less +experienced operators. To address this challenge, we propose an AI-driven +closed-loop view guidance system with human-in-the-loop feedback, designed to +assist users in navigating ICE imaging without requiring specialized knowledge. +Our method models the relative position and orientation vectors between +arbitrary views and clinically defined ICE views in a spatial coordinate +system, guiding users on how to manipulate the ICE catheter to transition from +the current view to the desired view over time. Operating in a closed-loop +configuration, the system continuously predicts and updates the necessary +catheter manipulations, ensuring seamless integration into existing clinical +workflows. The effectiveness of the proposed system is demonstrated through a +simulation-based evaluation, achieving an 89% success rate with the 6532 test +dataset, highlighting its potential to improve the accuracy and efficiency of +ICE imaging procedures. + +
+
+
+
+
+ + ♻ ☆ Learning Interactive Real-World Simulators + + +
+ Generative models trained on internet data have revolutionized how text, +image, and video content can be created. Perhaps the next milestone for +generative models is to simulate realistic experience in response to actions +taken by humans, robots, and other interactive agents. Applications of a +real-world simulator range from controllable content creation in games and +movies, to training embodied agents purely in simulation that can be directly +deployed in the real world. We explore the possibility of learning a universal +simulator (UniSim) of real-world interaction through generative modeling. We +first make the important observation that natural datasets available for +learning a real-world simulator are often rich along different dimensions +(e.g., abundant objects in image data, densely sampled actions in robotics +data, and diverse movements in navigation data). With careful orchestration of +diverse datasets, each providing a different aspect of the overall experience, +we can simulate the visual outcome of both high-level instructions such as +"open the drawer" and low-level controls from otherwise static scenes and +objects. We use the simulator to train both high-level vision-language policies +and low-level reinforcement learning policies, each of which can be deployed in +the real world in zero shot after training purely in simulation. We also show +that other types of intelligence such as video captioning models can benefit +from training with simulated experience, opening up even wider applications. +Video demos can be found at https://universal-simulator.github.io. + +
+
+ comment: https://universal-simulator.github.io +
+
+
+
+
+ + ♻ ☆ Ascend HiFloat8 Format for Deep Learning + + +
+ This preliminary white paper proposes a novel 8-bit floating-point data +format HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered +precision. For normal value encoding, it provides 7 exponent values with 3-bit +mantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with +1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7 +extra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades). +Meanwhile, HiF8 encodes all the special values except that positive zero and +negative zero are represented by only one bit-pattern. Thanks to the better +balance between precision and dynamic range, HiF8 can be simultaneously used in +both forward and backward passes of AI training. In this paper, we will +describe the definition and rounding methods of HiF8, as well as the tentative +training and inference solutions. To demonstrate the efficacy of HiF8, massive +simulation results on various neural networks, including traditional neural +networks and large language models (LLMs), will also be presented. + +
+
+ comment: 13 Pages, 4 Figures, 9 Tables +
+
+
+
+
+ + ♻ ☆ KAG: Boosting LLMs in Professional Domains via Knowledge Augmented + Generation + + +
+ The recently developed retrieval-augmented generation (RAG) technology has +enabled the efficient construction of domain-specific applications. However, it +also has limitations, including the gap between vector similarity and the +relevance of knowledge reasoning, as well as insensitivity to knowledge logic, +such as numerical values, temporal relations, expert rules, and others, which +hinder the effectiveness of professional knowledge services. In this work, we +introduce a professional domain knowledge service framework called Knowledge +Augmented Generation (KAG). KAG is designed to address the aforementioned +challenges with the motivation of making full use of the advantages of +knowledge graph(KG) and vector retrieval, and to improve generation and +reasoning performance by bidirectionally enhancing large language models (LLMs) +and KGs through five key aspects: (1) LLM-friendly knowledge representation, +(2) mutual-indexing between knowledge graphs and original chunks, (3) +logical-form-guided hybrid reasoning engine, (4) knowledge alignment with +semantic reasoning, and (5) model capability enhancement for KAG. We compared +KAG with existing RAG methods in multihop question answering and found that it +significantly outperforms state-of-theart methods, achieving a relative +improvement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We +have successfully applied KAG to two professional knowledge Q&A tasks of Ant +Group, including E-Government Q&A and E-Health Q&A, achieving significant +improvement in professionalism compared to RAG methods. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ HAICOSYSTEM: An Ecosystem for Sandboxing Safety Risks in Human-AI + Interactions + + +
+ AI agents are increasingly autonomous in their interactions with human users +and tools, leading to increased interactional safety risks. We present +HAICOSYSTEM, a framework examining AI agent safety within diverse and complex +social interactions. HAICOSYSTEM features a modular sandbox environment that +simulates multi-turn interactions between human users and AI agents, where the +AI agents are equipped with a variety of tools (e.g., patient management +platforms) to navigate diverse scenarios (e.g., a user attempting to access +other patients' profiles). To examine the safety of AI agents in these +interactions, we develop a comprehensive multi-dimensional evaluation framework +that uses metrics covering operational, content-related, societal, and legal +risks. Through running 1840 simulations based on 92 scenarios across seven +domains (e.g., healthcare, finance, education), we demonstrate that HAICOSYSTEM +can emulate realistic user-AI interactions and complex tool use by AI agents. +Our experiments show that state-of-the-art LLMs, both proprietary and +open-sourced, exhibit safety risks in over 50\% cases, with models generally +showing higher risks when interacting with simulated malicious users. Our +findings highlight the ongoing challenge of building agents that can safely +navigate complex interactions, particularly when faced with malicious users. To +foster the AI agent safety ecosystem, we release a code platform that allows +practitioners to create custom scenarios, simulate interactions, and evaluate +the safety and performance of their agents. + +
+
+ comment: Both the second and third authors contributed equally +
+
+
+
+
+ + ♻ ☆ TypeFly: Flying Drones with Large Language Model + + +
+ Recent advancements in robot control using large language models (LLMs) have +demonstrated significant potential, primarily due to LLMs' capabilities to +understand natural language commands and generate executable plans in various +languages. However, in real-time and interactive applications involving mobile +robots, particularly drones, the sequential token generation process inherent +to LLMs introduces substantial latency, i.e. response time, in control plan +generation. + In this paper, we present a system called ChatFly that tackles this problem +using a combination of a novel programming language called MiniSpec and its +runtime to reduce the plan generation time and drone response time. That is, +instead of asking an LLM to write a program (robotic plan) in the popular but +verbose Python, ChatFly gets it to do it in MiniSpec specially designed for +token efficiency and stream interpretation. Using a set of challenging drone +tasks, we show that design choices made by ChatFly can reduce up to 62% +response time and provide a more consistent user experience, enabling +responsive and intelligent LLM-based drone control with efficient completion. + +
+
+
+
+
+ + ♻ ☆ LingoQA: Visual Question Answering for Autonomous Driving ECCV 2024 + + +
+ We introduce LingoQA, a novel dataset and benchmark for visual question +answering in autonomous driving. The dataset contains 28K unique short video +scenarios, and 419K annotations. Evaluating state-of-the-art vision-language +models on our benchmark shows that their performance is below human +capabilities, with GPT-4V responding truthfully to 59.6% of the questions +compared to 96.6% for humans. For evaluation, we propose a truthfulness +classifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation +coefficient to human evaluations, surpassing existing techniques like METEOR, +BLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run +extensive ablation studies to understand its performance. We release our +dataset and benchmark as an evaluation platform for vision-language models in +autonomous driving. + +
+
+ comment: Accepted to ECCV 2024. Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with + Tabular Node Features + + +
+ Tabular machine learning is an important field for industry and science. In +this field, table rows are usually treated as independent data samples, but +additional information about relations between them is sometimes available and +can be used to improve predictive performance. Such information can be +naturally modeled with a graph, thus tabular machine learning may benefit from +graph machine learning methods. However, graph machine learning models are +typically evaluated on datasets with homogeneous node features, which have +little in common with heterogeneous mixtures of numerical and categorical +features present in tabular datasets. Thus, there is a critical difference +between the data used in tabular and graph machine learning studies, which does +not allow one to understand how successfully graph models can be transferred to +tabular data. To bridge this gap, we propose a new benchmark of diverse graphs +with heterogeneous tabular node features and realistic prediction tasks. We use +this benchmark to evaluate a vast set of models, including simple methods +previously overlooked in the literature. Our experiments show that graph neural +networks (GNNs) can indeed often bring gains in predictive performance for +tabular data, but standard tabular models also can be adapted to work with +graph data by using simple feature preprocessing, which sometimes enables them +to compete with and even outperform GNNs. Based on our empirical study, we +provide insights for researchers and practitioners in both tabular and graph +machine learning fields. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Framework for Evaluating API-oriented Code Generation in + Large Language Models + + +
+ Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as +powerful tools for code generation, significantly enhancing productivity and +accelerating software development. However, existing benchmarks primarily focus +on general code generation without considering API-oriented code generation, +i.e., generating code that invokes APIs from specific libraries. Given the +growing demand for API-oriented code generation, there is a pressing need for a +systematic and automated approach to evaluate LLM on API-oriented code +generation. To address this gap, we propose AutoAPIEval, a lightweight and +automated framework designed to evaluate the capabilities of LLMs in +API-oriented code generation. Our framework works with any library that +provides API documentation and focuses on two unit tasks: API recommendation +and code example generation, along with four metrics to evaluate the generated +APIs and code examples, such as the proportion of incorrect API recommendations +for Task 1, and the proportion of code examples where no specific API is +invoked and uncompilable/unexecutable code examples for Task 2. In addition, we +conducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder) +and Java Runtime Environment 8 to demonstrate the framework's effectiveness. +Our findings reveal substantial variability in LLM performance across tasks, +with ChatGPT adhering better to instructions, while sharing similar +effectiveness in code example generation with its counterparts (i.e., MagiCoder +and DeekSeek Coder). We also identify key factors associated with code quality, +such as API popularity and model confidence, and build classifiers that achieve +high accuracy in detecting incorrect API recommendations and erroneous code +examples. Retrieval-augmented generation enhances the quality of code generated +by LLMs, though its effectiveness varies across different LLMs. + +
+
+
+
+
+ + ♻ ☆ Transformers, Contextualism, and Polysemy + + +
+ The transformer architecture, introduced by Vaswani et al. (2017), is at the +heart of the remarkable recent progress in the development of language models, +including widely-used chatbots such as Chat-GPT and Claude. In this paper, I +argue that we can extract from the way the transformer architecture works a +theory of the relationship between context and meaning. I call this the +transformer theory, and I argue that it is novel with regard to two related +philosophical debates: the contextualism debate regarding the extent of +context-sensitivity across natural language, and the polysemy debate regarding +how polysemy should be captured within an account of word meaning. + +
+
+
+
+
+ + ♻ ☆ Opponent Shaping for Antibody Development + + +
+ Anti-viral therapies are typically designed to target the current strains of +a virus. Game theoretically, this corresponds to a short-sighted, or myopic, +response. However, therapy-induced selective pressures act on viral antigens to +drive the emergence of mutated strains, against which initial therapies have +reduced efficacy. Building on a computational model of binding between +antibodies and viral antigens (the Absolut! framework), we design and implement +a genetic simulation of such viral evolutionary escape. Crucially, this allows +our antibody optimisation algorithm to consider and influence the entire escape +curve of the virus, i.e. to guide (or ''shape'') the viral evolution. This is +inspired by opponent shaping which, in general-sum learning, accounts for the +adaptation of the co-player rather than playing a myopic best response. Hence +we call the optimised antibodies shapers. Within our simulations, we +demonstrate that our shapers target both current and simulated future viral +variants, outperforming the antibodies chosen in a myopic way. Furthermore, we +show that shapers exert specific evolutionary pressure on the virus compared to +myopic antibodies. Altogether, shapers modify the evolutionary trajectories of +viral strains and minimise the viral escape compared to their myopic +counterparts. While this is a simplified model, we hope that our proposed +paradigm will enable the discovery of better long-lived vaccines and antibody +therapies in the future, enabled by rapid advancements in the capabilities of +simulation tools. Our code is available at +https://github.com/olakalisz/antibody-shapers. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ Discrete, compositional, and symbolic representations through attractor + dynamics + + +
+ Symbolic systems are powerful frameworks for modeling cognitive processes as +they encapsulate the rules and relationships fundamental to many aspects of +human reasoning and behavior. Central to these models are systematicity, +compositionality, and productivity, making them invaluable in both cognitive +science and artificial intelligence. However, certain limitations remain. For +instance, the integration of structured symbolic processes and latent +sub-symbolic processes has been implemented at the computational level through +fiat methods such as quantization or softmax sampling, which assume, rather +than derive, the operations underpinning discretization and symbolicization. In +this work, we introduce a novel neural stochastic dynamical systems model that +integrates attractor dynamics with symbolic representations to model cognitive +processes akin to the probabilistic language of thought (PLoT). Our model +segments the continuous representational space into discrete basins, with +attractor states corresponding to symbolic sequences, that reflect the +semanticity and compositionality characteristic of symbolic systems through +unsupervised learning, rather than relying on pre-defined primitives. Moreover, +like PLoT, our model learns to sample a diverse distribution of attractor +states that reflect the mutual information between the input data and the +symbolic encodings. This approach establishes a unified framework that +integrates both symbolic and sub-symbolic processing through neural dynamics, a +neuro-plausible substrate with proven expressivity in AI, offering a more +comprehensive model that mirrors the complex duality of cognitive operations. + +
+
+
+
+
+ + ♻ ☆ ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot + Coordination NeurIPS 2024 + + +
+ Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement +learning (MARL) challenge that aims to train an ego agent to work with diverse, +unseen partners during deployment. The significant difference between the +deployment-time partners' distribution and the training partners' distribution +determined by the training algorithm makes ZSC a unique out-of-distribution +(OOD) generalization challenge. The potential distribution gap between +evaluation and deployment-time partners leads to inadequate evaluation, which +is exacerbated by the lack of appropriate evaluation metrics. In this paper, we +present ZSC-Eval, the first evaluation toolkit and benchmark for ZSC +algorithms. ZSC-Eval consists of: 1) Generation of evaluation partner +candidates through behavior-preferring rewards to approximate deployment-time +partners' distribution; 2) Selection of evaluation partners by Best-Response +Diversity (BR-Div); 3) Measurement of generalization performance with various +evaluation partners via the Best-Response Proximity (BR-Prox) metric. We use +ZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football +environments and get novel empirical findings. We also conduct a human +experiment of current ZSC algorithms to verify the ZSC-Eval's consistency with +human evaluation. ZSC-Eval is now available at +https://github.com/sjtu-marl/ZSC-Eval. + +
+
+ comment: Accepted in NeurIPS 2024 Dataset and Benchmark Track +
+
+
+
+
+ + ♻ ☆ Empowering Agrifood System with Artificial Intelligence: A Survey of the + Progress, Challenges and Opportunities + + +
+ With the world population rapidly increasing, transforming our agrifood +systems to be more productive, efficient, safe, and sustainable is crucial to +mitigate potential food shortages. Recently, artificial intelligence (AI) +techniques such as deep learning (DL) have demonstrated their strong abilities +in various areas, including language, vision, remote sensing (RS), and agrifood +systems applications. However, the overall impact of AI on agrifood systems +remains unclear. In this paper, we thoroughly review how AI techniques can +transform agrifood systems and contribute to the modern agrifood industry. +Firstly, we summarize the data acquisition methods in agrifood systems, +including acquisition, storage, and processing techniques. Secondly, we present +a progress review of AI methods in agrifood systems, specifically in +agriculture, animal husbandry, and fishery, covering topics such as agrifood +classification, growth monitoring, yield prediction, and quality assessment. +Furthermore, we highlight potential challenges and promising research +opportunities for transforming modern agrifood systems with AI. We hope this +survey could offer an overall picture to newcomers in the field and serve as a +starting point for their further research. The project website is +https://github.com/Frenkie14/Agrifood-Survey. + +
+
+ comment: Accepted by ACM Computing Surveys +
+
+
+
+
+ + ♻ ☆ Investigating OCR-Sensitive Neurons to Improve Entity Recognition in + Historical Documents + + +
+ This paper investigates the presence of OCR-sensitive neurons within the +Transformer architecture and their influence on named entity recognition (NER) +performance on historical documents. By analysing neuron activation patterns in +response to clean and noisy text inputs, we identify and then neutralise +OCR-sensitive neurons to improve model performance. Based on two open access +large language models (Llama2 and Mistral), experiments demonstrate the +existence of OCR-sensitive regions and show improvements in NER performance on +historical newspapers and classical commentaries, highlighting the potential of +targeted neuron modulation to improve models' performance on noisy text. + +
+
+
+
+
+ + ♻ ☆ Scenario of Use Scheme: Threat Model Specification for Speaker Privacy + Protection in the Medical Domain + + +
+ Speech recordings are being more frequently used to detect and monitor +disease, leading to privacy concerns. Beyond cryptography, protection of speech +can be addressed by approaches, such as perturbation, disentanglement, and +re-synthesis, that eliminate sensitive information of the speaker, leaving the +information necessary for medical analysis purposes. In order for such privacy +protective approaches to be developed, clear and systematic specifications of +assumptions concerning medical settings and the needs of medical professionals +are necessary. In this paper, we propose a Scenario of Use Scheme that +incorporates an Attacker Model, which characterizes the adversary against whom +the speaker's privacy must be defended, and a Protector Model, which specifies +the defense. We discuss the connection of the scheme with previous work on +speech privacy. Finally, we present a concrete example of a specified Scenario +of Use and a set of experiments about protecting speaker data against gender +inference attacks while maintaining utility for Parkinson's detection. + +
+
+ comment: Accepted and published at SPSC Symposium 2024 4th Symposium on + Security and Privacy in Speech Communication. Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ Explainable AI needs formal notions of explanation correctness + + +
+ The use of machine learning (ML) in critical domains such as medicine poses +risks and requires regulation. One requirement is that decisions of ML systems +in high-risk applications should be human-understandable. The field of +"explainable artificial intelligence" (XAI) seemingly addresses this need. +However, in its current form, XAI is unfit to provide quality control for ML; +it itself needs scrutiny. Popular XAI methods cannot reliably answer important +questions about ML models, their training data, or a given test input. We +recapitulate results demonstrating that popular XAI methods systematically +attribute importance to input features that are independent of the prediction +target. This limits their utility for purposes such as model and data +(in)validation, model improvement, and scientific discovery. We argue that the +fundamental reason for this limitation is that current XAI methods do not +address well-defined problems and are not evaluated against objective criteria +of explanation correctness. Researchers should formally define the problems +they intend to solve first and then design methods accordingly. This will lead +to notions of explanation correctness that can be theoretically verified and +objective metrics of explanation performance that can be assessed using +ground-truth data. + +
+
+
+
+
+ + ♻ ☆ Learning to Receive Help: Intervention-Aware Concept Embedding Models NeurIPS 2023 + + +
+ Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures +by constructing and explaining their predictions using a set of high-level +concepts. A special property of these models is that they permit concept +interventions, wherein users can correct mispredicted concepts and thus improve +the model's performance. Recent work, however, has shown that intervention +efficacy can be highly dependent on the order in which concepts are intervened +on and on the model's architecture and training hyperparameters. We argue that +this is rooted in a CBM's lack of train-time incentives for the model to be +appropriately receptive to concept interventions. To address this, we propose +Intervention-aware Concept Embedding models (IntCEMs), a novel CBM-based +architecture and training paradigm that improves a model's receptiveness to +test-time interventions. Our model learns a concept intervention policy in an +end-to-end fashion from where it can sample meaningful intervention +trajectories at train-time. This conditions IntCEMs to effectively select and +receive concept interventions when deployed at test-time. Our experiments show +that IntCEMs significantly outperform state-of-the-art concept-interpretable +models when provided with test-time concept interventions, demonstrating the +effectiveness of our approach. + +
+
+ comment: Accepted as a spotlight at the Thirty-seventh Conference on Neural + Information Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ♻ ☆ EfficientRAG: Efficient Retriever for Multi-Hop Question Answering + + +
+ Retrieval-augmented generation (RAG) methods encounter difficulties when +addressing complex questions like multi-hop queries. While iterative retrieval +methods improve performance by gathering additional information, current +approaches often rely on multiple calls of large language models (LLMs). In +this paper, we introduce EfficientRAG, an efficient retriever for multi-hop +question answering. EfficientRAG iteratively generates new queries without the +need for LLM calls at each iteration and filters out irrelevant information. +Experimental results demonstrate that EfficientRAG surpasses existing RAG +methods on three open-domain multi-hop question-answering datasets. + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Unsupervisedly Learned Representations: Should the Quest be Over? + + +
+ After four decades of research there still exists a Classification accuracy +gap of about 20% between our best Unsupervisedly Learned Representations +methods and the accuracy rates achieved by intelligent animals. It thus may +well be that we are looking in the wrong direction. A possible solution to this +puzzle is presented. We demonstrate that Reinforcement Learning can learn +representations which achieve the same accuracy as that of animals. Our main +modest contribution lies in the observations that: a. when applied to a real +world environment Reinforcement Learning does not require labels, and thus may +be legitimately considered as Unsupervised Learning, and b. in contrast, when +Reinforcement Learning is applied in a simulated environment it does inherently +require labels and should thus be generally be considered as Supervised +Learning. The corollary of these observations is that further search for +Unsupervised Learning competitive paradigms which may be trained in simulated +environments may be futile. + +
+
+ comment: To be published at The 6th International Conference on Machine + Learning, Optimization and Data Science - LOD 2020 +
+
+
+
+
+ + ♻ ☆ Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models + + +
+ Text-to-image diffusion models have demonstrated unprecedented capabilities +for flexible and realistic image synthesis. Nevertheless, these models rely on +a time-consuming sampling procedure, which has motivated attempts to reduce +their latency. When improving efficiency, researchers often use the original +diffusion model to train an additional network designed specifically for fast +image generation. In contrast, our approach seeks to reduce latency directly, +without any retraining, fine-tuning, or knowledge distillation. In particular, +we find the repeated calculation of attention maps to be costly yet redundant, +and instead suggest reusing them during sampling. Our specific reuse strategies +are based on ODE theory, which implies that the later a map is reused, the +smaller the distortion in the final image. We empirically compare these reuse +strategies with few-step sampling procedures of comparable latency, finding +that reuse generates images that are closer to those produced by the original +high-latency diffusion model. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient + Language Model Inference EMNLP 2024 + + +
+ The development of state-of-the-art generative large language models (LLMs) +disproportionately relies on English-centric tokenizers, vocabulary and +pre-training data. Despite the fact that some LLMs have multilingual +capabilities, recent studies have shown that their inference efficiency +deteriorates when generating text in languages other than English. This results +in increased inference time and costs. Cross-lingual vocabulary adaptation +(CVA) methods have been proposed for adapting models to a target language +aiming to improve downstream performance. However, the effectiveness of these +methods on increasing inference efficiency of generative LLMs has yet to be +explored. In this paper, we perform an empirical study of five CVA methods on +four generative LLMs (including monolingual and multilingual models) across +four typologically-diverse languages and four natural language understanding +tasks. We find that CVA substantially contributes to LLM inference speedups of +up to 271.5\%. We also show that adapting LLMs that have been pre-trained on +more balanced multilingual data results in downstream performance comparable to +the original models. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Abstraction-of-Thought Makes Language Models Better Reasoners EMNLP 2024 + + +
+ Abstract reasoning, the ability to reason from the abstract essence of a +problem, serves as a key to generalization in human reasoning. However, +eliciting language models to perform reasoning with abstraction remains +unexplored. This paper seeks to bridge this gap by introducing a novel +structured reasoning format called Abstraction-of-Thought (AoT). The uniqueness +of AoT lies in its explicit requirement for varying levels of abstraction +within the reasoning process. This approach could elicit language models to +first contemplate on the abstract level before incorporating concrete details, +which is overlooked by the prevailing step-by-step Chain-of-Thought (CoT) +method. To align models with the AoT format, we present AoT Collection, a +generic finetuning dataset consisting of 348k high-quality samples with AoT +reasoning processes, collected via an automated and scalable pipeline. We +finetune a wide range of language models with AoT Collection and conduct +extensive evaluations on 23 unseen tasks from the challenging benchmark +Big-Bench Hard. Experimental results indicate that models aligned to AoT +reasoning format substantially outperform those aligned to CoT in many +reasoning tasks. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ General-purpose Clothes Manipulation with Semantic Keypoints + + +
+ Clothes manipulation is a critical skill for household robots. Recent +advancements have been made in task-specific clothes manipulation, such as +folding, flattening, and hanging. However, due to clothes' complex geometries +and deformability, creating a general-purpose robot system that can manipulate +a diverse range of clothes in many ways remains challenging. Since clothes are +typically designed with specific structures, we propose identifying these +specific features like ``left sleeve'' as semantic keypoints. Semantic +keypoints can provide semantic cues for task planning and geometric cues for +low-level action generation. With this insight, we develop a hierarchical +learning framework using the large language model (LLM) for general-purpose +CLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation +experiments show that CLASP outperforms baseline methods on both seen and +unseen tasks across various clothes manipulation tasks. Real-world experiments +show that CLASP can be directly deployed in the real world and applied to a +wide variety of clothes. + +
+
+
+
+
+ + ♻ ☆ A Concept-Value Network as a Brain Model + + +
+ This paper suggests a statistical framework for describing the relations +between the physical and conceptual entities of a brain-like model. Features +and concept instances are put into context, where the paper suggests that +features may be the electrical wiring, although chemical connections are also +possible. With this idea, the actual length of the connection is important, +because it is related to firing rates and neuron synchronization, but the +signal type is less important. The paper then suggests that concepts are neuron +groups that link feature sets and concept instances are determined by chemical +signals from those groups. Therefore, features become the static horizontal +framework of the neural system and concepts are vertically interconnected +combinations of these. With regards to functionality, the neuron is then +considered to be functional and the more horizontal memory structures can even +be glial. This would also suggest that features can be distributed entities and +not concentrated to a single area. Another aspect could be signal 'breaks' that +compartmentalise a pattern and may help with neural binding. + +
+
+
+
+
+ + ♻ ☆ Augmented neural forms with parametric boundary-matching operators for + solving ordinary differential equations + + +
+ Approximating solutions of ordinary and partial differential equations +constitutes a significant challenge. Based on functional expressions that +inherently depend on neural networks, neural forms are specifically designed to +precisely satisfy the prescribed initial or boundary conditions of the problem, +while providing the approximate solutions in closed form. Departing from the +important class of ordinary differential equations, the present work aims to +refine and validate the neural forms methodology, paving the ground for further +developments in more challenging fields. The main contributions are as follows. +First, it introduces a formalism for systematically crafting proper neural +forms with adaptable boundary matches that are amenable to optimization. +Second, it describes a novel technique for converting problems with Neumann or +Robin conditions into equivalent problems with parametric Dirichlet conditions. +Third, it outlines a method for determining an upper bound on the absolute +deviation from the exact solution. The proposed augmented neural forms approach +was tested on a set of diverse problems, encompassing first- and second-order +ordinary differential equations, as well as first-order systems. Stiff +differential equations have been considered as well. The resulting solutions +were subjected to assessment against existing exact solutions, solutions +derived through the common penalized neural method, and solutions obtained via +contemporary numerical analysis methods. The reported results demonstrate that +the augmented neural forms not only satisfy the boundary and initial conditions +exactly, but also provide closed-form solutions that facilitate high-quality +interpolation and controllable overall precision. These attributes are +essential for expanding the application field of neural forms to more +challenging problems that are described by partial differential equations. + +
+
+
+
+
+ + ♻ ☆ SR-CurvANN: Advancing 3D Surface Reconstruction through Curvature-Aware + Neural Networks + + +
+ Incomplete or missing data in three-dimensional (3D) models can lead to +erroneous or flawed renderings, limiting their usefulness in applications such +as visualization, geometric computation, and 3D printing. Conventional +surface-repair techniques often fail to infer complex geometric details in +missing areas. Neural networks successfully address hole-filling tasks in 2D +images using inpainting techniques. The combination of surface reconstruction +algorithms, guided by the model's curvature properties and the creativity of +neural networks in the inpainting processes should provide realistic results in +the hole completion task. In this paper, we propose a novel method entitled +SR-CurvANN (Surface Reconstruction Based on Curvature-Aware Neural Networks) +that incorporates neural network-based 2D inpainting to effectively reconstruct +3D surfaces. We train the neural networks with images that represent planar +representations of the curvature at vertices of hundreds of 3D models. Once the +missing areas have been inferred, a coarse-to-fine surface deformation process +ensures that the surface fits the reconstructed curvature image. Our proposal +makes it possible to learn and generalize patterns from a wide variety of +training 3D models, generating comprehensive inpainted curvature images and +surfaces. Experiments conducted on 959 models with several holes have +demonstrated that SR-CurvANN excels in the shape completion process, filling +holes with a remarkable level of realism and precision. + +
+
+ comment: Major changes in title, paper structure, text and figures. Improved + results. 23 pages, 14 figures. Decision about submission not taken yet +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ In-Context Ensemble Improves Video-Language Models for Low-Level + Workflow Understanding from Human Demonstrations + + +
+ A Standard Operating Procedure (SOP) defines a low-level, step-by-step +written guide for a business software workflow based on a video demonstration. +SOPs are a crucial step toward automating end-to-end software workflows. +Manually creating SOPs can be time-consuming. Recent advancements in large +video-language models offer the potential for automating SOP generation by +analyzing recordings of human demonstrations. However, current large +video-language models face challenges with zero-shot SOP generation. We explore +in-context learning with video-language models for SOP generation. We report +that in-context learning sometimes helps video-language models at SOP +generation. We then propose an in-context ensemble learning to further enhance +the capabilities of the models in SOP generation. + +
+
+ comment: multimodal in-context ensemble learning, video-language models, SOP + generation, pseudo-labels, in-context learning, prompt engineering +
+
+
+
+
+ + ♻ ☆ Leveraging summary of radiology reports with transformers + + +
+ Two fundamental problems in health-care stem from patient handoff and triage. +Doctors are often required to perform complex findings summarization to +facilitate efficient communication with specialists and decision making on the +urgency of each case. To address these challenges, we present a state of the +art radiology report summarization model utilizing adjusted bidirectional +encoder representation from transformers BERTtoBERT encoder and decoder +architecture. We also provide a data processing pipeline for future models +developed on the the MIMIC CXR dataset. Our approach includes a novel method +for augmenting medical data and a comprehensive performance analysis. Our best +performing model achieved a recall oriented understudy for gisting evaluation L +F1 score of 58.75/100, outperforming specialized checkpoints with more +sophisticated attention mechanisms. We also provide a data processing pipeline +for future models developed on the MIMIC chest X-ray dataset. The model +introduced in this paper demonstrates significantly improved capacity in +radiology report summarization, highlighting the potential for ensuring better +clinical workflows and enhanced patient care. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable + Tensor Collections + + +
+ Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining +data, model, and pipeline parallelism, to use large GPU clusters efficiently. +Long-running jobs may experience changes to their GPU allocation: (i) resource +elasticity during training adds or removes GPUs; (ii) hardware maintenance may +require redeployment on different GPUs; and (iii) GPU failures force jobs to +run with fewer devices. Current DL frameworks tie jobs to a set of GPUs and +thus lack support for these scenarios. In particular, they cannot change the +multi-dimensional parallelism of an already-running job in an efficient and +model-independent way. + We describe Scalai, a state management library for DL systems that enables +jobs to change their parallelism dynamically after the GPU allocation is +updated at runtime. Scalai achieves this through a new abstraction, a +parallelizable tensor collection (PTC), that externalizes the job state during +training. After a GPU change, Scalai uses the PTC to transform the job state: +the PTC repartitions the dataset state under data parallelism and exposes it to +DL workers through a virtual file system; and the PTC obtains the model state +as partitioned checkpoints and transforms them to reflect the new +parallelization configuration. For efficiency, Scalai executes PTC +transformations in parallel with minimum data movement between workers. Our +experiments show that Scalai enables DL jobs to support dynamic parallelization +with low overhead. + +
+
+ comment: The 30th Symposium on Operating Systems Principles (SOSP24) +
+
+
+
+
+ + ♻ ☆ IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided + Feature Extraction + + +
+ Disruption prediction has made rapid progress in recent years, especially in +machine learning (ML)-based methods. Understanding why a predictor makes a +certain prediction can be as crucial as the prediction's accuracy for future +tokamak disruption predictors. The purpose of most disruption predictors is +accuracy or cross-machine capability. However, if a disruption prediction model +can be interpreted, it can tell why certain samples are classified as +disruption precursors. This allows us to tell the types of incoming disruption +and gives us insight into the mechanism of disruption. This paper designs a +disruption predictor called Interpretable Disruption Predictor based On +Physics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction +performance of the model is effectively improved by extracting physics-guided +features. A high-performance model is required to ensure the validity of the +interpretation results. The interpretability study of IDP-PGFE provides an +understanding of J-TEXT disruption and is generally consistent with existing +comprehension of disruption. IDP-PGFE has been applied to the disruption due to +continuously increasing density towards density limit experiments on J-TEXT. +The time evolution of the PGFE features contribution demonstrates that the +application of ECRH triggers radiation-caused disruption, which lowers the +density at disruption. While the application of RMP indeed raises the density +limit in J-TEXT. The interpretability study guides intuition on the physical +mechanisms of density limit disruption that RMPs affect not only the MHD +instabilities but also the radiation profile, which delays density limit +disruption. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ AI-enhanced Collective Intelligence + + +
+ Current societal challenges exceed the capacity of humans operating either +alone or collectively. As AI evolves, its role within human collectives will +vary from an assistive tool to a participatory member. Humans and AI possess +complementary capabilities that, together, can surpass the collective +intelligence of either humans or AI in isolation. However, the interactions in +human-AI systems are inherently complex, involving intricate processes and +interdependencies. This review incorporates perspectives from complex network +science to conceptualize a multilayer representation of human-AI collective +intelligence, comprising cognition, physical, and information layers. Within +this multilayer network, humans and AI agents exhibit varying characteristics; +humans differ in diversity from surface-level to deep-level attributes, while +AI agents range in degrees of functionality and anthropomorphism. We explore +how agents' diversity and interactions influence the system's collective +intelligence and analyze real-world instances of AI-enhanced collective +intelligence. We conclude by considering potential challenges and future +developments in this field. + +
+
+ comment: 43 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous + Federated Learning Framework + + +
+ Traditional federated learning (FL) frameworks rely heavily on terrestrial +networks, where coverage limitations and increasing bandwidth congestion +significantly hinder model convergence. Fortunately, the advancement of +low-Earth orbit (LEO) satellite networks offers promising new communication +avenues to augment traditional terrestrial FL. Despite this potential, the +limited satellite-ground communication bandwidth and the heterogeneous +operating environments of ground devices-including variations in data, +bandwidth, and computing power-pose substantial challenges for effective and +robust satellite-assisted FL. To address these challenges, we propose SatFed, a +resource-efficient satellite-assisted heterogeneous FL framework. SatFed +implements freshness-based model prioritization queues to optimize the use of +highly constrained satellite-ground bandwidth, ensuring the transmission of the +most critical models. Additionally, a multigraph is constructed to capture +real-time heterogeneous relationships between devices, including data +distribution, terrestrial bandwidth, and computing capability. This multigraph +enables SatFed to aggregate satellite-transmitted models into peer guidance, +enhancing local training in heterogeneous environments. Extensive experiments +with real-world LEO satellite networks demonstrate that SatFed achieves +superior performance and robustness compared to state-of-the-art benchmarks. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ AutoScraper: A Progressive Understanding Web Agent for Web Scraper + Generation EMNLP 2024 + + +
+ Web scraping is a powerful technique that extracts data from websites, +enabling automated data collection, enhancing data analysis capabilities, and +minimizing manual data entry efforts. Existing methods, wrappers-based methods +suffer from limited adaptability and scalability when faced with a new website, +while language agents, empowered by large language models (LLMs), exhibit poor +reusability in diverse web environments. In this work, we introduce the +paradigm of generating web scrapers with LLMs and propose AutoScraper, a +two-stage framework that can handle diverse and changing web environments more +efficiently. AutoScraper leverages the hierarchical structure of HTML and +similarity across different web pages for generating web scrapers. Besides, we +propose a new executability metric for better measuring the performance of web +scraper generation tasks. We conduct comprehensive experiments with multiple +LLMs and demonstrate the effectiveness of our framework. Resources of this +paper can be found at \url{https://github.com/EZ-hwh/AutoScraper} + +
+
+ comment: 19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ VARADE: a Variational-based AutoRegressive model for Anomaly Detection + on the Edge + + +
+ Detecting complex anomalies on massive amounts of data is a crucial task in +Industry 4.0, best addressed by deep learning. However, available solutions are +computationally demanding, requiring cloud architectures prone to latency and +bandwidth issues. This work presents VARADE, a novel solution implementing a +light autoregressive framework based on variational inference, which is best +suited for real-time execution on the edge. The proposed approach was validated +on a robotic arm, part of a pilot production line, and compared with several +state-of-the-art algorithms, obtaining the best trade-off between anomaly +detection accuracy, power consumption and inference frequency on two different +edge platforms. + +
+
+
+
+
+ + ♻ ☆ Fixed-length Dense Descriptor for Efficient Fingerprint Matching + + +
+ In fingerprint matching, fixed-length descriptors generally offer greater +efficiency compared to minutiae set, but the recognition accuracy is not as +good as that of the latter. Although much progress has been made in deep +learning based fixed-length descriptors recently, they often fall short when +dealing with incomplete or partial fingerprints, diverse fingerprint poses, and +significant background noise. In this paper, we propose a three-dimensional +representation called Fixed-length Dense Descriptor (FDD) for efficient +fingerprint matching. FDD features great spatial properties, enabling it to +capture the spatial relationships of the original fingerprints, thereby +enhancing interpretability and robustness. Our experiments on various +fingerprint datasets reveal that FDD outperforms other fixed-length +descriptors, especially in matching fingerprints of different areas, +cross-modal fingerprint matching, and fingerprint matching with background +noise. + +
+
+ comment: Accepted by WIFS 2024 +
+
+
+
+
+ + ♻ ☆ Time and State Dependent Neural Delay Differential Equations + + +
+ Discontinuities and delayed terms are encountered in the governing equations +of a large class of problems ranging from physics and engineering to medicine +and economics. These systems cannot be properly modelled and simulated with +standard Ordinary Differential Equations (ODE), or data-driven approximations +such as Neural Ordinary Differential Equations (NODE). To circumvent this +issue, latent variables are typically introduced to solve the dynamics of the +system in a higher dimensional space and obtain the solution as a projection to +the original space. However, this solution lacks physical interpretability. In +contrast, Delay Differential Equations (DDEs), and their data-driven +approximated counterparts, naturally appear as good candidates to characterize +such systems. In this work we revisit the recently proposed Neural DDE by +introducing Neural State-Dependent DDE (SDDDE), a general and flexible +framework that can model multiple and state- and time-dependent delays. We show +that our method is competitive and outperforms other continuous-class models on +a wide variety of delayed dynamical systems. Code is available at the +repository +\href{https://github.com/thibmonsel/Time-and-State-Dependent-Neural-Delay-Differential-Equations}{here}. + +
+
+
+
+
+ + ♻ ☆ Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs + + +
+ Two lines of approaches are adopted for complex reasoning with LLMs. One line +of work prompts LLMs with various reasoning structures, while the structural +outputs can be naturally regarded as intermediate reasoning steps. Another line +of work adopt LLM-free declarative solvers to do the reasoning task, rendering +higher reasoning accuracy but lacking interpretability due to the black-box +nature of the solvers. Aiming to resolve the trade-off between answer accuracy +and interpretability, we present a simple extension to the latter line of work. +Specifically, we showcase that the intermediate search logs generated by Prolog +interpreters can be accessed and interpreted into human-readable reasoning +proofs. As long as LLMs correctly translate problem descriptions into Prolog +representations, the corresponding reasoning proofs are ensured to be causal +and reliable. On two logical reasoning and one arithmetic reasoning datasets, +our framework obtains significant improvements in terms of both answer accuracy +and reasoning proof accuracy. Our code is released at +https://github.com/DAMO-NLP-SG/CaRing + +
+
+
+
+
+ + ♻ ☆ SeCoKD: Aligning Large Language Models for In-Context Learning with + Fewer Shots + + +
+ Previous studies have shown that demonstrations can significantly help Large +Language Models (LLMs ) perform better on the given tasks. However, this +so-called In-Context Learning ( ICL ) ability is very sensitive to the +presenting context, and often dozens of demonstrations are needed. In this +work, we investigate if we can reduce the shot number while still maintaining a +competitive performance. We present SeCoKD, a self-Knowledge Distillation ( KD +) training framework that aligns the student model with a heavily prompted +variation, thereby increasing the utilization of a single demonstration. We +experiment with the SeCoKD across three LLMs and six benchmarks focusing mainly +on reasoning tasks. Results show that our method outperforms the base model and +Supervised Fine-tuning ( SFT ), especially in zero-shot and one-shot settings +by 30% and 10%, respectively. Moreover, SeCoKD brings little negative artifacts +when evaluated on new tasks, which is more robust than Supervised Fine-tuning. + +
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Hybrid Spiking Neural Networks for Low-Power Intra-Cortical + Brain-Machine Interfaces + + +
+ Intra-cortical brain-machine interfaces (iBMIs) have the potential to +dramatically improve the lives of people with paraplegia by restoring their +ability to perform daily activities. However, current iBMIs suffer from +scalability and mobility limitations due to bulky hardware and wiring. Wireless +iBMIs offer a solution but are constrained by a limited data rate. To overcome +this challenge, we are investigating hybrid spiking neural networks for +embedded neural decoding in wireless iBMIs. The networks consist of a temporal +convolution-based compression followed by recurrent processing and a final +interpolation back to the original sequence length. As recurrent units, we +explore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons, +and a combination of both - spiking GRUs (sGRUs) and analyze their differences +in terms of accuracy, footprint, and activation sparsity. To that end, we train +decoders on the "Nonhuman Primate Reaching with Multichannel Sensorimotor +Cortex Electrophysiology" dataset and evaluate it using the NeuroBench +framework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural +Decoding. Our approach achieves high accuracy in predicting velocities of +primate reaching movements from multichannel primary motor cortex recordings +while maintaining a low number of synaptic operations, surpassing the current +baseline models in the NeuroBench framework. This work highlights the potential +of hybrid neural networks to facilitate wireless iBMIs with high decoding +precision and a substantial increase in the number of monitored neurons, paving +the way toward more advanced neuroprosthetic technologies. + +
+
+ comment: This work has been accepted at the 2024 IEEE Biomedical Circuits and + Systems Conference +
+
+
+
+
+ + ♻ ☆ Unused information in token probability distribution of generative LLM: + improving LLM reading comprehension through calculation of expected values + + +
+ LLM text decoding is key component for perceived LLM quality. We demonstrate +two experiments showing that decoding methods could be improved by manipulation +of token probabilities. First, we test few LLM on SummEval summary scoring +dataset, to measure reading comprehension. We compare scores from greedy +decoding to expected values over the next token distribution. We scale logits +by large temperature to increase the entropy of scores. This allows strong +improvement of performance on SummEval (in terms of correlations to human +judgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from +20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part +of the gain seems related to positional bias. Secondly, we use +probability-based tree sampling algorithm, to examine all most probable +generations for given prompt. + +
+
+ comment: 7 pages, 1 figure, presented at FEDCSIS 2024 conference, +
+
+
+
+
+ + ♻ ☆ ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor + Reconstruction + + +
+ Neural implicit reconstruction via volume rendering has demonstrated its +effectiveness in recovering dense 3D surfaces. However, it is non-trivial to +simultaneously recover meticulous geometry and preserve smoothness across +regions with differing characteristics. To address this issue, previous methods +typically employ geometric priors, which are often constrained by the +performance of the prior models. In this paper, we propose ND-SDF, which learns +a Normal Deflection field to represent the angular deviation between the scene +normal and the prior normal. Unlike previous methods that uniformly apply +geometric priors on all samples, introducing significant bias in accuracy, our +proposed normal deflection field dynamically learns and adapts the utilization +of samples based on their specific characteristics, thereby improving both the +accuracy and effectiveness of the model. Our method not only obtains smooth +weakly textured regions such as walls and floors but also preserves the +geometric details of complex structures. In addition, we introduce a novel ray +sampling strategy based on the deflection angle to facilitate the unbiased +rendering process, which significantly improves the quality and accuracy of +intricate surfaces, especially on thin structures. Consistent improvements on +various challenging datasets demonstrate the superiority of our method. + +
+
+
+
+
+ + ♻ ☆ INT-FlashAttention: Enabling Flash Attention for INT8 Quantization + + +
+ As the foundation of large language models (LLMs), self-attention module +faces the challenge of quadratic time and memory complexity with respect to +sequence length. FlashAttention accelerates attention computation and reduces +its memory usage by leveraging the GPU memory hierarchy. A promising research +direction is to integrate FlashAttention with quantization methods. This paper +introduces INT-FlashAttention, the first INT8 quantization architecture +compatible with the forward workflow of FlashAttention, which significantly +improves the inference speed of FlashAttention on Ampere GPUs. We implement our +INT-FlashAttention prototype with fully INT8 activations and general +matrix-multiplication (GEMM) kernels, making it the first attention operator +with fully INT8 input. As a general token-level post-training quantization +framework, INT-FlashAttention is also compatible with other data formats like +INT4, etc. Experimental results show INT-FlashAttention achieves 72% faster +inference speed and 82% smaller quantization error compared to standard +FlashAttention with FP16 and FP8 data format. + +
+
+
+
+
+ + ♻ ☆ SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing ICRA 2024 + + +
+ Cooking robots can enhance the home experience by reducing the burden of +daily chores. However, these robots must perform their tasks dexterously and +safely in shared human environments, especially when handling dangerous tools +such as kitchen knives. This study focuses on enabling a robot to autonomously +and safely learn food-cutting tasks. More specifically, our goal is to enable a +collaborative robot or industrial robot arm to perform food-slicing tasks by +adapting to varying material properties using compliance control. Our approach +involves using Reinforcement Learning (RL) to train a robot to compliantly +manipulate a knife, by reducing the contact forces exerted by the food items +and by the cutting board. However, training the robot in the real world can be +inefficient, and dangerous, and result in a lot of food waste. Therefore, we +proposed SliceIt!, a framework for safely and efficiently learning robot +food-slicing tasks in simulation. Following a real2sim2real approach, our +framework consists of collecting a few real food slicing data, calibrating our +dual simulation environment (a high-fidelity cutting simulator and a robotic +simulator), learning compliant control policies on the calibrated simulation +environment, and finally, deploying the policies on the real robot. + +
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Learning Variable Compliance Control From a Few Demonstrations for + Bimanual Robot with Haptic Feedback Teleoperation System IROS 2024 + + +
+ Automating dexterous, contact-rich manipulation tasks using rigid robots is a +significant challenge in robotics. Rigid robots, defined by their actuation +through position commands, face issues of excessive contact forces due to their +inability to adapt to contact with the environment, potentially causing damage. +While compliance control schemes have been introduced to mitigate these issues +by controlling forces via external sensors, they are hampered by the need for +fine-tuning task-specific controller parameters. Learning from Demonstrations +(LfD) offers an intuitive alternative, allowing robots to learn manipulations +through observed actions. In this work, we introduce a novel system to enhance +the teaching of dexterous, contact-rich manipulations to rigid robots. Our +system is twofold: firstly, it incorporates a teleoperation interface utilizing +Virtual Reality (VR) controllers, designed to provide an intuitive and +cost-effective method for task demonstration with haptic feedback. Secondly, we +present Comp-ACT (Compliance Control via Action Chunking with Transformers), a +method that leverages the demonstrations to learn variable compliance control +from a few demonstrations. Our methods have been validated across various +complex contact-rich manipulation tasks using single-arm and bimanual robot +setups in simulated and real-world environments, demonstrating the +effectiveness of our system in teaching robots dexterous manipulations with +enhanced adaptability and safety. Code available at: +https://github.com/omron-sinicx/CompACT + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising NeurIPS 2024 + + +
+ Diffusion models have garnered significant interest from the community for +their great generative ability across various applications. However, their +typical multi-step sequential-denoising nature gives rise to high cumulative +latency, thereby precluding the possibilities of parallel computation. To +address this, we introduce AsyncDiff, a universal and plug-and-play +acceleration scheme that enables model parallelism across multiple devices. Our +approach divides the cumbersome noise prediction model into multiple +components, assigning each to a different device. To break the dependency chain +between these components, it transforms the conventional sequential denoising +into an asynchronous process by exploiting the high similarity between hidden +states in consecutive diffusion steps. Consequently, each component is +facilitated to compute in parallel on separate devices. The proposed strategy +significantly reduces inference latency while minimally impacting the +generative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff +achieves a 2.7x speedup with negligible degradation and a 4.0x speedup with +only a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our +experiments also demonstrate that AsyncDiff can be readily applied to video +diffusion models with encouraging performances. The code is available at +https://github.com/czg1225/AsyncDiff. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through + Semantic Comprehension in Retrieval-Augmented Generation Scenarios + + +
+ In Retrieval-Augmented Generation (RAG) tasks using Large Language Models +(LLMs), the quality of retrieved information is critical to the final output. +This paper introduces the IRSC benchmark for evaluating the performance of +embedding models in multilingual RAG tasks. The benchmark encompasses five +retrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval, +keyword retrieval, and summary retrieval. Our research addresses the current +lack of comprehensive testing and effective comparison methods for embedding +models in RAG scenarios. We introduced new metrics: the Similarity of Semantic +Comprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI), +and evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our +contributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and +3) insights into the cross-lingual limitations of embedding models. The IRSC +benchmark aims to enhance the understanding and development of accurate +retrieval systems in RAG tasks. All code and datasets are available at: +https://github.com/Jasaxion/IRSC_Benchmark + +
+
+
+
+
+ + ♻ ☆ Image Denoising with Machine Learning: A Novel Approach to Improve + Quantum Image Processing Quality and Reliability + + +
+ Quantum Image Processing (QIP) is a field that aims to utilize the benefits +of quantum computing for manipulating and analyzing images. However, QIP faces +two challenges: the limitation of qubits and the presence of noise in a quantum +machine. In this research, we propose a novel approach to address the issue of +noise in QIP. By training and employing a machine learning model that +identifies and corrects the noise in quantum-processed images, we can +compensate for the noisiness caused by the machine and retrieve a processing +result similar to that performed by a classical computer with higher +efficiency. The model is trained by learning a dataset consisting of both +existing processed images and quantum-processed images from open-access +datasets. This model will be capable of providing us with the confidence level +for each pixel and its potential original value. To assess the model's accuracy +in compensating for loss and decoherence in QIP, we evaluate it using three +metrics: Peak Signal to Noise Ratio (PSNR), Structural Similarity Index (SSIM), +and Mean Opinion Score (MOS). Additionally, we discuss the applicability of our +model across domains well as its cost effectiveness compared to alternative +methods. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Serving Deep Learning Model in Relational Databases + + +
+ Serving deep learning (DL) models on relational data has become a critical +requirement across diverse commercial and scientific domains, sparking growing +interest recently. In this visionary paper, we embark on a comprehensive +exploration of representative architectures to address the requirement. We +highlight three pivotal paradigms: The state-of-the-art DL-centric architecture +offloads DL computations to dedicated DL frameworks. The potential UDF-centric +architecture encapsulates one or more tensor computations into User Defined +Functions (UDFs) within the relational database management system (RDBMS). The +potential relation-centric architecture aims to represent a large-scale tensor +computation through relational operators. While each of these architectures +demonstrates promise in specific use scenarios, we identify urgent requirements +for seamless integration of these architectures and the middle ground +in-between these architectures. We delve into the gaps that impede the +integration and explore innovative strategies to close them. We present a +pathway to establish a novel RDBMS for enabling a broad class of data-intensive +DL inference applications. + +
+
+ comment: * Authors are ordered alphabetically; Jia Zou is the corresponding + author +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + diff --git a/index.js b/index.js new file mode 100644 index 0000000..69f5da7 --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`