diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..ba77c1c --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-09-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.18110v1","updated":"2024-09-26T17:52:57Z","published":"2024-09-26T17:52:57Z","title":"Open-World Evaluation for Retrieving Diverse Perspectives","summary":" We study retrieving a set of documents that covers various perspectives on a\ncomplex and contentious question (e.g., will ChatGPT do more harm than good?).\nWe curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS),\nwhere each example consists of a question and diverse perspectives associated\nwith the question, sourced from survey questions and debate websites. On this\ndata, retrievers paired with a corpus are evaluated to surface a document set\nthat contains diverse perspectives. Our framing diverges from most retrieval\ntasks in that document relevancy cannot be decided by simple string matches to\nreferences. Instead, we build a language model based automatic evaluator that\ndecides whether each retrieved document contains a perspective. This allows us\nto evaluate the performance of three different types of corpus (Wikipedia, web\nsnapshot, and corpus constructed on the fly with retrieved pages from the\nsearch engine) paired with retrievers. Retrieving diverse documents remains\nchallenging, with the outputs from existing retrievers covering all\nperspectives on only 33.74% of the examples. We further study the impact of\nquery expansion and diversity-focused reranking approaches and analyze\nretriever sycophancy. Together, we lay the foundation for future studies in\nretrieval diversity handling complex queries.\n","authors":["Hung-Ting Chen","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2409.18110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12822v3","updated":"2024-09-26T17:39:44Z","published":"2024-06-18T17:43:47Z","title":"Is It Good Data for Multilingual Instruction Tuning or Just Bad\n Multilingual Evaluation for Large Language Models?","summary":" Multilingual large language models are designed, claimed, and expected to\ncater to speakers of varied languages. We hypothesise that the current\npractices of fine-tuning and evaluating these models may not perfectly align\nwith this objective owing to a heavy reliance on translation, which cannot\ncover language-specific knowledge but can introduce translation defects. It\nremains unknown whether the nature of the instruction data has an impact on the\nmodel output; conversely, it is questionable whether translated test sets can\ncapture such nuances. Due to the often coupled practices of using translated\ndata in both stages, such imperfections could have been overlooked. This work\ninvestigates these issues using controlled native or translated data during the\ninstruction tuning and evaluation stages. We show that native or generation\nbenchmarks reveal a notable difference between native and translated\ninstruction data especially when model performance is high, whereas other types\nof test sets cannot. The comparison between round-trip and single-pass\ntranslations reflects the importance of knowledge from language-native\nresources. Finally, we demonstrate that regularization is beneficial to\nbridging this gap on structured but not generative tasks.\n","authors":["Pinzhen Chen","Simon Yu","Zhicheng Guo","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2406.12822v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18044v1","updated":"2024-09-26T16:46:46Z","published":"2024-09-26T16:46:46Z","title":"Unveiling the Role of Pretraining in Direct Speech Translation","summary":" Direct speech-to-text translation systems encounter an important drawback in\ndata scarcity. A common solution consists on pretraining the encoder on\nautomatic speech recognition, hence losing efficiency in the training process.\nIn this study, we compare the training dynamics of a system using a pretrained\nencoder, the conventional approach, and one trained from scratch. We observe\nthat, throughout the training, the randomly initialized model struggles to\nincorporate information from the speech inputs for its predictions. Hence, we\nhypothesize that this issue stems from the difficulty of effectively training\nan encoder for direct speech translation. While a model trained from scratch\nneeds to learn acoustic and semantic modeling simultaneously, a pretrained one\ncan just focus on the latter. Based on these findings, we propose a subtle\nchange in the decoder cross-attention to integrate source information from\nearlier steps in training. We show that with this change, the model trained\nfrom scratch can achieve comparable performance to the pretrained one, while\nreducing the training time.\n","authors":["Belen Alastruey","Gerard I. Gállego","Marta R. Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2409.18044v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18042v1","updated":"2024-09-26T16:44:02Z","published":"2024-09-26T16:44:02Z","title":"EMOVA: Empowering Language Models to See, Hear and Speak with Vivid\n Emotions","summary":" GPT-4o, an omni-modal model that enables vocal conversations with diverse\nemotions and tones, marks a milestone for omni-modal foundation models.\nHowever, empowering Large Language Models to perceive and generate images,\ntexts, and speeches end-to-end with publicly available data remains challenging\nin the open-source community. Existing vision-language models rely on external\ntools for the speech processing, while speech-language models still suffer from\nlimited or even without vision-understanding abilities. To address this gap, we\npropose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large\nLanguage Models with end-to-end speech capabilities while maintaining the\nleading vision-language performance. With a semantic-acoustic disentangled\nspeech tokenizer, we notice surprisingly that omni-modal alignment can further\nenhance vision-language and speech abilities compared with the corresponding\nbi-modal aligned counterparts. Moreover, a lightweight style module is proposed\nfor flexible speech style controls (e.g., emotions and pitches). For the first\ntime, EMOVA achieves state-of-the-art performance on both the vision-language\nand speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue\nwith vivid emotions.\n","authors":["Kai Chen","Yunhao Gou","Runhui Huang","Zhili Liu","Daxin Tan","Jing Xu","Chunwei Wang","Yi Zhu","Yihan Zeng","Kuo Yang","Dingdong Wang","Kun Xiang","Haoyuan Li","Haoli Bai","Jianhua Han","Xiaohui Li","Weike Jin","Nian Xie","Yu Zhang","James T. Kwok","Hengshuang Zhao","Xiaodan Liang","Dit-Yan Yeung","Xiao Chen","Zhenguo Li","Wei Zhang","Qun Liu","Lanqing Hong","Lu Hou","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18042v1.pdf","comment":"Project Page: https://emova-ollm.github.io/"},{"id":"http://arxiv.org/abs/2409.18033v1","updated":"2024-09-26T16:38:56Z","published":"2024-09-26T16:38:56Z","title":"Automated Detection and Analysis of Power Words in Persuasive Text Using\n Natural Language Processing","summary":" Power words are terms that evoke strong emotional responses and significantly\ninfluence readers' behavior, playing a crucial role in fields like marketing,\npolitics, and motivational writing. This study proposes a methodology for the\nautomated detection and analysis of power words in persuasive text using a\ncustom lexicon and the TextBlob library in Python. By identifying the presence\nand frequency of power words within a given text, we aim to classify and\nanalyze their impact on sentiment and reader engagement. This research examines\ndiverse datasets across various domains to provide insights into the\neffectiveness of power words, offering practical applications for content\ncreators, advertisers, and policymakers.\n","authors":["Sahil Garje"],"pdf_url":"https://arxiv.org/pdf/2409.18033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13731v3","updated":"2024-09-26T16:34:35Z","published":"2024-09-10T02:00:28Z","title":"KAG: Boosting LLMs in Professional Domains via Knowledge Augmented\n Generation","summary":" The recently developed retrieval-augmented generation (RAG) technology has\nenabled the efficient construction of domain-specific applications. However, it\nalso has limitations, including the gap between vector similarity and the\nrelevance of knowledge reasoning, as well as insensitivity to knowledge logic,\nsuch as numerical values, temporal relations, expert rules, and others, which\nhinder the effectiveness of professional knowledge services. In this work, we\nintroduce a professional domain knowledge service framework called Knowledge\nAugmented Generation (KAG). KAG is designed to address the aforementioned\nchallenges with the motivation of making full use of the advantages of\nknowledge graph(KG) and vector retrieval, and to improve generation and\nreasoning performance by bidirectionally enhancing large language models (LLMs)\nand KGs through five key aspects: (1) LLM-friendly knowledge representation,\n(2) mutual-indexing between knowledge graphs and original chunks, (3)\nlogical-form-guided hybrid reasoning engine, (4) knowledge alignment with\nsemantic reasoning, and (5) model capability enhancement for KAG. We compared\nKAG with existing RAG methods in multihop question answering and found that it\nsignificantly outperforms state-of-theart methods, achieving a relative\nimprovement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We\nhave successfully applied KAG to two professional knowledge Q&A tasks of Ant\nGroup, including E-Government Q&A and E-Health Q&A, achieving significant\nimprovement in professionalism compared to RAG methods.\n","authors":["Lei Liang","Mengshu Sun","Zhengke Gui","Zhongshu Zhu","Zhouyu Jiang","Ling Zhong","Yuan Qu","Peilong Zhao","Zhongpu Bo","Jin Yang","Huaidong Xiong","Lin Yuan","Jun Xu","Zaoyang Wang","Zhiqiang Zhang","Wen Zhang","Huajun Chen","Wenguang Chen","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.13731v3.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2409.18028v1","updated":"2024-09-26T16:34:35Z","published":"2024-09-26T16:34:35Z","title":"Compositional Hardness of Code in Large Language Models -- A\n Probabilistic Perspective","summary":" A common practice in large language model (LLM) usage for complex analytical\ntasks such as code generation, is to sample a solution for the entire task\nwithin the model's context window. Previous works have shown that subtask\ndecomposition within the model's context (chain of thought), is beneficial for\nsolving such tasks. In this work, we point a limitation of LLMs' ability to\nperform several sub-tasks within the same context window - an in-context\nhardness of composition, pointing to an advantage for distributing a decomposed\nproblem in a multi-agent system of LLMs. The hardness of composition is\nquantified by a generation complexity metric, i.e., the number of LLM\ngenerations required to sample at least one correct solution. We find a gap\nbetween the generation complexity of solving a compositional problem within the\nsame context relative to distributing it among multiple agents, that increases\nexponentially with the solution's length. We prove our results theoretically\nand demonstrate them empirically.\n","authors":["Yotam Wolf","Binyamin Rothberg","Dorin Shteyman","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2409.18028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18023v1","updated":"2024-09-26T16:31:50Z","published":"2024-09-26T16:31:50Z","title":"DARE: Diverse Visual Question Answering with Robustness Evaluation","summary":" Vision Language Models (VLMs) extend remarkable capabilities of text-only\nlarge language models and vision-only models, and are able to learn from and\nprocess multi-modal vision-text input. While modern VLMs perform well on a\nnumber of standard image classification and image-text matching tasks, they\nstill struggle with a number of crucial vision-language (VL) reasoning\nabilities such as counting and spatial reasoning. Moreover, while they might be\nvery brittle to small variations in instructions and/or evaluation protocols,\nexisting benchmarks fail to evaluate their robustness (or rather the lack of\nit). In order to couple challenging VL scenarios with comprehensive robustness\nevaluation, we introduce DARE, Diverse Visual Question Answering with\nRobustness Evaluation, a carefully created and curated multiple-choice VQA\nbenchmark. DARE evaluates VLM performance on five diverse categories and\nincludes four robustness-oriented evaluations based on the variations of:\nprompts, the subsets of answer options, the output format and the number of\ncorrect answers. Among a spectrum of other findings, we report that\nstate-of-the-art VLMs still struggle with questions in most categories and are\nunable to consistently deliver their peak performance across the tested\nrobustness evaluations. The worst case performance across the subsets of\noptions is up to 34% below the performance in the standard case. The robustness\nof the open-source VLMs such as LLaVA 1.6 and Idefics2 cannot match the\nclosed-source models such as GPT-4 and Gemini, but even the latter remain very\nbrittle to different variations.\n","authors":["Hannah Sterz","Jonas Pfeiffer","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2409.18023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18006v1","updated":"2024-09-26T16:15:14Z","published":"2024-09-26T16:15:14Z","title":"Multilingual Evaluation of Long Context Retrieval and Reasoning","summary":" Recent large language models (LLMs) demonstrate impressive capabilities in\nhandling long contexts, some exhibiting near-perfect recall on synthetic\nretrieval tasks. However, these evaluations have mainly focused on English text\nand involved a single target sentence within lengthy contexts. Our work\ninvestigates how LLM performance generalizes to multilingual settings with\nmultiple hidden target sentences. We comprehensively evaluate several\nlong-context LLMs on retrieval and reasoning tasks across five languages:\nEnglish, Vietnamese, Indonesian, Swahili, and Somali. These languages share the\nLatin script but belong to distinct language families and resource levels. Our\nanalysis reveals a significant performance gap between languages. The\nbest-performing models such as Gemini-1.5 and GPT-4o, achieve around 96%\naccuracy in English to around 36% in Somali with a single target sentence.\nHowever, this accuracy drops to 40% in English and 0% in Somali when dealing\nwith three target sentences. Our findings highlight the challenges long-context\nLLMs face when processing longer contexts, an increase in the number of target\nsentences, or languages of lower resource levels.\n","authors":["Ameeta Agrawal","Andy Dang","Sina Bagheri Nezhad","Rhitabrat Pokharel","Russell Scheinberg"],"pdf_url":"https://arxiv.org/pdf/2409.18006v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.17990v1","updated":"2024-09-26T16:02:00Z","published":"2024-09-26T16:02:00Z","title":"Extracting Affect Aggregates from Longitudinal Social Media Data with\n Temporal Adapters for Large Language Models","summary":" This paper proposes temporally aligned Large Language Models (LLMs) as a tool\nfor longitudinal analysis of social media data. We fine-tune Temporal Adapters\nfor Llama 3 8B on full timelines from a panel of British Twitter users, and\nextract longitudinal aggregates of emotions and attitudes with established\nquestionnaires. We validate our estimates against representative British survey\ndata and find strong positive, significant correlations for several collective\nemotions. The obtained estimates are robust across multiple training seeds and\nprompt formulations, and in line with collective emotions extracted using a\ntraditional classification model trained on labeled data. To the best of our\nknowledge, this is the first work to extend the analysis of affect in LLMs to a\nlongitudinal setting through Temporal Adapters. Our work enables new approaches\ntowards the longitudinal analysis of social media data.\n","authors":["Georg Ahnert","Max Pellert","David Garcia","Markus Strohmaier"],"pdf_url":"https://arxiv.org/pdf/2409.17990v1.pdf","comment":"Code available at https://github.com/dess-mannheim/temporal-adapters"},{"id":"http://arxiv.org/abs/2409.17972v1","updated":"2024-09-26T15:47:42Z","published":"2024-09-26T15:47:42Z","title":"BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and\n Adaptive Disambiguate based Efficient Tree Search","summary":" Large Language Models (LLMs) have exhibited exceptional performance across a\nbroad range of tasks and domains. However, they still encounter difficulties in\nsolving mathematical problems due to the rigorous and logical nature of\nmathematics. Previous studies have employed techniques such as supervised\nfine-tuning (SFT), prompt engineering, and search-based methods to improve the\nmathematical problem-solving abilities of LLMs. Despite these efforts, their\nperformance remains suboptimal and demands substantial computational resources.\nTo address this issue, we propose a novel approach, BEATS, to enhance\nmathematical problem-solving abilities. Our method leverages newly designed\nprompts that guide the model to iteratively rewrite, advance by one step, and\ngenerate answers based on previous steps. Additionally, we introduce a new\nback-verification technique that uses LLMs to validate the correctness of the\ngenerated answers. Furthermore, we employ a pruning tree search to optimize\nsearch time while achieving strong performance. Notably, our method improves\nQwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the\nMATH benchmark.\n","authors":["Linzhuang Sun","Hao Liang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17958v1","updated":"2024-09-26T15:36:10Z","published":"2024-09-26T15:36:10Z","title":"The Hard Positive Truth about Vision-Language Compositionality","summary":" Several benchmarks have concluded that our best vision-language models (e.g.,\nCLIP) are lacking in compositionality. Given an image, these benchmarks probe a\nmodel's ability to identify its associated caption amongst a set of\ncompositional distractors. In response, a surge of recent proposals show\nimprovements by finetuning CLIP with distractors as hard negatives. Our\ninvestigations reveal that these improvements have, in fact, been significantly\noverstated -- because existing benchmarks do not probe whether finetuned\nvision-language models remain invariant to hard positives. By curating an\nevaluation dataset with 112,382 hard negatives and hard positives, we uncover\nthat including hard positives decreases CLIP's performance by 12.9%, while\nhumans perform effortlessly at 99%. CLIP finetuned with hard negatives results\nin an even larger decrease, up to 38.7%. With this finding, we then produce a\n1,775,259 image-text training set with both hard negative and hard positive\ncaptions. By training with both, we see improvements on existing benchmarks\nwhile simultaneously improving performance on hard positives, indicating a more\nrobust improvement in compositionality. Our work suggests the need for future\nresearch to rigorously test and improve CLIP's understanding of semantic\nrelationships between related \"positive\" concepts.\n","authors":["Amita Kamath","Cheng-Yu Hsieh","Kai-Wei Chang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2409.17958v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2305.11231v2","updated":"2024-09-26T15:32:47Z","published":"2023-05-18T18:00:44Z","title":"Recent Trends in Unsupervised Summarization","summary":" Unsupervised summarization is a powerful technique that enables training\nsummarizing models without requiring labeled datasets. This survey covers\ndifferent recent techniques and models used for unsupervised summarization. We\ncover extractive, abstractive, and hybrid models and strategies used to achieve\nunsupervised summarization. While the main focus of this survey is on recent\nresearch, we also cover some of the important previous research. We\nadditionally introduce a taxonomy, classifying different research based on\ntheir approach to unsupervised training. Finally, we discuss the current\napproaches and mention some datasets and evaluation methods.\n","authors":["Mohammad Khosravani","Amine Trabelsi"],"pdf_url":"https://arxiv.org/pdf/2305.11231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17946v1","updated":"2024-09-26T15:20:37Z","published":"2024-09-26T15:20:37Z","title":"Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge\n Distillation","summary":" Despite being widely applied due to their exceptional capabilities, Large\nLanguage Models (LLMs) have been proven to be vulnerable to backdoor attacks.\nThese attacks introduce targeted vulnerabilities into LLMs by poisoning\ntraining samples and full-parameter fine-tuning. However, this kind of backdoor\nattack is limited since they require significant computational resources,\nespecially as the size of LLMs increases. Besides, parameter-efficient\nfine-tuning (PEFT) offers an alternative but the restricted parameter updating\nmay impede the alignment of triggers with target labels. In this study, we\nfirst verify that backdoor attacks with PEFT may encounter challenges in\nachieving feasible performance. To address these issues and improve the\neffectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack\nalgorithm from weak to strong based on contrastive knowledge distillation\n(W2SAttack). Specifically, we poison small-scale language models through\nfull-parameter fine-tuning to serve as the teacher model. The teacher model\nthen covertly transfers the backdoor to the large-scale student model through\ncontrastive knowledge distillation, which employs PEFT. Theoretical analysis\nreveals that W2SAttack has the potential to augment the effectiveness of\nbackdoor attacks. We demonstrate the superior performance of W2SAttack on\nclassification tasks across four language models, four backdoor attack\nalgorithms, and two different architectures of teacher models. Experimental\nresults indicate success rates close to 100% for backdoor attacks targeting\nPEFT.\n","authors":["Shuai Zhao","Leilei Gan","Zhongliang Guo","Xiaobao Wu","Luwei Xiao","Xiaoyu Xu","Cong-Duy Nguyen","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2409.17946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17929v1","updated":"2024-09-26T15:08:17Z","published":"2024-09-26T15:08:17Z","title":"The Lou Dataset -- Exploring the Impact of Gender-Fair Language in\n German Text Classification","summary":" Gender-fair language, an evolving German linguistic variation, fosters\ninclusion by addressing all genders or using neutral forms. Nevertheless, there\nis a significant lack of resources to assess the impact of this linguistic\nshift on classification using language models (LMs), which are probably not\ntrained on such variations. To address this gap, we present Lou, the first\ndataset featuring high-quality reformulations for German text classification\ncovering seven tasks, like stance detection and toxicity classification.\nEvaluating 16 mono- and multi-lingual LMs on Lou shows that gender-fair\nlanguage substantially impacts predictions by flipping labels, reducing\ncertainty, and altering attention patterns. However, existing evaluations\nremain valid, as LM rankings of original and reformulated instances do not\nsignificantly differ. While we offer initial insights on the effect on German\ntext classification, the findings likely apply to other languages, as\nconsistent patterns were observed in multi-lingual and English LMs.\n","authors":["Andreas Waldis","Joel Birrer","Anne Lauscher","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2409.17929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17928v1","updated":"2024-09-26T15:07:30Z","published":"2024-09-26T15:07:30Z","title":"Pioneering Reliable Assessment in Text-to-Image Knowledge Editing:\n Leveraging a Fine-Grained Dataset and an Innovative Criterion","summary":" During pre-training, the Text-to-Image (T2I) diffusion models encode factual\nknowledge into their parameters. These parameterized facts enable realistic\nimage generation, but they may become obsolete over time, thereby\nmisrepresenting the current state of the world. Knowledge editing techniques\naim to update model knowledge in a targeted way. However, facing the dual\nchallenges posed by inadequate editing datasets and unreliable evaluation\ncriterion, the development of T2I knowledge editing encounter difficulties in\neffectively generalizing injected knowledge. In this work, we design a T2I\nknowledge editing framework by comprehensively spanning on three phases: First,\nwe curate a dataset \\textbf{CAKE}, comprising paraphrase and multi-object test,\nto enable more fine-grained assessment on knowledge generalization. Second, we\npropose a novel criterion, \\textbf{adaptive CLIP threshold}, to effectively\nfilter out false successful images under the current criterion and achieve\nreliable editing evaluation. Finally, we introduce \\textbf{MPE}, a simple but\neffective approach for T2I knowledge editing. Instead of tuning parameters, MPE\nprecisely recognizes and edits the outdated part of the conditioning\ntext-prompt to accommodate the up-to-date knowledge. A straightforward\nimplementation of MPE (Based on in-context learning) exhibits better overall\nperformance than previous model editors. We hope these efforts can further\npromote faithful evaluation of T2I knowledge editing methods.\n","authors":["Hengrui Gu","Kaixiong Zhou","Yili Wang","Ruobing Wang","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17928v1.pdf","comment":"EMNLP24 Findings"},{"id":"http://arxiv.org/abs/2409.17912v1","updated":"2024-09-26T14:56:38Z","published":"2024-09-26T14:56:38Z","title":"Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan\n Arabic Dialect","summary":" We introduce Atlas-Chat, the first-ever collection of large language models\nspecifically developed for dialectal Arabic. Focusing on Moroccan Arabic, also\nknown as Darija, we construct our instruction dataset by consolidating existing\nDarija language resources, creating novel datasets both manually and\nsynthetically, and translating English instructions with stringent quality\ncontrol. Atlas-Chat-9B and 2B models, fine-tuned on the dataset, exhibit\nsuperior ability in following Darija instructions and performing standard NLP\ntasks. Notably, our models outperform both state-of-the-art and\nArabic-specialized LLMs like LLaMa, Jais, and AceGPT, e.g., achieving a 13%\nperformance boost over a larger 13B model on DarijaMMLU, in our newly\nintroduced evaluation suite for Darija covering both discriminative and\ngenerative tasks. Furthermore, we perform an experimental analysis of various\nfine-tuning strategies and base model choices to determine optimal\nconfigurations. All our resources are publicly accessible, and we believe our\nwork offers comprehensive design methodologies of instruction-tuning for\nlow-resource language variants, which are often neglected in favor of data-rich\nlanguages by contemporary LLMs.\n","authors":["Guokan Shang","Hadi Abdine","Yousef Khoubrane","Amr Mohamed","Yassine Abbahaddou","Sofiane Ennadir","Imane Momayiz","Xuguang Ren","Eric Moulines","Preslav Nakov","Michalis Vazirgiannis","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2409.17912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18789v2","updated":"2024-09-26T14:48:42Z","published":"2024-07-26T14:52:37Z","title":"Granularity is crucial when applying differential privacy to text: An\n investigation for neural machine translation","summary":" Applying differential privacy (DP) by means of the DP-SGD algorithm to\nprotect individual data points during training is becoming increasingly popular\nin NLP. However, the choice of granularity at which DP is applied is often\nneglected. For example, neural machine translation (NMT) typically operates on\nthe sentence-level granularity. From the perspective of DP, this setup assumes\nthat each sentence belongs to a single person and any two sentences in the\ntraining dataset are independent. This assumption is however violated in many\nreal-world NMT datasets, e.g., those including dialogues. For proper\napplication of DP we thus must shift from sentences to entire documents. In\nthis paper, we investigate NMT at both the sentence and document levels,\nanalyzing the privacy/utility trade-off for both scenarios, and evaluating the\nrisks of not using the appropriate privacy granularity in terms of leaking\npersonally identifiable information (PII). Our findings indicate that the\ndocument-level NMT system is more resistant to membership inference attacks,\nemphasizing the significance of using the appropriate granularity when working\nwith DP.\n","authors":["Doan Nam Long Vu","Timour Igamberdiev","Ivan Habernal"],"pdf_url":"https://arxiv.org/pdf/2407.18789v2.pdf","comment":"Accepted at EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2409.17892v1","updated":"2024-09-26T14:40:45Z","published":"2024-09-26T14:40:45Z","title":"EMMA-500: Enhancing Massively Multilingual Adaptation of Large Language\n Models","summary":" In this work, we introduce EMMA-500, a large-scale multilingual language\nmodel continue-trained on texts across 546 languages designed for enhanced\nmultilingual performance, focusing on improving language coverage for\nlow-resource languages. To facilitate continual pre-training, we compile the\nMaLA corpus, a comprehensive multilingual dataset enriched with curated\ndatasets across diverse domains. Leveraging this corpus, we conduct extensive\ncontinual pre-training of the Llama 2 7B model, resulting in EMMA-500, which\ndemonstrates robust performance across a wide collection of benchmarks,\nincluding a comprehensive set of multilingual tasks and PolyWrite, an\nopen-ended generation benchmark developed in this study. Our results highlight\nthe effectiveness of continual pre-training in expanding large language models'\nlanguage capacity, particularly for underrepresented languages, demonstrating\nsignificant gains in cross-lingual transfer, task generalization, and language\nadaptability.\n","authors":["Shaoxiong Ji","Zihao Li","Indraneil Paul","Jaakko Paavola","Peiqin Lin","Pinzhen Chen","Dayyán O'Brien","Hengyu Luo","Hinrich Schütze","Jörg Tiedemann","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2409.17892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09577v2","updated":"2024-09-26T14:34:53Z","published":"2024-04-15T08:38:43Z","title":"Transformers, Contextualism, and Polysemy","summary":" The transformer architecture, introduced by Vaswani et al. (2017), is at the\nheart of the remarkable recent progress in the development of language models,\nincluding widely-used chatbots such as Chat-GPT and Claude. In this paper, I\nargue that we can extract from the way the transformer architecture works a\ntheory of the relationship between context and meaning. I call this the\ntransformer theory, and I argue that it is novel with regard to two related\nphilosophical debates: the contextualism debate regarding the extent of\ncontext-sensitivity across natural language, and the polysemy debate regarding\nhow polysemy should be captured within an account of word meaning.\n","authors":["Jumbly Grindrod"],"pdf_url":"https://arxiv.org/pdf/2404.09577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17834v1","updated":"2024-09-26T13:36:00Z","published":"2024-09-26T13:36:00Z","title":"PEDRO: Parameter-Efficient Fine-tuning with Prompt DEpenDent\n Representation MOdification","summary":" Due to their substantial sizes, large language models (LLMs) are typically\ndeployed within a single-backbone multi-tenant framework. In this setup, a\nsingle instance of an LLM backbone must cater to multiple users or tasks\nthrough the application of various parameter-efficient fine-tuning (PEFT)\nmodels. Despite the availability of numerous effective PEFT techniques such as\nLoRA, there remains a need for a PEFT approach that achieves both high\nefficiency during inference and competitive performance on downstream tasks. In\nthis research, we introduce a new and straightforward PEFT methodology named\n\\underline{P}rompt D\\underline{E}pen\\underline{D}ent \\underline{R}epresentation\nM\\underline{O}dification (PEDRO). The proposed method involves integrating a\nlightweight vector generator into each Transformer layer, which generates\nvectors contingent upon the input prompts. These vectors then modify the hidden\nrepresentations created by the LLM through a dot product operation, thereby\ninfluencing the semantic output and generated content of the model. Extensive\nexperimentation across a variety of tasks indicates that: (a) PEDRO surpasses\nrecent PEFT benchmarks when using a similar number of tunable parameters. (b)\nUnder the single-backbone multi-tenant deployment model, PEDRO exhibits\nsuperior efficiency compared to LoRA, indicating significant industrial\npotential.\n","authors":["Tianfang Xie","Tianjing Li","Wei Zhu","Wei Han","Yi Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.17834v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.18203"},{"id":"http://arxiv.org/abs/2409.17827v1","updated":"2024-09-26T13:26:46Z","published":"2024-09-26T13:26:46Z","title":"BeanCounter: A low-toxicity, large-scale, and open dataset of\n business-oriented text","summary":" Many of the recent breakthroughs in language modeling have resulted from\nscaling effectively the same model architecture to larger datasets. In this\nvein, recent work has highlighted performance gains from increasing training\ndataset size and quality, suggesting a need for novel sources of large-scale\ndatasets. In this work, we introduce BeanCounter, a public dataset consisting\nof more than 159B tokens extracted from businesses' disclosures. We show that\nthis data is indeed novel: less than 0.1% of BeanCounter appears in Common\nCrawl-based datasets and it is an order of magnitude larger than datasets\nrelying on similar sources. Given the data's provenance, we hypothesize that\nBeanCounter is comparatively more factual and less toxic than web-based\ndatasets. Exploring this hypothesis, we find that many demographic identities\noccur with similar prevalence in BeanCounter but with significantly less toxic\ncontext relative to other datasets. To demonstrate the utility of BeanCounter,\nwe evaluate and compare two LLMs continually pre-trained on BeanCounter with\ntheir base models. We find an 18-33% reduction in toxic generation and improved\nperformance within the finance domain for the continually pretrained models.\nCollectively, our work suggests that BeanCounter is a novel source of\nlow-toxicity and high-quality domain-specific data with sufficient scale to\ntrain multi-billion parameter LLMs.\n","authors":["Siyan Wang","Bradford Levy"],"pdf_url":"https://arxiv.org/pdf/2409.17827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16934v2","updated":"2024-09-26T13:22:37Z","published":"2024-09-25T13:45:23Z","title":"Investigating OCR-Sensitive Neurons to Improve Entity Recognition in\n Historical Documents","summary":" This paper investigates the presence of OCR-sensitive neurons within the\nTransformer architecture and their influence on named entity recognition (NER)\nperformance on historical documents. By analysing neuron activation patterns in\nresponse to clean and noisy text inputs, we identify and then neutralise\nOCR-sensitive neurons to improve model performance. Based on two open access\nlarge language models (Llama2 and Mistral), experiments demonstrate the\nexistence of OCR-sensitive regions and show improvements in NER performance on\nhistorical newspapers and classical commentaries, highlighting the potential of\ntargeted neuron modulation to improve models' performance on noisy text.\n","authors":["Emanuela Boros","Maud Ehrmann"],"pdf_url":"https://arxiv.org/pdf/2409.16934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17819v1","updated":"2024-09-26T13:15:18Z","published":"2024-09-26T13:15:18Z","title":"Inference-Time Language Model Alignment via Integrated Value Guidance","summary":" Large language models are typically fine-tuned to align with human\npreferences, but tuning large models is computationally intensive and complex.\nIn this work, we introduce $\\textit{Integrated Value Guidance}$ (IVG), a method\nthat uses implicit and explicit value functions to guide language model\ndecoding at token and chunk-level respectively, efficiently aligning large\nlanguage models purely at inference time. This approach circumvents the\ncomplexities of direct fine-tuning and outperforms traditional methods.\nEmpirically, we demonstrate the versatility of IVG across various tasks. In\ncontrolled sentiment generation and summarization tasks, our method\nsignificantly improves the alignment of large models using inference-time\nguidance from $\\texttt{gpt2}$-based value functions. Moreover, in a more\nchallenging instruction-following benchmark AlpacaEval 2.0, we show that both\nspecifically tuned and off-the-shelf value functions greatly improve the\nlength-controlled win rates of large models against $\\texttt{gpt-4-turbo}$\n(e.g., $19.51\\% \\rightarrow 26.51\\%$ for $\\texttt{Mistral-7B-Instruct-v0.2}$\nand $25.58\\% \\rightarrow 33.75\\%$ for $\\texttt{Mixtral-8x7B-Instruct-v0.1}$\nwith Tulu guidance).\n","authors":["Zhixuan Liu","Zhanhui Zhou","Yuanfu Wang","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2409.17819v1.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17791v1","updated":"2024-09-26T12:37:26Z","published":"2024-09-26T12:37:26Z","title":"Self-supervised Preference Optimization: Enhance Your Language Model\n with Preference Degree Awareness","summary":" Recently, there has been significant interest in replacing the reward model\nin Reinforcement Learning with Human Feedback (RLHF) methods for Large Language\nModels (LLMs), such as Direct Preference Optimization (DPO) and its variants.\nThese approaches commonly use a binary cross-entropy mechanism on pairwise\nsamples, i.e., minimizing and maximizing the loss based on preferred or\ndis-preferred responses, respectively. However, while this training strategy\nomits the reward model, it also overlooks the varying preference degrees within\ndifferent responses. We hypothesize that this is a key factor hindering LLMs\nfrom sufficiently understanding human preferences. To address this problem, we\npropose a novel Self-supervised Preference Optimization (SPO) framework, which\nconstructs a self-supervised preference degree loss combined with the alignment\nloss, thereby helping LLMs improve their ability to understand the degree of\npreference. Extensive experiments are conducted on two widely used datasets of\ndifferent tasks. The results demonstrate that SPO can be seamlessly integrated\nwith existing preference optimization methods and significantly boost their\nperformance to achieve state-of-the-art performance. We also conduct detailed\nanalyses to offer comprehensive insights into SPO, which verifies its\neffectiveness. The code is available at https://github.com/lijian16/SPO.\n","authors":["Jian Li","Haojing Huang","Yujia Zhang","Pengfei Xu","Xi Chen","Rui Song","Lida Shi","Jingwen Wang","Hao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17791v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2403.15676v4","updated":"2024-09-26T12:18:21Z","published":"2024-03-23T01:44:57Z","title":"AC4: Algebraic Computation Checker for Circuit Constraints in ZKPs","summary":" Zero-knowledge proof (ZKP) systems have surged attention and held a\nfundamental role in contemporary cryptography. Zero-knowledge succinct\nnon-interactive argument of knowledge (zk-SNARK) protocols dominate the ZKP\nusage, implemented through arithmetic circuit programming paradigm. However,\nunderconstrained or overconstrained circuits may lead to bugs. The former\nrefers to circuits that lack the necessary constraints, resulting in unexpected\nsolutions and causing the verifier to accept a bogus witness, and the latter\nrefers to circuits that are constrained excessively, resulting in lacking\nnecessary solutions and causing the verifier to accept no witness. This paper\nintroduces a novel approach for pinpointing two distinct types of bugs in ZKP\ncircuits. The method involves encoding the arithmetic circuit constraints to\npolynomial equation systems and solving them over finite fields by the computer\nalgebra system. The classification of verification results is refined, greatly\nenhancing the expressive power of the system. A tool, AC4, is proposed to\nrepresent the implementation of the method. Experiments show that AC4\ndemonstrates a increase in the checked ratio, showing a 29% improvement over\nPicus, a checker for Circom circuits, and a 10% improvement over\nhalo2-analyzer, a checker for halo2 circuits. Within a solvable range, the\nchecking time has also exhibited noticeable improvement, demonstrating a\nmagnitude increase compared to previous efforts.\n","authors":["Hao Chen","Guoqiang Li","Minyu Chen","Ruibang Liu","Sinka Gao"],"pdf_url":"https://arxiv.org/pdf/2403.15676v4.pdf","comment":"24 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17774v1","updated":"2024-09-26T12:11:28Z","published":"2024-09-26T12:11:28Z","title":"Faithfulness and the Notion of Adversarial Sensitivity in NLP\n Explanations","summary":" Faithfulness is arguably the most critical metric to assess the reliability\nof explainable AI. In NLP, current methods for faithfulness evaluation are\nfraught with discrepancies and biases, often failing to capture the true\nreasoning of models. We introduce Adversarial Sensitivity as a novel approach\nto faithfulness evaluation, focusing on the explainer's response when the model\nis under adversarial attack. Our method accounts for the faithfulness of\nexplainers by capturing sensitivity to adversarial input changes. This work\naddresses significant limitations in existing evaluation techniques, and\nfurthermore, quantifies faithfulness from a crucial yet underexplored paradigm.\n","authors":["Supriya Manna","Niladri Sett"],"pdf_url":"https://arxiv.org/pdf/2409.17774v1.pdf","comment":"Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP"},{"id":"http://arxiv.org/abs/2409.13832v2","updated":"2024-09-26T12:07:20Z","published":"2024-09-20T18:18:14Z","title":"GTSinger: A Global Multi-Technique Singing Corpus with Realistic Music\n Scores for All Singing Tasks","summary":" The scarcity of high-quality and multi-task singing datasets significantly\nhinders the development of diverse controllable and personalized singing tasks,\nas existing singing datasets suffer from low quality, limited diversity of\nlanguages and singers, absence of multi-technique information and realistic\nmusic scores, and poor task suitability. To tackle these problems, we present\nGTSinger, a large global, multi-technique, free-to-use, high-quality singing\ncorpus with realistic music scores, designed for all singing tasks, along with\nits benchmarks. Particularly, (1) we collect 80.59 hours of high-quality\nsinging voices, forming the largest recorded singing dataset; (2) 20\nprofessional singers across nine widely spoken languages offer diverse timbres\nand styles; (3) we provide controlled comparison and phoneme-level annotations\nof six commonly used singing techniques, helping technique modeling and\ncontrol; (4) GTSinger offers realistic music scores, assisting real-world\nmusical composition; (5) singing voices are accompanied by manual\nphoneme-to-audio alignments, global style labels, and 16.16 hours of paired\nspeech for various singing tasks. Moreover, to facilitate the use of GTSinger,\nwe conduct four benchmark experiments: technique-controllable singing voice\nsynthesis, technique recognition, style transfer, and speech-to-singing\nconversion. The corpus and demos can be found at http://gtsinger.github.io. We\nprovide the dataset and the code for processing data and conducting benchmarks\nat https://huggingface.co/datasets/GTSinger/GTSinger and\nhttps://github.com/GTSinger/GTSinger.\n","authors":["Yu Zhang","Changhao Pan","Wenxiang Guo","Ruiqi Li","Zhiyuan Zhu","Jialei Wang","Wenhao Xu","Jingyu Lu","Zhiqing Hong","Chuxin Wang","LiChao Zhang","Jinzheng He","Ziyue Jiang","Yuxin Chen","Chen Yang","Jiecheng Zhou","Xinyu Cheng","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.13832v2.pdf","comment":"Accepted by NeurIPS 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2409.17757v1","updated":"2024-09-26T11:46:58Z","published":"2024-09-26T11:46:58Z","title":"Integrating Hierarchical Semantic into Iterative Generation Model for\n Entailment Tree Explanation","summary":" Manifestly and logically displaying the line of reasoning from evidence to\nanswer is significant to explainable question answering (QA). The entailment\ntree exhibits the lines structurally, which is different from the\nself-explanation principle in large-scale language models. Existing methods\nrarely consider the semantic association of sentences between and within\nhierarchies within the tree structure, which is prone to apparent mistakes in\ncombinations. In this work, we propose an architecture of integrating the\nHierarchical Semantics of sentences under the framework of Controller-Generator\n(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between\nhypotheses and facts, discriminates the facts involved in tree constructions,\nand optimizes single-step entailments. To the best of our knowledge, We are the\nfirst to notice hierarchical semantics of sentences between the same layer and\nadjacent layers to yield improvements. The proposed method achieves comparable\nperformance on all three settings of the EntailmentBank dataset. The\ngeneralization results on two out-of-domain datasets also demonstrate the\neffectiveness of our method.\n","authors":["Qin Wang","Jianzhou Feng","Yiming Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04259v2","updated":"2024-09-26T11:42:35Z","published":"2024-08-08T06:57:49Z","title":"EfficientRAG: Efficient Retriever for Multi-Hop Question Answering","summary":" Retrieval-augmented generation (RAG) methods encounter difficulties when\naddressing complex questions like multi-hop queries. While iterative retrieval\nmethods improve performance by gathering additional information, current\napproaches often rely on multiple calls of large language models (LLMs). In\nthis paper, we introduce EfficientRAG, an efficient retriever for multi-hop\nquestion answering. EfficientRAG iteratively generates new queries without the\nneed for LLM calls at each iteration and filters out irrelevant information.\nExperimental results demonstrate that EfficientRAG surpasses existing RAG\nmethods on three open-domain multi-hop question-answering datasets.\n","authors":["Ziyuan Zhuang","Zhiyang Zhang","Sitao Cheng","Fangkai Yang","Jia Liu","Shujian Huang","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.04259v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17755v1","updated":"2024-09-26T11:40:07Z","published":"2024-09-26T11:40:07Z","title":"SECURE: Semantics-aware Embodied Conversation under Unawareness for\n Lifelong Robot Learning","summary":" This paper addresses a challenging interactive task learning scenario we call\nrearrangement under unawareness: to manipulate a rigid-body environment in a\ncontext where the robot is unaware of a concept that's key to solving the\ninstructed task. We propose SECURE, an interactive task learning framework\ndesigned to solve such problems by fixing a deficient domain model using\nembodied conversation. Through dialogue, the robot discovers and then learns to\nexploit unforeseen possibilities. Using SECURE, the robot not only learns from\nthe user's corrective feedback when it makes a mistake, but it also learns to\nmake strategic dialogue decisions for revealing useful evidence about novel\nconcepts for solving the instructed task. Together, these abilities allow the\nrobot to generalise to subsequent tasks using newly acquired knowledge. We\ndemonstrate that a robot that is semantics-aware -- that is, it exploits the\nlogical consequences of both sentence and discourse semantics in the learning\nand inference process -- learns to solve rearrangement under unawareness more\neffectively than a robot that lacks such capabilities.\n","authors":["Rimvydas Rubavicius","Peter David Fagan","Alex Lascarides","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.17755v1.pdf","comment":"10 pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.17750v1","updated":"2024-09-26T11:31:18Z","published":"2024-09-26T11:31:18Z","title":"Are Transformers in Pre-trained LM A Good ASR Encoder? An Empirical\n Study","summary":" In this study, we delve into the efficacy of transformers within pre-trained\nlanguage models (PLMs) when repurposed as encoders for Automatic Speech\nRecognition (ASR). Our underlying hypothesis posits that, despite being\ninitially trained on text-based corpora, these transformers possess a\nremarkable capacity to extract effective features from the input sequence. This\ninherent capability, we argue, is transferrable to speech data, thereby\naugmenting the acoustic modeling ability of ASR. Through rigorous empirical\nanalysis, our findings reveal a notable improvement in Character Error Rate\n(CER) and Word Error Rate (WER) across diverse ASR tasks when transformers from\npre-trained LMs are incorporated. Particularly, they serve as an advantageous\nstarting point for initializing ASR encoders. Furthermore, we uncover that\nthese transformers, when integrated into a well-established ASR encoder, can\nsignificantly boost performance, especially in scenarios where profound\nsemantic comprehension is pivotal. This underscores the potential of leveraging\nthe semantic prowess embedded within pre-trained transformers to advance ASR\nsystems' capabilities.\n","authors":["Keyu An","Shiliang Zhang","Zhijie Yan"],"pdf_url":"https://arxiv.org/pdf/2409.17750v1.pdf","comment":"8pages"},{"id":"http://arxiv.org/abs/2402.12844v2","updated":"2024-09-26T11:29:04Z","published":"2024-02-20T09:13:15Z","title":"ICON: Improving Inter-Report Consistency in Radiology Report Generation\n via Lesion-aware Mixup Augmentation","summary":" Previous research on radiology report generation has made significant\nprogress in terms of increasing the clinical accuracy of generated reports. In\nthis paper, we emphasize another crucial quality that it should possess, i.e.,\ninter-report consistency, which refers to the capability of generating\nconsistent reports for semantically equivalent radiographs. This quality is\neven of greater significance than the overall report accuracy in terms of\nensuring the system's credibility, as a system prone to providing conflicting\nresults would severely erode users' trust. Regrettably, existing approaches\nstruggle to maintain inter-report consistency, exhibiting biases towards common\npatterns and susceptibility to lesion variants. To address this issue, we\npropose ICON, which improves the inter-report consistency of radiology report\ngeneration. Aiming to enhance the system's ability to capture similarities in\nsemantically equivalent lesions, our approach first involves extracting lesions\nfrom input images and examining their characteristics. Then, we introduce a\nlesion-aware mixup technique to ensure that the representations of the\nsemantically equivalent lesions align with the same attributes, achieved\nthrough a linear combination during the training phase. Extensive experiments\non three publicly available chest X-ray datasets verify the effectiveness of\nour approach, both in terms of improving the consistency and accuracy of the\ngenerated reports.\n","authors":["Wenjun Hou","Yi Cheng","Kaishuai Xu","Yan Hu","Wenjie Li","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2402.10712v3","updated":"2024-09-26T11:15:14Z","published":"2024-02-16T14:15:15Z","title":"An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient\n Language Model Inference","summary":" The development of state-of-the-art generative large language models (LLMs)\ndisproportionately relies on English-centric tokenizers, vocabulary and\npre-training data. Despite the fact that some LLMs have multilingual\ncapabilities, recent studies have shown that their inference efficiency\ndeteriorates when generating text in languages other than English. This results\nin increased inference time and costs. Cross-lingual vocabulary adaptation\n(CVA) methods have been proposed for adapting models to a target language\naiming to improve downstream performance. However, the effectiveness of these\nmethods on increasing inference efficiency of generative LLMs has yet to be\nexplored. In this paper, we perform an empirical study of five CVA methods on\nfour generative LLMs (including monolingual and multilingual models) across\nfour typologically-diverse languages and four natural language understanding\ntasks. We find that CVA substantially contributes to LLM inference speedups of\nup to 271.5\\%. We also show that adapting LLMs that have been pre-trained on\nmore balanced multilingual data results in downstream performance comparable to\nthe original models.\n","authors":["Atsuki Yamaguchi","Aline Villavicencio","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2402.10712v3.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2406.12442v2","updated":"2024-09-26T11:15:14Z","published":"2024-06-18T09:46:44Z","title":"Abstraction-of-Thought Makes Language Models Better Reasoners","summary":" Abstract reasoning, the ability to reason from the abstract essence of a\nproblem, serves as a key to generalization in human reasoning. However,\neliciting language models to perform reasoning with abstraction remains\nunexplored. This paper seeks to bridge this gap by introducing a novel\nstructured reasoning format called Abstraction-of-Thought (AoT). The uniqueness\nof AoT lies in its explicit requirement for varying levels of abstraction\nwithin the reasoning process. This approach could elicit language models to\nfirst contemplate on the abstract level before incorporating concrete details,\nwhich is overlooked by the prevailing step-by-step Chain-of-Thought (CoT)\nmethod. To align models with the AoT format, we present AoT Collection, a\ngeneric finetuning dataset consisting of 348k high-quality samples with AoT\nreasoning processes, collected via an automated and scalable pipeline. We\nfinetune a wide range of language models with AoT Collection and conduct\nextensive evaluations on 23 unseen tasks from the challenging benchmark\nBig-Bench Hard. Experimental results indicate that models aligned to AoT\nreasoning format substantially outperform those aligned to CoT in many\nreasoning tasks.\n","authors":["Ruixin Hong","Hongming Zhang","Xiaoman Pan","Dong Yu","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12442v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2405.14722v2","updated":"2024-09-26T10:23:33Z","published":"2024-05-23T15:51:24Z","title":"DAPE: Data-Adaptive Positional Encoding for Length Extrapolation","summary":" Positional encoding plays a crucial role in transformers, significantly\nimpacting model performance and length generalization. Prior research has\nintroduced absolute positional encoding (APE) and relative positional encoding\n(RPE) to distinguish token positions in given sequences. However, both APE and\nRPE remain fixed after model training regardless of input data, limiting their\nadaptability and flexibility. Hence, we expect that the desired positional\nencoding should be data-adaptive and can be dynamically adjusted with the given\nattention. In this paper, we propose a Data-Adaptive Positional Encoding (DAPE)\nmethod, which dynamically and semantically adjusts based on input context and\nlearned fixed priors. Experimental validation on real-world datasets (Arxiv,\nBooks3, and CHE) demonstrates that DAPE enhances model performances in terms of\ntrained length and length generalization, where the improvements are\nstatistically significant. The model visualization suggests that our model can\nkeep both local and anti-local information. Finally, we successfully train the\nmodel on sequence length 128 and achieve better performance at evaluation\nsequence length 8192, compared with other static positional encoding methods,\nrevealing the benefit of the adaptive positional encoding method.\n","authors":["Chuanyang Zheng","Yihang Gao","Han Shi","Minbin Huang","Jingyao Li","Jing Xiong","Xiaozhe Ren","Michael Ng","Xin Jiang","Zhenguo Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2405.14722v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01432v3","updated":"2024-09-26T10:03:41Z","published":"2024-03-03T08:07:55Z","title":"Fine Tuning vs. Retrieval Augmented Generation for Less Popular\n Knowledge","summary":" Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting\nstrong performance across diverse tasks and domains. However, it has been\nobserved that the performance diminishes when dealing with less-popular or\nlow-frequency concepts and entities, for example in domain specific\napplications. The two prominent approaches to enhance the performance of LMs on\nlow-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning\n(FT) over synthetic data. This paper explores and evaluates the impact of RAG\nand FT on customizing LMs in handling low-frequency entities on question\nanswering tasks. We conduct extensive experiments on twelve LMs of varying size\nand type and different fine tuning, data augmentation, and retrieval models.\nOur findings indicate that while FT boosts the performance across entities of\nvarying popularity, RAG surpasses FT by a large margin particularly for least\npopular factual knowledge. Additionally, the success of both RAG and FT\napproaches is amplified by improving retrieval and data augmentation\ntechniques. Fine tuning, while beneficial for small LMs, requires extensive\nresources. To address this issue, we propose the new Stimulus RAG approach that\nsurpasses the effectiveness of fine tuning based approaches, thereby\neliminating the need for the costly data augmentation and fine tuning step for\nenriching LMs with less popular factual knowledge.\n","authors":["Heydar Soudani","Evangelos Kanoulas","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2403.01432v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14374v2","updated":"2024-09-26T10:01:17Z","published":"2024-09-22T09:33:54Z","title":"J2N -- Nominal Adjective Identification and its Application","summary":" This paper explores the challenges posed by nominal adjectives (NAs) in\nnatural language processing (NLP) tasks, particularly in part-of-speech (POS)\ntagging. We propose treating NAs as a distinct POS tag, \"JN,\" and investigate\nits impact on POS tagging, BIO chunking, and coreference resolution. Our study\nshows that reclassifying NAs can improve the accuracy of syntactic analysis and\nstructural understanding in NLP. We present experimental results using Hidden\nMarkov Models (HMMs), Maximum Entropy (MaxEnt) models, and Spacy, demonstrating\nthe feasibility and potential benefits of this approach. Additionally we\ntrained a bert model to identify the NA in untagged text.\n","authors":["Lemeng Qi","Yang Han","Zhuotong Xie"],"pdf_url":"https://arxiv.org/pdf/2409.14374v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2404.00459v2","updated":"2024-09-26T09:54:57Z","published":"2024-03-30T19:46:59Z","title":"NumeroLogic: Number Encoding for Enhanced LLMs' Numerical Reasoning","summary":" Language models struggle with handling numerical data and performing\narithmetic operations. We hypothesize that this limitation can be partially\nattributed to non-intuitive textual numbers representation. When a digit is\nread or generated by a causal language model it does not know its place value\n(e.g. thousands vs. hundreds) until the entire number is processed. To address\nthis issue, we propose a simple adjustment to how numbers are represented by\nincluding the count of digits before each number. For instance, instead of\n\"42\", we suggest using \"{2:42}\" as the new format. This approach, which we term\nNumeroLogic, offers an added advantage in number generation by serving as a\nChain of Thought (CoT). By requiring the model to consider the number of digits\nfirst, it enhances the reasoning process before generating the actual number.\nWe use arithmetic tasks to demonstrate the effectiveness of the NumeroLogic\nformatting. We further demonstrate NumeroLogic applicability to general natural\nlanguage modeling, improving language understanding performance in the MMLU\nbenchmark.\n","authors":["Eli Schwartz","Leshem Choshen","Joseph Shtok","Sivan Doveh","Leonid Karlinsky","Assaf Arbelle"],"pdf_url":"https://arxiv.org/pdf/2404.00459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06802v2","updated":"2024-09-26T09:52:20Z","published":"2024-05-10T20:29:25Z","title":"Leveraging summary of radiology reports with transformers","summary":" Two fundamental problems in health-care stem from patient handoff and triage.\nDoctors are often required to perform complex findings summarization to\nfacilitate efficient communication with specialists and decision making on the\nurgency of each case. To address these challenges, we present a state of the\nart radiology report summarization model utilizing adjusted bidirectional\nencoder representation from transformers BERTtoBERT encoder and decoder\narchitecture. We also provide a data processing pipeline for future models\ndeveloped on the the MIMIC CXR dataset. Our approach includes a novel method\nfor augmenting medical data and a comprehensive performance analysis. Our best\nperforming model achieved a recall oriented understudy for gisting evaluation L\nF1 score of 58.75/100, outperforming specialized checkpoints with more\nsophisticated attention mechanisms. We also provide a data processing pipeline\nfor future models developed on the MIMIC chest X-ray dataset. The model\nintroduced in this paper demonstrates significantly improved capacity in\nradiology report summarization, highlighting the potential for ensuring better\nclinical workflows and enhanced patient care.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17683v1","updated":"2024-09-26T09:49:27Z","published":"2024-09-26T09:49:27Z","title":"Zero- and Few-shot Named Entity Recognition and Text Expansion in\n Medication Prescriptions using ChatGPT","summary":" Introduction: Medication prescriptions are often in free text and include a\nmix of two languages, local brand names, and a wide range of idiosyncratic\nformats and abbreviations. Large language models (LLMs) have shown promising\nability to generate text in response to input prompts. We use ChatGPT 3.5 to\nautomatically structure and expand medication statements in discharge summaries\nand thus make them easier to interpret for people and machines. Methods:\nNamed-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and\nfew-shot setting with different prompt strategies. 100 medication statements\nwere manually annotated and curated. NER performance was measured by using\nstrict and partial matching. For the task EX, two experts interpreted the\nresults by assessing semantic equivalence between original and expanded\nstatements. The model performance was measured by precision, recall, and F1\nscore. Results: For NER, the best-performing prompt reached an average F1 score\nof 0.94 in the test set. For EX, the few-shot prompt showed superior\nperformance among other prompts, with an average F1 score of 0.87. Conclusion:\nOur study demonstrates good performance for NER and EX tasks in free-text\nmedication statements using ChatGPT. Compared to a zero-shot baseline, a\nfew-shot approach prevented the system from hallucinating, which would be\nunacceptable when processing safety-relevant medication data.\n","authors":["Natthanaphop Isaradech","Andrea Riedel","Wachiranun Sirikul","Markus Kreuzthaler","Stefan Schulz"],"pdf_url":"https://arxiv.org/pdf/2409.17683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13167v2","updated":"2024-09-26T09:42:48Z","published":"2024-06-19T02:46:18Z","title":"QRMeM: Unleash the Length Limitation through Question then Reflection\n Memory Mechanism","summary":" While large language models (LLMs) have made notable advancements in natural\nlanguage processing, they continue to struggle with processing extensive text.\nMemory mechanism offers a flexible solution for managing long contexts,\nutilizing techniques such as compression, summarization, and structuring to\nfacilitate nuanced and efficient handling of large volumes of text. However,\nexisting techniques face challenges with static knowledge integration, leading\nto insufficient adaptation to task-specific needs and missing\nmulti-segmentation relationships, which hinders the dynamic reorganization and\nlogical combination of relevant segments during the response process. To\naddress these issues, we introduce a novel strategy, Question then Reflection\nMemory Mechanism (QRMeM), incorporating a dual-structured memory pool. This\npool synergizes static textual content with structured graph guidance,\nfostering a reflective trial-and-error approach for navigating and identifying\nrelevant segments. Our evaluation across multiple-choice questions (MCQ) and\nmulti-document question answering (Multi-doc QA) benchmarks showcases QRMeM\nenhanced performance compared to existing approaches.\n","authors":["Bo Wang","Heyan Huang","Yixin Cao","Jiahao Ying","Wei Tang","Chong Feng"],"pdf_url":"https://arxiv.org/pdf/2406.13167v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17673v1","updated":"2024-09-26T09:32:12Z","published":"2024-09-26T09:32:12Z","title":"Cross-lingual Human-Preference Alignment for Neural Machine Translation\n with Direct Quality Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) and derivative techniques\nlike Direct Preference Optimization (DPO) are task-alignment algorithms used to\nrepurpose general, foundational models for specific tasks. We show that\napplying task-alignment to neural machine translation (NMT) addresses an\nexisting task--data mismatch in NMT, leading to improvements across all\nlanguages of a multilingual model, even when task-alignment is only applied to\na subset of those languages. We do so by introducing Direct Quality\nOptimization (DQO), a variant of DPO leveraging a pre-trained translation\nquality estimation model as a proxy for human preferences, and verify the\nimprovements with both automatic metrics and human evaluation.\n","authors":["Kaden Uhlig","Joern Wuebker","Raphael Reinauer","John DeNero"],"pdf_url":"https://arxiv.org/pdf/2409.17673v1.pdf","comment":"17 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.09486v2","updated":"2024-09-26T09:31:48Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Benchmarking Multimodal Large Language Models for Code\n Generation with Visually Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/likaixin2000/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Zhiyong Huang","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2407.16693v2","updated":"2024-09-26T09:27:30Z","published":"2024-07-23T17:56:32Z","title":"Explanation Regularisation through the Lens of Attributions","summary":" Explanation regularisation (ER) has been introduced as a way to guide text\nclassifiers to form their predictions relying on input tokens that humans\nconsider plausible. This is achieved by introducing an auxiliary explanation\nloss that measures how well the output of an input attribution technique for\nthe model agrees with human-annotated rationales. The guidance appears to\nbenefit performance in out-of-domain (OOD) settings, presumably due to an\nincreased reliance on \"plausible\" tokens. However, previous work has\nunder-explored the impact of guidance on that reliance, particularly when\nreliance is measured using attribution techniques different from those used to\nguide the model. In this work, we seek to close this gap, and also explore the\nrelationship between reliance on plausible features and OOD performance. We\nfind that the connection between ER and the ability of a classifier to rely on\nplausible features has been overstated and that a stronger reliance on\nplausible tokens does not seem to be the cause for OOD improvements.\n","authors":["Pedro Ferreira","Ivan Titov","Wilker Aziz"],"pdf_url":"https://arxiv.org/pdf/2407.16693v2.pdf","comment":"22 pages, 14 figures, 9 tables"},{"id":"http://arxiv.org/abs/2404.12753v2","updated":"2024-09-26T09:17:10Z","published":"2024-04-19T09:59:44Z","title":"AutoScraper: A Progressive Understanding Web Agent for Web Scraper\n Generation","summary":" Web scraping is a powerful technique that extracts data from websites,\nenabling automated data collection, enhancing data analysis capabilities, and\nminimizing manual data entry efforts. Existing methods, wrappers-based methods\nsuffer from limited adaptability and scalability when faced with a new website,\nwhile language agents, empowered by large language models (LLMs), exhibit poor\nreusability in diverse web environments. In this work, we introduce the\nparadigm of generating web scrapers with LLMs and propose AutoScraper, a\ntwo-stage framework that can handle diverse and changing web environments more\nefficiently. AutoScraper leverages the hierarchical structure of HTML and\nsimilarity across different web pages for generating web scrapers. Besides, we\npropose a new executability metric for better measuring the performance of web\nscraper generation tasks. We conduct comprehensive experiments with multiple\nLLMs and demonstrate the effectiveness of our framework. Resources of this\npaper can be found at \\url{https://github.com/EZ-hwh/AutoScraper}\n","authors":["Wenhao Huang","Zhouhong Gu","Chenghao Peng","Zhixu Li","Jiaqing Liang","Yanghua Xiao","Liqian Wen","Zulong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12753v2.pdf","comment":"19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17650v1","updated":"2024-09-26T08:56:54Z","published":"2024-09-26T08:56:54Z","title":"Digital Twin Ecosystem for Oncology Clinical Operations","summary":" Artificial Intelligence (AI) and Large Language Models (LLMs) hold\nsignificant promise in revolutionizing healthcare, especially in clinical\napplications. Simultaneously, Digital Twin technology, which models and\nsimulates complex systems, has gained traction in enhancing patient care.\nHowever, despite the advances in experimental clinical settings, the potential\nof AI and digital twins to streamline clinical operations remains largely\nuntapped. This paper introduces a novel digital twin framework specifically\ndesigned to enhance oncology clinical operations. We propose the integration of\nmultiple specialized digital twins, such as the Medical Necessity Twin, Care\nNavigator Twin, and Clinical History Twin, to enhance workflow efficiency and\npersonalize care for each patient based on their unique data. Furthermore, by\nsynthesizing multiple data sources and aligning them with the National\nComprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care\nPath, a continuously evolving knowledge base that enables these digital twins\nto provide precise, tailored clinical recommendations.\n","authors":["Himanshu Pandey","Akhil Amod"," Shivang","Kshitij Jaggi","Ruchi Garg","Abheet Jain","Vinayak Tantia"],"pdf_url":"https://arxiv.org/pdf/2409.17650v1.pdf","comment":"Pre Print"},{"id":"http://arxiv.org/abs/2409.17648v1","updated":"2024-09-26T08:55:21Z","published":"2024-09-26T08:55:21Z","title":"Efficient In-Domain Question Answering for Resource-Constrained\n Environments","summary":" Retrieval Augmented Generation (RAG) is a common method for integrating\nexternal knowledge into pretrained Large Language Models (LLMs) to enhance\naccuracy and relevancy in question answering (QA) tasks. However, prompt\nengineering and resource efficiency remain significant bottlenecks in\ndeveloping optimal and robust RAG solutions for real-world QA applications.\nRecent studies have shown success in using fine tuning to address these\nproblems; in particular, Retrieval Augmented Fine Tuning (RAFT) applied to\nsmaller 7B models has demonstrated superior performance compared to RAG setups\nwith much larger models such as GPT-3.5. The combination of RAFT with\nparameter-efficient fine tuning (PEFT) techniques, such as Low-Rank Adaptation\n(LoRA), promises an even more efficient solution, yet remains an unexplored\narea. In this work, we combine RAFT with LoRA to reduce fine tuning and storage\nrequirements and gain faster inference times while maintaining comparable RAG\nperformance. This results in a more compute-efficient RAFT, or CRAFT, which is\nparticularly useful for knowledge-intensive QA tasks in resource-constrained\nenvironments where internet access may be restricted and hardware resources\nlimited.\n","authors":["Isaac Chung","Phat Vo","Arman Kizilkale","Aaron Reite"],"pdf_url":"https://arxiv.org/pdf/2409.17648v1.pdf","comment":"6 pages, 2 tables"},{"id":"http://arxiv.org/abs/2405.16908v2","updated":"2024-09-26T08:53:01Z","published":"2024-05-27T07:56:23Z","title":"Can Large Language Models Faithfully Express Their Intrinsic Uncertainty\n in Words?","summary":" We posit that large language models (LLMs) should be capable of expressing\ntheir intrinsic uncertainty in natural language. For example, if the LLM is\nequally likely to output two contradicting answers to the same question, then\nits generated response should reflect this uncertainty by hedging its answer\n(e.g., \"I'm not sure, but I think...\"). We formalize faithful response\nuncertainty based on the gap between the model's intrinsic confidence in the\nassertions it makes and the decisiveness by which they are conveyed. This\nexample-level metric reliably indicates whether the model reflects its\nuncertainty, as it penalizes both excessive and insufficient hedging. We\nevaluate a variety of aligned LLMs at faithfully communicating uncertainty on\nseveral knowledge-intensive question answering tasks. Our results provide\nstrong evidence that modern LLMs are poor at faithfully conveying their\nuncertainty, and that better alignment is necessary to improve their\ntrustworthiness.\n","authors":["Gal Yona","Roee Aharoni","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2405.16908v2.pdf","comment":"To appear in EMNLP 2024 (main conference)"},{"id":"http://arxiv.org/abs/2408.10902v2","updated":"2024-09-26T08:47:36Z","published":"2024-08-20T14:45:23Z","title":"Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs","summary":" Although human evaluation remains the gold standard for open-domain dialogue\nevaluation, the growing popularity of automated evaluation using Large Language\nModels (LLMs) has also extended to dialogue. However, most frameworks leverage\nbenchmarks that assess older chatbots on aspects such as fluency and relevance,\nwhich are not reflective of the challenges associated with contemporary models.\nIn fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset,\nsuggests that current chatbots may exhibit several recurring issues related to\ncoherence and commonsense knowledge, but generally produce highly fluent and\nrelevant responses.\n Noting the aforementioned limitations, this paper introduces Soda-Eval, an\nannotated dataset based on Soda that covers over 120K turn-level assessments\nacross 10K dialogues, where the annotations were generated by GPT-4. Using\nSoda-Eval as a benchmark, we then study the performance of several open-access\ninstruction-tuned LLMs, finding that dialogue evaluation remains challenging.\nFine-tuning these models improves performance over few-shot inferences, both in\nterms of correlation and explanation.\n","authors":["John Mendonça","Isabel Trancoso","Alon Lavie"],"pdf_url":"https://arxiv.org/pdf/2408.10902v2.pdf","comment":"Accepted to EMNLP2024 (findings)"},{"id":"http://arxiv.org/abs/2409.17640v1","updated":"2024-09-26T08:44:38Z","published":"2024-09-26T08:44:38Z","title":"T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training\n on an Assistant Task for a Target Task","summary":" Long text summarization, gradually being essential for efficiently processing\nlarge volumes of information, stays challenging for Large Language Models\n(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced\ntraining datasets and the high requirement of contextual details dealing. To\naddress the issue, we design a novel zero-shot transfer learning framework,\nabbreviated as T3, to iteratively training a baseline LLM on an assistant task\nfor the target task, where the former should own richer data resources and\nshare structural or semantic similarity with the latter. In practice, T3 is\napproached to deal with the long text summarization task by utilizing question\nanswering as the assistant task, and further validated its effectiveness on the\nBBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14%\nimprovement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore\ncompared to three baseline LLMs, demonstrating its potential for more\nassistant-target task combinations.\n","authors":["Xindi Tong","Yujin Zhu","Shijian Fan","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09802v2","updated":"2024-09-26T08:15:50Z","published":"2023-11-16T11:26:21Z","title":"Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs","summary":" Two lines of approaches are adopted for complex reasoning with LLMs. One line\nof work prompts LLMs with various reasoning structures, while the structural\noutputs can be naturally regarded as intermediate reasoning steps. Another line\nof work adopt LLM-free declarative solvers to do the reasoning task, rendering\nhigher reasoning accuracy but lacking interpretability due to the black-box\nnature of the solvers. Aiming to resolve the trade-off between answer accuracy\nand interpretability, we present a simple extension to the latter line of work.\nSpecifically, we showcase that the intermediate search logs generated by Prolog\ninterpreters can be accessed and interpreted into human-readable reasoning\nproofs. As long as LLMs correctly translate problem descriptions into Prolog\nrepresentations, the corresponding reasoning proofs are ensured to be causal\nand reliable. On two logical reasoning and one arithmetic reasoning datasets,\nour framework obtains significant improvements in terms of both answer accuracy\nand reasoning proof accuracy. Our code is released at\nhttps://github.com/DAMO-NLP-SG/CaRing\n","authors":["Sen Yang","Xin Li","Leyang Cui","Lidong Bing","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2311.09802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17610v1","updated":"2024-09-26T07:55:57Z","published":"2024-09-26T07:55:57Z","title":"ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context\n Information in Multi-Turn Multimodal Medical Dialogue","summary":" The rocketing prosperity of large language models (LLMs) in recent years has\nboosted the prevalence of vision-language models (VLMs) in the medical sector.\nIn our online medical consultation scenario, a doctor responds to the texts and\nimages provided by a patient in multiple rounds to diagnose her/his health\ncondition, forming a multi-turn multimodal medical dialogue format. Unlike\nhigh-quality images captured by professional equipment in traditional medical\nvisual question answering (Med-VQA), the images in our case are taken by\npatients' mobile phones. These images have poor quality control, with issues\nsuch as excessive background elements and the lesion area being significantly\noff-center, leading to degradation of vision-language alignment in the model\ntraining phase. In this paper, we propose ZALM3, a Zero-shot strategy to\nimprove vision-language ALignment in Multi-turn Multimodal Medical dialogue.\nSince we observe that the preceding text conversations before an image can\ninfer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to\nsummarize the keywords from the preceding context and a visual grounding model\nto extract the RoIs. The updated images eliminate unnecessary background noise\nand provide more effective vision-language alignment. To better evaluate our\nproposed method, we design a new subjective assessment metric for multi-turn\nunimodal/multimodal medical dialogue to provide a fine-grained performance\ncomparison. Our experiments across three different clinical departments\nremarkably demonstrate the efficacy of ZALM3 with statistical significance.\n","authors":["Zhangpu Li","Changhong Zou","Suxue Ma","Zhicheng Yang","Chen Du","Youbao Tang","Zhenjie Cao","Ning Zhang","Jui-Hsin Lai","Ruei-Sung Lin","Yuan Ni","Xingzhi Sun","Jing Xiao","Kai Zhang","Mei Han"],"pdf_url":"https://arxiv.org/pdf/2409.17610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16341v2","updated":"2024-09-26T07:54:10Z","published":"2024-09-24T17:20:02Z","title":"Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs","summary":" Training large language models (LLMs) for external tool usage is a rapidly\nexpanding field, with recent research focusing on generating synthetic data to\naddress the shortage of available data. However, the absence of systematic data\nquality checks poses complications for properly training and testing models. To\nthat end, we propose two approaches for assessing the reliability of data for\ntraining LLMs to use external tools. The first approach uses intuitive,\nhuman-defined correctness criteria. The second approach uses a model-driven\nassessment with in-context evaluation. We conduct a thorough evaluation of data\nquality on two popular benchmarks, followed by an extrinsic evaluation that\nshowcases the impact of data quality on model performance. Our results\ndemonstrate that models trained on high-quality data outperform those trained\non unvalidated data, even when trained with a smaller quantity of data. These\nfindings empirically support the significance of assessing and ensuring the\nreliability of training data for tool-using LLMs.\n","authors":["Shadi Iskander","Nachshon Cohen","Zohar Karnin","Ori Shapira","Sofia Tolmach"],"pdf_url":"https://arxiv.org/pdf/2409.16341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17603v1","updated":"2024-09-26T07:40:03Z","published":"2024-09-26T07:40:03Z","title":"Deep CLAS: Deep Contextual Listen, Attend and Spell","summary":" Contextual-LAS (CLAS) has been shown effective in improving Automatic Speech\nRecognition (ASR) of rare words. It relies on phrase-level contextual modeling\nand attention-based relevance scoring without explicit contextual constraint\nwhich lead to insufficient use of contextual information. In this work, we\npropose deep CLAS to use contextual information better. We introduce bias loss\nforcing model to focus on contextual information. The query of bias attention\nis also enriched to improve the accuracy of the bias attention score. To get\nfine-grained contextual information, we replace phrase-level encoding with\ncharacter-level encoding and encode contextual information with conformer\nrather than LSTM. Moreover, we directly use the bias attention score to correct\nthe output probability distribution of the model. Experiments using the public\nAISHELL-1 and AISHELL-NER. On AISHELL-1, compared to CLAS baselines, deep CLAS\nobtains a 65.78% relative recall and a 53.49% relative F1-score increase in the\nnamed entity recognition scene.\n","authors":["Shifu Xiong","Mengzhi Wang","Genshun Wan","Hang Chen","Jianqing Gao","Lirong Dai"],"pdf_url":"https://arxiv.org/pdf/2409.17603v1.pdf","comment":"Accepted by NCMMSC 2022"},{"id":"http://arxiv.org/abs/2409.17588v1","updated":"2024-09-26T07:07:14Z","published":"2024-09-26T07:07:14Z","title":"DualCoTs: Dual Chain-of-Thoughts Prompting for Sentiment Lexicon\n Expansion of Idioms","summary":" Idioms represent a ubiquitous vehicle for conveying sentiments in the realm\nof everyday discourse, rendering the nuanced analysis of idiom sentiment\ncrucial for a comprehensive understanding of emotional expression within\nreal-world texts. Nevertheless, the existing corpora dedicated to idiom\nsentiment analysis considerably limit research in text sentiment analysis. In\nthis paper, we propose an innovative approach to automatically expand the\nsentiment lexicon for idioms, leveraging the capabilities of large language\nmodels through the application of Chain-of-Thought prompting. To demonstrate\nthe effectiveness of this approach, we integrate multiple existing resources\nand construct an emotional idiom lexicon expansion dataset (called EmoIdiomE),\nwhich encompasses a comprehensive repository of Chinese and English idioms.\nThen we designed the Dual Chain-of-Thoughts (DualCoTs) method, which combines\ninsights from linguistics and psycholinguistics, to demonstrate the\neffectiveness of using large models to automatically expand the sentiment\nlexicon for idioms. Experiments show that DualCoTs is effective in idioms\nsentiment lexicon expansion in both Chinese and English. For reproducibility,\nwe will release the data and code upon acceptance.\n","authors":["Fuqiang Niu","Minghuan Tan","Bowen Zhang","Min Yang","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10267v2","updated":"2024-09-26T06:57:27Z","published":"2024-06-11T09:24:18Z","title":"Unused information in token probability distribution of generative LLM:\n improving LLM reading comprehension through calculation of expected values","summary":" LLM text decoding is key component for perceived LLM quality. We demonstrate\ntwo experiments showing that decoding methods could be improved by manipulation\nof token probabilities. First, we test few LLM on SummEval summary scoring\ndataset, to measure reading comprehension. We compare scores from greedy\ndecoding to expected values over the next token distribution. We scale logits\nby large temperature to increase the entropy of scores. This allows strong\nimprovement of performance on SummEval (in terms of correlations to human\njudgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from\n20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part\nof the gain seems related to positional bias. Secondly, we use\nprobability-based tree sampling algorithm, to examine all most probable\ngenerations for given prompt.\n","authors":["Krystian Zawistowski"],"pdf_url":"https://arxiv.org/pdf/2406.10267v2.pdf","comment":"7 pages, 1 figure, presented at FEDCSIS 2024 conference,"},{"id":"http://arxiv.org/abs/2409.17577v1","updated":"2024-09-26T06:46:53Z","published":"2024-09-26T06:46:53Z","title":"Leveraging Annotator Disagreement for Text Classification","summary":" It is common practice in text classification to only use one majority label\nfor model training even if a dataset has been annotated by multiple annotators.\nDoing so can remove valuable nuances and diverse perspectives inherent in the\nannotators' assessments. This paper proposes and compares three different\nstrategies to leverage annotator disagreement for text classification: a\nprobability-based multi-label method, an ensemble system, and instruction\ntuning. All three approaches are evaluated on the tasks of hate speech and\nabusive conversation detection, which inherently entail a high degree of\nsubjectivity. Moreover, to evaluate the effectiveness of embracing annotation\ndisagreements for model training, we conduct an online survey that compares the\nperformance of the multi-label model against a baseline model, which is trained\nwith the majority label.\n The results show that in hate speech detection, the multi-label method\noutperforms the other two approaches, while in abusive conversation detection,\ninstruction tuning achieves the best performance. The results of the survey\nalso show that the outputs from the multi-label models are considered a better\nrepresentation of the texts than the single-label model.\n","authors":["Jin Xu","Mariët Theune","Daniel Braun"],"pdf_url":"https://arxiv.org/pdf/2409.17577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02436v2","updated":"2024-09-26T06:27:08Z","published":"2024-03-04T19:33:39Z","title":"How does Architecture Influence the Base Capabilities of Pre-trained\n Language Models? A Case Study Based on FFN-Wider and MoE Transformers","summary":" Pre-trained language models have been proven to possess strong base\ncapabilities, which not only excel in in-distribution language modeling but\nalso show powerful abilities in out-of-distribution language modeling, transfer\nlearning and few-shot learning. Unlike existing work focusing on the influence\nof scale on base capabilities, our work examines the influence of architecture\non those. Specifically, our concern is: How does architecture influence the\nbase capabilities of pre-trained language models? In this work, we attempt to\nexplain and reverse the decline in base capabilities caused by the architecture\nof FFN-Wider Transformers, seeking to provide some insights. Through analysis,\nwe found the contribution ratio of Multi-Head Attention (a combination\nfunction) to pre-trained language modeling is a key factor affecting base\ncapabilities. FFN-Wider Transformers reduce the contribution ratio of this\ncombination function, leading to a decline in base capabilities. We confirmed\nthis by experiments and proposed Combination Enhanced Architecture (CEA) to\naddress the decline in base capabilities of such models. Significantly, we\nextended our explanation and CEA to Mixture of Experts (MoE) Transformers. We\nsuccessfully achieved significant improvements in base capabilities on a 14B\nparameter MoE model, demonstrating the practical application value of our work.\nThis also indicates that our analysis has a certain guiding significance for\narchitecture analysis, architecture improvement and architecture design.\n","authors":["Xin Lu","Yanyan Zhao","Bing Qin","Liangyu Huo","Qing Yang","Dongliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.02436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05013v2","updated":"2024-09-26T06:19:34Z","published":"2024-06-07T15:23:53Z","title":"CHIQ: Contextual History Enhancement for Improving Query Rewriting in\n Conversational Search","summary":" In this paper, we study how open-source large language models (LLMs) can be\neffectively deployed for improving query rewriting in conversational search,\nespecially for ambiguous queries. We introduce CHIQ, a two-step method that\nleverages the capabilities of LLMs to resolve ambiguities in the conversation\nhistory before query rewriting. This approach contrasts with prior studies that\npredominantly use closed-source LLMs to directly generate search queries from\nconversation history. We demonstrate on five well-established benchmarks that\nCHIQ leads to state-of-the-art results across most settings, showing highly\ncompetitive performances with systems leveraging closed-source LLMs. Our study\nprovides a first step towards leveraging open-source LLMs in conversational\nsearch, as a competitive alternative to the prevailing reliance on commercial\nLLMs. Data, models, and source code will be publicly available upon acceptance\nat https://github.com/fengranMark/CHIQ.\n","authors":["Fengran Mo","Abbas Ghaddar","Kelong Mao","Mehdi Rezagholizadeh","Boxing Chen","Qun Liu","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2406.05013v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2406.17255v2","updated":"2024-09-26T06:18:44Z","published":"2024-06-25T03:45:28Z","title":"MPCODER: Multi-user Personalized Code Generator with Explicit and\n Implicit Style Representation Learning","summary":" Large Language Models (LLMs) have demonstrated great potential for assisting\ndevelopers in their daily development. However, most research focuses on\ngenerating correct code, how to use LLMs to generate personalized code has\nseldom been investigated. To bridge this gap, we proposed MPCoder (Multi-user\nPersonalized Code Generator) to generate personalized code for multiple users.\nTo better learn coding style features, we utilize explicit coding style\nresidual learning to capture the syntax code style standards and implicit style\nlearning to capture the semantic code style conventions. We train a multi-user\nstyle adapter to better differentiate the implicit feature representations of\ndifferent users through contrastive learning, ultimately enabling personalized\ncode generation for multiple users. We further propose a novel evaluation\nmetric for estimating similarities between codes of different coding styles.\nThe experimental results show the effectiveness of our approach for this novel\ntask.\n","authors":["Zhenlong Dai","Chang Yao","WenKang Han","Ying Yuan","Zhipeng Gao","Jingyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2406.17255v2.pdf","comment":"Accepted by ACL 2024, Main Conference"},{"id":"http://arxiv.org/abs/2409.15977v2","updated":"2024-09-26T05:26:50Z","published":"2024-09-24T11:18:09Z","title":"TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and\n Multi-Level Style Control","summary":" Zero-shot singing voice synthesis (SVS) with style transfer and style control\naims to generate high-quality singing voices with unseen timbres and styles\n(including singing method, emotion, rhythm, technique, and pronunciation) from\naudio and text prompts. However, the multifaceted nature of singing styles\nposes a significant challenge for effective modeling, transfer, and control.\nFurthermore, current SVS models often fail to generate singing voices rich in\nstylistic nuances for unseen singers. To address these challenges, we introduce\nTCSinger, the first zero-shot SVS model for style transfer across cross-lingual\nspeech and singing styles, along with multi-level style control. Specifically,\nTCSinger proposes three primary modules: 1) the clustering style encoder\nemploys a clustering vector quantization model to stably condense style\ninformation into a compact latent space; 2) the Style and Duration Language\nModel (S\\&D-LM) concurrently predicts style information and phoneme duration,\nwhich benefits both; 3) the style adaptive decoder uses a novel mel-style\nadaptive normalization method to generate singing voices with enhanced details.\nExperimental results show that TCSinger outperforms all baseline models in\nsynthesis quality, singer similarity, and style controllability across various\ntasks, including zero-shot style transfer, multi-level style control,\ncross-lingual style transfer, and speech-to-singing style transfer. Singing\nvoice samples can be accessed at https://tcsinger.github.io/.\n","authors":["Yu Zhang","Ziyue Jiang","Ruiqi Li","Changhao Pan","Jinzheng He","Rongjie Huang","Chuxin Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.15977v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17539v1","updated":"2024-09-26T04:59:45Z","published":"2024-09-26T04:59:45Z","title":"Logic-of-Thought: Injecting Logic into Contexts for Full Reasoning in\n Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities across\nvarious tasks but their performance in complex logical reasoning tasks remains\nunsatisfactory. Although some prompting methods, such as Chain-of-Thought, can\nimprove the reasoning ability of LLMs to some extent, they suffer from an\nunfaithful issue where derived conclusions may not align with the generated\nreasoning chain. To address this issue, some studies employ the approach of\npropositional logic to further enhance logical reasoning abilities of LLMs.\nHowever, the potential omissions in the extraction of logical expressions in\nthese methods can cause information loss in the logical reasoning process,\nthereby generating incorrect results. To this end, we propose Logic-of-Thought\n(LoT) prompting which employs propositional logic to generate expanded logical\ninformation from input context, and utilizes the generated logical information\nas an additional augmentation to the input prompts, thereby enhancing the\ncapability of logical reasoning. The LoT is orthogonal to existing prompting\nmethods and can be seamlessly integrated with them. Extensive experiments\ndemonstrate that LoT boosts the performance of various prompting methods with a\nstriking margin across five logical reasoning tasks. In particular, the LoT\nenhances Chain-of-Thought's performance on the ReClor dataset by +4.35%;\nmoreover, it improves Chain-of-Thought with Self-Consistency's performance on\nLogiQA by +5%; additionally, it boosts performance of Tree-of-Thoughts on\nProofWriter dataset by +8%.\n","authors":["Tongxuan Liu","Wenjiang Xu","Weizhe Huang","Xingyu Wang","Jiaxing Wang","Hailong Yang","Jing Li"],"pdf_url":"https://arxiv.org/pdf/2409.17539v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17536v1","updated":"2024-09-26T04:48:20Z","published":"2024-09-26T04:48:20Z","title":"MUSE: Integrating Multi-Knowledge for Knowledge Graph Completion","summary":" Knowledge Graph Completion (KGC) aims to predict the missing [relation] part\nof (head entity)--[relation]->(tail entity) triplet. Most existing KGC methods\nfocus on single features (e.g., relation types) or sub-graph aggregation.\nHowever, they do not fully explore the Knowledge Graph (KG) features and\nneglect the guidance of external semantic knowledge. To address these\nshortcomings, we propose a knowledge-aware reasoning model (MUSE), which\ndesigns a novel multi-knowledge representation learning mechanism for missing\nrelation prediction. Our model develops a tailored embedding space through\nthree parallel components: 1) Prior Knowledge Learning for enhancing the\ntriplets' semantic representation by fine-tuning BERT; 2) Context Message\nPassing for enhancing the context messages of KG; 3) Relational Path\nAggregation for enhancing the path representation from the head entity to the\ntail entity. The experimental results show that MUSE significantly outperforms\nother baselines on four public datasets, achieving over 5.50% H@1 improvement\nand 4.20% MRR improvement on the NELL995 dataset. The code and datasets will be\nreleased via https://github.com/SUSTech-TP/ADMA2024-MUSE.git.\n","authors":["Pengjie Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17536v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2408.05283"},{"id":"http://arxiv.org/abs/2409.17527v1","updated":"2024-09-26T04:30:32Z","published":"2024-09-26T04:30:32Z","title":"Data Proportion Detection for Optimized Data Management for Large\n Language Models","summary":" Large language models (LLMs) have demonstrated exceptional performance across\na wide range of tasks and domains, with data preparation playing a critical\nrole in achieving these results. Pre-training data typically combines\ninformation from multiple domains. To maximize performance when integrating\ndata from various domains, determining the optimal data proportion is\nessential. However, state-of-the-art (SOTA) LLMs rarely disclose details about\ntheir pre-training data, making it difficult for researchers to identify ideal\ndata proportions. In this paper, we introduce a new topic, \\textit{data\nproportion detection}, which enables the automatic estimation of pre-training\ndata proportions by analyzing the generated outputs of LLMs. We provide\nrigorous theoretical proofs, practical algorithms, and preliminary experimental\nresults for data proportion detection. Based on these findings, we offer\nvaluable insights into the challenges and future directions for effective data\nproportion detection and data management.\n","authors":["Hao Liang","Keshi Zhao","Yajie Yang","Bin Cui","Guosheng Dong","Zenan Zhou","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17525v1","updated":"2024-09-26T04:24:52Z","published":"2024-09-26T04:24:52Z","title":"When A Man Says He Is Pregnant: ERP Evidence for A Rational Account of\n Speaker-contextualized Language Comprehension","summary":" Spoken language is often, if not always, understood in a context that\nincludes the identities of speakers. For instance, we can easily make sense of\nan utterance such as \"I'm going to have a manicure this weekend\" or \"The first\ntime I got pregnant I had a hard time\" when the utterance is spoken by a woman,\nbut it would be harder to understand when it is spoken by a man. Previous\nevent-related potential (ERP) studies have shown mixed results regarding the\nneurophysiological responses to such speaker-mismatched utterances, with some\nreporting an N400 effect and others a P600 effect. In an experiment involving\n64 participants, we showed that these different ERP effects reflect distinct\ncognitive processes employed to resolve the speaker-message mismatch. When\npossible, the message is integrated with the speaker context to arrive at an\ninterpretation, as in the case of violations of social stereotypes (e.g., men\ngetting a manicure), resulting in an N400 effect. However, when such\nintegration is impossible due to violations of biological knowledge (e.g., men\ngetting pregnant), listeners engage in an error correction process to revise\neither the perceived utterance or the speaker context, resulting in a P600\neffect. Additionally, we found that the social N400 effect decreased as a\nfunction of the listener's personality trait of openness, while the biological\nP600 effect remained robust. Our findings help to reconcile the empirical\ninconsistencies in the literature and provide a rational account of\nspeaker-contextualized language comprehension.\n","authors":["Hanlin Wu","Zhenguang G. Cai"],"pdf_url":"https://arxiv.org/pdf/2409.17525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17513v1","updated":"2024-09-26T03:48:47Z","published":"2024-09-26T03:48:47Z","title":"Comparing Unidirectional, Bidirectional, and Word2vec Models for\n Discovering Vulnerabilities in Compiled Lifted Code","summary":" Ransomware and other forms of malware cause significant financial and\noperational damage to organizations by exploiting long-standing and often\ndifficult-to-detect software vulnerabilities. To detect vulnerabilities such as\nbuffer overflows in compiled code, this research investigates the application\nof unidirectional transformer-based embeddings, specifically GPT-2. Using a\ndataset of LLVM functions, we trained a GPT-2 model to generate embeddings,\nwhich were subsequently used to build LSTM neural networks to differentiate\nbetween vulnerable and non-vulnerable code. Our study reveals that embeddings\nfrom the GPT-2 model significantly outperform those from bidirectional models\nof BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%.\nLSTM neural networks were developed with both frozen and unfrozen embedding\nmodel layers. The model with the highest performance was achieved when the\nembedding layers were unfrozen. Further, the research finds that, in exploring\nthe impact of different optimizers within this domain, the SGD optimizer\ndemonstrates superior performance over Adam. Overall, these findings reveal\nimportant insights into the potential of unidirectional transformer-based\napproaches in enhancing cybersecurity defenses.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2409.17513v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17504v1","updated":"2024-09-26T03:22:09Z","published":"2024-09-26T03:22:09Z","title":"HaloScope: Harnessing Unlabeled LLM Generations for Hallucination\n Detection","summary":" The surge in applications of large language models (LLMs) has prompted\nconcerns about the generation of misleading or fabricated information, known as\nhallucinations. Therefore, detecting hallucinations has become critical to\nmaintaining trust in LLM-generated content. A primary challenge in learning a\ntruthfulness classifier is the lack of a large amount of labeled truthful and\nhallucinated data. To address the challenge, we introduce HaloScope, a novel\nlearning framework that leverages the unlabeled LLM generations in the wild for\nhallucination detection. Such unlabeled data arises freely upon deploying LLMs\nin the open world, and consists of both truthful and hallucinated information.\nTo harness the unlabeled data, we present an automated membership estimation\nscore for distinguishing between truthful and untruthful generations within\nunlabeled mixture data, thereby enabling the training of a binary truthfulness\nclassifier on top. Importantly, our framework does not require extra data\ncollection and human annotations, offering strong flexibility and practicality\nfor real-world applications. Extensive experiments show that HaloScope can\nachieve superior hallucination detection performance, outperforming the\ncompetitive rivals by a significant margin. Code is available at\nhttps://github.com/deeplearningwisc/haloscope.\n","authors":["Xuefeng Du","Chaowei Xiao","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.17504v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2402.10669v5","updated":"2024-09-26T03:16:52Z","published":"2024-02-16T13:21:06Z","title":"Humans or LLMs as the Judge? A Study on Judgement Biases","summary":" Adopting human and large language models (LLM) as judges (a.k.a human- and\nLLM-as-a-judge) for evaluating the performance of LLMs has recently gained\nattention. Nonetheless, this approach concurrently introduces potential biases\nfrom human and LLMs, questioning the reliability of the evaluation results. In\nthis paper, we propose a novel framework that is free from referencing\ngroundtruth annotations for investigating Misinformation Oversight Bias, Gender\nBias, Authority Bias and Beauty Bias on LLM and human judges. We curate a\ndataset referring to the revised Bloom's Taxonomy and conduct thousands of\nevaluations. Results show that human and LLM judges are vulnerable to\nperturbations to various degrees, and that even the cutting-edge judges possess\nconsiderable biases. We further exploit these biases to conduct attacks on LLM\njudges. We hope that our work can notify the community of the bias and\nvulnerability of human- and LLM-as-a-judge, as well as the urgency of\ndeveloping robust evaluation systems.\n","authors":["Guiming Hardy Chen","Shunian Chen","Ziche Liu","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2402.10669v5.pdf","comment":"EMNLP2024"},{"id":"http://arxiv.org/abs/2409.14509v3","updated":"2024-09-26T03:15:53Z","published":"2024-09-22T16:13:00Z","title":"Can AI writing be salvaged? Mitigating Idiosyncrasies and Improving\n Human-AI Alignment in the Writing Process through Edits","summary":" LLM-based applications are helping people write, and LLM-generated text is\nmaking its way into social media, journalism, and our classrooms. However, the\ndifferences between LLM-generated and human-written text remain unclear. To\nexplore this, we hired professional writers to edit paragraphs in several\ncreative domains. We first found these writers agree on undesirable\nidiosyncrasies in LLM-generated text, formalizing it into a seven-category\ntaxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP\ncorpus: 1,057 LLM-generated paragraphs edited by professional writers according\nto our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our\nstudy (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms\nof writing quality, revealing common limitations across model families. Third,\nwe explored automatic editing methods to improve LLM-generated text. A\nlarge-scale preference annotation confirms that although experts largely prefer\ntext edited by other experts, automatic editing methods show promise in\nimproving alignment between LLM-generated and human-written text.\n","authors":["Tuhin Chakrabarty","Philippe Laban","Chien-Sheng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.14509v3.pdf","comment":"NLP+HCI, Behavioral Science"},{"id":"http://arxiv.org/abs/2409.17481v1","updated":"2024-09-26T02:37:41Z","published":"2024-09-26T02:37:41Z","title":"MaskLLM: Learnable Semi-Structured Sparsity for Large Language Models","summary":" Large Language Models (LLMs) are distinguished by their massive parameter\ncounts, which typically result in significant redundancy. This work introduces\nMaskLLM, a learnable pruning method that establishes Semi-structured (or\n``N:M'') Sparsity in LLMs, aimed at reducing computational overhead during\ninference. Instead of developing a new importance criterion, MaskLLM explicitly\nmodels N:M patterns as a learnable distribution through Gumbel Softmax\nsampling. This approach facilitates end-to-end training on large-scale datasets\nand offers two notable advantages: 1) High-quality Masks - our method\neffectively scales to large datasets and learns accurate masks; 2)\nTransferability - the probabilistic modeling of mask distribution enables the\ntransfer learning of sparsity across domains or tasks. We assessed MaskLLM\nusing 2:4 sparsity on various LLMs, including LLaMA-2, Nemotron-4, and GPT-3,\nwith sizes ranging from 843M to 15B parameters, and our empirical results show\nsubstantial improvements over state-of-the-art methods. For instance, leading\napproaches achieve a perplexity (PPL) of 10 or greater on Wikitext compared to\nthe dense model's 5.12 PPL, but MaskLLM achieves a significantly lower 6.72 PPL\nsolely by learning the masks with frozen weights. Furthermore, MaskLLM's\nlearnable nature allows customized masks for lossless application of 2:4\nsparsity to downstream tasks or domains. Code is available at\n\\url{https://github.com/NVlabs/MaskLLM}.\n","authors":["Gongfan Fang","Hongxu Yin","Saurav Muralidharan","Greg Heinrich","Jeff Pool","Jan Kautz","Pavlo Molchanov","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17481v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17474v1","updated":"2024-09-26T02:19:13Z","published":"2024-09-26T02:19:13Z","title":"Reducing and Exploiting Data Augmentation Noise through Meta Reweighting\n Contrastive Learning for Text Classification","summary":" Data augmentation has shown its effectiveness in resolving the data-hungry\nproblem and improving model's generalization ability. However, the quality of\naugmented data can be varied, especially compared with the raw/original data.\nTo boost deep learning models' performance given augmented data/samples in text\nclassification tasks, we propose a novel framework, which leverages both meta\nlearning and contrastive learning techniques as parts of our design for\nreweighting the augmented samples and refining their feature representations\nbased on their quality. As part of the framework, we propose novel\nweight-dependent enqueue and dequeue algorithms to utilize augmented samples'\nweight/quality information effectively. Through experiments, we show that our\nframework can reasonably cooperate with existing deep learning models (e.g.,\nRoBERTa-base and Text-CNN) and augmentation techniques (e.g., Wordnet and\nEasydata) for specific supervised learning tasks. Experiment results show that\nour framework achieves an average of 1.6%, up to 4.3% absolute improvement on\nText-CNN encoders and an average of 1.4%, up to 4.4% absolute improvement on\nRoBERTa-base encoders on seven GLUE benchmark datasets compared with the best\nbaseline. We present an indepth analysis of our framework design, revealing the\nnon-trivial contributions of our network components. Our code is publicly\navailable for better reproducibility.\n","authors":["Guanyi Mou","Yichuan Li","Kyumin Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17474v1.pdf","comment":"IEEE BigData 2021"},{"id":"http://arxiv.org/abs/2409.17472v1","updated":"2024-09-26T02:16:48Z","published":"2024-09-26T02:16:48Z","title":"Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with\n Scoring-aware Multiple Rewards","summary":" Recent advances in automated essay scoring (AES) have shifted towards\nevaluating multiple traits to provide enriched feedback. Like typical AES\nsystems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure\nagreement with human raters, aligning closely with the rating schema; however,\nits non-differentiable nature prevents its direct use in neural network\ntraining. In this paper, we propose Scoring-aware Multi-reward Reinforcement\nLearning (SaMRL), which integrates actual evaluation schemes into the training\nprocess by designing QWK-based rewards with a mean-squared error penalty for\nmulti-trait AES. Existing reinforcement learning (RL) applications in AES are\nlimited to classification models despite associated performance degradation, as\nRL requires probability distributions; instead, we adopt an autoregressive\nscore generation framework to leverage token generation probabilities for\nrobust multi-trait score predictions. Empirical analyses demonstrate that SaMRL\nfacilitates model training, notably enhancing scoring of previously inferior\nprompts.\n","authors":["Heejin Do","Sangwon Ryu","Gary Geunbae Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17472v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.14552v2","updated":"2024-09-26T02:02:13Z","published":"2024-09-22T18:29:10Z","title":"Unleashing the Power of Emojis in Texts via Self-supervised Graph\n Pre-Training","summary":" Emojis have gained immense popularity on social platforms, serving as a\ncommon means to supplement or replace text. However, existing data mining\napproaches generally either completely ignore or simply treat emojis as\nordinary Unicode characters, which may limit the model's ability to grasp the\nrich semantic information in emojis and the interaction between emojis and\ntexts. Thus, it is necessary to release the emoji's power in social media data\nmining. To this end, we first construct a heterogeneous graph consisting of\nthree types of nodes, i.e. post, word and emoji nodes to improve the\nrepresentation of different elements in posts. The edges are also well-defined\nto model how these three elements interact with each other. To facilitate the\nsharing of information among post, word and emoji nodes, we propose a graph\npre-train framework for text and emoji co-modeling, which contains two graph\npre-training tasks: node-level graph contrastive learning and edge-level link\nreconstruction learning. Extensive experiments on the Xiaohongshu and Twitter\ndatasets with two types of downstream tasks demonstrate that our approach\nproves significant improvement over previous strong baseline methods.\n","authors":["Zhou Zhang","Dongzeng Tan","Jiaan Wang","Yilong Chen","Jiarong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.14552v2.pdf","comment":"Accepted by EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2409.17467v1","updated":"2024-09-26T01:57:27Z","published":"2024-09-26T01:57:27Z","title":"What is the social benefit of hate speech detection research? A\n Systematic Review","summary":" While NLP research into hate speech detection has grown exponentially in the\nlast three decades, there has been minimal uptake or engagement from policy\nmakers and non-profit organisations. We argue the absence of ethical frameworks\nhave contributed to this rift between current practice and best practice. By\nadopting appropriate ethical frameworks, NLP researchers may enable the social\nimpact potential of hate speech research. This position paper is informed by\nreviewing forty-eight hate speech detection systems associated with\nthirty-seven publications from different venues.\n","authors":["Sidney Gig-Jan Wong"],"pdf_url":"https://arxiv.org/pdf/2409.17467v1.pdf","comment":"Accepted to the 3rd Workshop on NLP for Positive Impact"},{"id":"http://arxiv.org/abs/2409.17458v1","updated":"2024-09-26T01:24:17Z","published":"2024-09-26T01:24:17Z","title":"RED QUEEN: Safeguarding Large Language Models against Concealed\n Multi-Turn Jailbreaking","summary":" The rapid progress of Large Language Models (LLMs) has opened up new\nopportunities across various domains and applications; yet it also presents\nchallenges related to potential misuse. To mitigate such risks, red teaming has\nbeen employed as a proactive security measure to probe language models for\nharmful outputs via jailbreak attacks. However, current jailbreak attack\napproaches are single-turn with explicit malicious queries that do not fully\ncapture the complexity of real-world interactions. In reality, users can engage\nin multi-turn interactions with LLM-based chat assistants, allowing them to\nconceal their true intentions in a more covert manner. To bridge this gap, we,\nfirst, propose a new jailbreak approach, RED QUEEN ATTACK. This method\nconstructs a multi-turn scenario, concealing the malicious intent under the\nguise of preventing harm. We craft 40 scenarios that vary in turns and select\n14 harmful categories to generate 56k multi-turn attack data points. We conduct\ncomprehensive experiments on the RED QUEEN ATTACK with four representative LLM\nfamilies of different sizes. Our experiments reveal that all LLMs are\nvulnerable to RED QUEEN ATTACK, reaching 87.62% attack success rate on GPT-4o\nand 75.4% on Llama3-70B. Further analysis reveals that larger models are more\nsusceptible to the RED QUEEN ATTACK, with multi-turn structures and concealment\nstrategies contributing to its success. To prioritize safety, we introduce a\nstraightforward mitigation strategy called RED QUEEN GUARD, which aligns LLMs\nto effectively counter adversarial attacks. This approach reduces the attack\nsuccess rate to below 1% while maintaining the model's performance across\nstandard benchmarks. Full implementation and dataset are publicly accessible at\nhttps://github.com/kriti-hippo/red_queen.\n","authors":["Yifan Jiang","Kriti Aggarwal","Tanmay Laud","Kashif Munir","Jay Pujara","Subhabrata Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2409.17458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17455v1","updated":"2024-09-26T01:17:42Z","published":"2024-09-26T01:17:42Z","title":"Navigating the Shortcut Maze: A Comprehensive Analysis of Shortcut\n Learning in Text Classification by Language Models","summary":" Language models (LMs), despite their advances, often depend on spurious\ncorrelations, undermining their accuracy and generalizability. This study\naddresses the overlooked impact of subtler, more complex shortcuts that\ncompromise model reliability beyond oversimplified shortcuts. We introduce a\ncomprehensive benchmark that categorizes shortcuts into occurrence, style, and\nconcept, aiming to explore the nuanced ways in which these shortcuts influence\nthe performance of LMs. Through extensive experiments across traditional LMs,\nlarge language models, and state-of-the-art robust models, our research\nsystematically investigates models' resilience and susceptibilities to\nsophisticated shortcuts. Our benchmark and code can be found at:\nhttps://github.com/yuqing-zhou/shortcut-learning-in-text-classification.\n","authors":["Yuqing Zhou","Ruixiang Tang","Ziyu Yao","Ziwei Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.17455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17452v1","updated":"2024-09-26T01:08:09Z","published":"2024-09-26T01:08:09Z","title":"Description-based Controllable Text-to-Speech with Cross-Lingual Voice\n Control","summary":" We propose a novel description-based controllable text-to-speech (TTS) method\nwith cross-lingual control capability. To address the lack of audio-description\npaired data in the target language, we combine a TTS model trained on the\ntarget language with a description control model trained on another language,\nwhich maps input text descriptions to the conditional features of the TTS\nmodel. These two models share disentangled timbre and style representations\nbased on self-supervised learning (SSL), allowing for disentangled voice\ncontrol, such as controlling speaking styles while retaining the original\ntimbre. Furthermore, because the SSL-based timbre and style representations are\nlanguage-agnostic, combining the TTS and description control models while\nsharing the same embedding space effectively enables cross-lingual control of\nvoice characteristics. Experiments on English and Japanese TTS demonstrate that\nour method achieves high naturalness and controllability for both languages,\neven though no Japanese audio-description pairs are used.\n","authors":["Ryuichi Yamamoto","Yuma Shirahata","Masaya Kawamura","Kentaro Tachibana"],"pdf_url":"https://arxiv.org/pdf/2409.17452v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2109.04993v3","updated":"2024-09-26T00:58:15Z","published":"2021-09-04T22:48:46Z","title":"LAViTeR: Learning Aligned Visual and Textual Representations Assisted by\n Image and Caption Generation","summary":" Pre-training visual and textual representations from large-scale image-text\npairs is becoming a standard approach for many downstream vision-language\ntasks. The transformer-based models learn inter and intra-modal attention\nthrough a list of self-supervised learning tasks. This paper proposes LAViTeR,\na novel architecture for visual and textual representation learning. The main\nmodule, Visual Textual Alignment (VTA) will be assisted by two auxiliary tasks,\nGAN-based image synthesis and Image Captioning. We also propose a new\nevaluation metric measuring the similarity between the learnt visual and\ntextual embedding. The experimental results on two public datasets, CUB and\nMS-COCO, demonstrate superior visual and textual representation alignment in\nthe joint feature embedding space\n","authors":["Mohammad Abuzar Hashemi","Zhanghexuan Li","Mihir Chauhan","Yan Shen","Abhishek Satbhai","Mir Basheer Ali","Mingchen Gao","Sargur Srihari"],"pdf_url":"https://arxiv.org/pdf/2109.04993v3.pdf","comment":"15 pages, 10 Figures, 5 Tables. Oral Presentation at Irish Machine\n Vision and Image Processing Conference Proceedings, 2024"},{"id":"http://arxiv.org/abs/2409.17448v1","updated":"2024-09-26T00:54:17Z","published":"2024-09-26T00:54:17Z","title":"Enhancing Financial Sentiment Analysis with Expert-Designed Hint","summary":" This paper investigates the role of expert-designed hint in enhancing\nsentiment analysis on financial social media posts. We explore the capability\nof large language models (LLMs) to empathize with writer perspectives and\nanalyze sentiments. Our findings reveal that expert-designed hint, i.e.,\npointing out the importance of numbers, significantly improve performances\nacross various LLMs, particularly in cases requiring perspective-taking skills.\nFurther analysis on tweets containing different types of numerical data\ndemonstrates that the inclusion of expert-designed hint leads to notable\nimprovements in sentiment analysis performance, especially for tweets with\nmonetary-related numbers. Our findings contribute to the ongoing discussion on\nthe applicability of Theory of Mind in NLP and open new avenues for improving\nsentiment analysis in financial domains through the strategic use of expert\nknowledge.\n","authors":["Chung-Chi Chen","Hiroya Takamura","Ichiro Kobayashi","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2409.17448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00948v2","updated":"2024-09-26T00:24:25Z","published":"2024-07-01T04:07:49Z","title":"View From Above: A Framework for Evaluating Distribution Shifts in Model\n Behavior","summary":" When large language models (LLMs) are asked to perform certain tasks, how can\nwe be sure that their learned representations align with reality? We propose a\ndomain-agnostic framework for systematically evaluating distribution shifts in\nLLMs decision-making processes, where they are given control of mechanisms\ngoverned by pre-defined rules. While individual LLM actions may appear\nconsistent with expected behavior, across a large number of trials,\nstatistically significant distribution shifts can emerge. To test this, we\nconstruct a well-defined environment with known outcome logic: blackjack. In\nmore than 1,000 trials, we uncover statistically significant evidence\nsuggesting behavioral misalignment in the learned representations of LLM.\n","authors":["Tanush Chopra","Michael Li","Jacob Haimes"],"pdf_url":"https://arxiv.org/pdf/2407.00948v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17073v2","updated":"2024-09-26T20:40:15Z","published":"2024-09-25T16:32:35Z","title":"Enhancing Post-Hoc Attributions in Long Document Comprehension via\n Coarse Grained Answer Decomposition","summary":" Accurately attributing answer text to its source document is crucial for\ndeveloping a reliable question-answering system. However, attribution for long\ndocuments remains largely unexplored. Post-hoc attribution systems are designed\nto map answer text back to the source document, yet the granularity of this\nmapping has not been addressed. Furthermore, a critical question arises: What\nexactly should be attributed? This involves identifying the specific\ninformation units within an answer that require grounding. In this paper, we\npropose and investigate a novel approach to the factual decomposition of\ngenerated answers for attribution, employing template-based in-context\nlearning. To accomplish this, we utilize the question and integrate negative\nsampling during few-shot in-context learning for decomposition. This approach\nenhances the semantic understanding of both abstractive and extractive answers.\nWe examine the impact of answer decomposition by providing a thorough\nexamination of various attribution approaches, ranging from retrieval-based\ntechniques to LLM-based attributors.\n","authors":["Pritika Ramu","Koustava Goswami","Apoorv Saxena","Balaji Vasan Srinivavsan"],"pdf_url":"https://arxiv.org/pdf/2409.17073v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18346v1","updated":"2024-09-26T23:48:08Z","published":"2024-09-26T23:48:08Z","title":"MultiClimate: Multimodal Stance Detection on Climate Change Videos","summary":" Climate change (CC) has attracted increasing attention in NLP in recent\nyears. However, detecting the stance on CC in multimodal data is understudied\nand remains challenging due to a lack of reliable datasets. To improve the\nunderstanding of public opinions and communication strategies, this paper\npresents MultiClimate, the first open-source manually-annotated stance\ndetection dataset with $100$ CC-related YouTube videos and $4,209$\nframe-transcript pairs. We deploy state-of-the-art vision and language models,\nas well as multimodal models for MultiClimate stance detection. Results show\nthat text-only BERT significantly outperforms image-only ResNet50 and ViT.\nCombining both modalities achieves state-of-the-art, $0.747$/$0.749$ in\naccuracy/F1. Our 100M-sized fusion models also beat CLIP and BLIP, as well as\nthe much larger 9B-sized multimodal IDEFICS and text-only Llama3 and Gemma2,\nindicating that multimodal stance detection remains challenging for large\nlanguage models. Our code, dataset, as well as supplementary materials, are\navailable at https://github.com/werywjw/MultiClimate.\n","authors":["Jiawen Wang","Longfei Zuo","Siyao Peng","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2409.18346v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.08044v2","updated":"2024-09-26T23:47:03Z","published":"2024-07-10T20:52:18Z","title":"RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective\n Weight-Activation Quantization","summary":" Low-Rank Adaptation (LoRA), as a representative Parameter-Efficient\nFine-Tuning (PEFT)method, significantly enhances the training efficiency by\nupdating only a small portion of the weights in Large Language Models (LLMs).\nRecently, weight-only quantization techniques have also been applied to LoRA\nmethods to reduce the memory footprint of fine-tuning. However, applying\nweight-activation quantization to the LoRA pipeline is under-explored, and we\nobserve substantial performance degradation primarily due to the presence of\nactivation outliers. In this work, we propose RoLoRA, the first LoRA-based\nscheme for effective weight-activation quantization. RoLoRA utilizes rotation\nfor outlier elimination and proposes rotation-aware fine-tuning to preserve the\noutlier-free characteristics in rotated LLMs. Experimental results show RoLoRA\nconsistently improves low-bit LoRA convergence and post-training quantization\nrobustness in weight-activation settings. We evaluate RoLoRA across\nLLaMA2-7B/13B, LLaMA3-8B models, achieving up to 29.5% absolute accuracy gain\nof 4-bit weight-activation quantized LLaMA2- 13B on commonsense reasoning tasks\ncompared to LoRA baseline. We further demonstrate its effectiveness on Large\nMultimodal Models (LLaVA-1.5-7B). Codes are available at\nhttps://github.com/HuangOwen/RoLoRA\n","authors":["Xijie Huang","Zechun Liu","Shih-Yang Liu","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.08044v2.pdf","comment":"EMNLP 2024 Findings, Codes: https://github.com/HuangOwen/RoLoRA,\n Models:\n https://huggingface.co/collections/ScarletAce/rolora-66f5f228a90681c7c4512b28"},{"id":"http://arxiv.org/abs/2409.18345v1","updated":"2024-09-26T23:46:15Z","published":"2024-09-26T23:46:15Z","title":"A Generalized LLM-Augmented BIM Framework: Application to a\n Speech-to-BIM system","summary":" Performing building information modeling (BIM) tasks is a complex process\nthat imposes a steep learning curve and a heavy cognitive load due to the\nnecessity of remembering sequences of numerous commands. With the rapid\nadvancement of large language models (LLMs), it is foreseeable that BIM tasks,\nincluding querying and managing BIM data, 4D and 5D BIM, design compliance\nchecking, or authoring a design, using written or spoken natural language\n(i.e., text-to-BIM or speech-to-BIM), will soon supplant traditional graphical\nuser interfaces. This paper proposes a generalized LLM-augmented BIM framework\nto expedite the development of LLM-enhanced BIM applications by providing a\nstep-by-step development process. The proposed framework consists of six steps:\ninterpret-fill-match-structure-execute-check. The paper demonstrates the\napplicability of the proposed framework through implementing a speech-to-BIM\napplication, NADIA-S (Natural-language-based Architectural Detailing through\nInteraction with Artificial Intelligence via Speech), using exterior wall\ndetailing as an example.\n","authors":["Ghang Lee","Suhyung Jang","Seokho Hyun"],"pdf_url":"https://arxiv.org/pdf/2409.18345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18339v1","updated":"2024-09-26T23:25:21Z","published":"2024-09-26T23:25:21Z","title":"AER-LLM: Ambiguity-aware Emotion Recognition Leveraging Large Language\n Models","summary":" Recent advancements in Large Language Models (LLMs) have demonstrated great\nsuccess in many Natural Language Processing (NLP) tasks. In addition to their\ncognitive intelligence, exploring their capabilities in emotional intelligence\nis also crucial, as it enables more natural and empathetic conversational AI.\nRecent studies have shown LLMs' capability in recognizing emotions, but they\noften focus on single emotion labels and overlook the complex and ambiguous\nnature of human emotions. This study is the first to address this gap by\nexploring the potential of LLMs in recognizing ambiguous emotions, leveraging\ntheir strong generalization capabilities and in-context learning. We design\nzero-shot and few-shot prompting and incorporate past dialogue as context\ninformation for ambiguous emotion recognition. Experiments conducted using\nthree datasets indicate significant potential for LLMs in recognizing ambiguous\nemotions, and highlight the substantial benefits of including context\ninformation. Furthermore, our findings indicate that LLMs demonstrate a high\ndegree of effectiveness in recognizing less ambiguous emotions and exhibit\npotential for identifying more ambiguous emotions, paralleling human perceptual\ncapabilities.\n","authors":["Xin Hong","Yuan Gong","Vidhyasaharan Sethu","Ting Dang"],"pdf_url":"https://arxiv.org/pdf/2409.18339v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18335v1","updated":"2024-09-26T23:16:47Z","published":"2024-09-26T23:16:47Z","title":"A Fairness-Driven Method for Learning Human-Compatible Negotiation\n Strategies","summary":" Despite recent advancements in AI and NLP, negotiation remains a difficult\ndomain for AI agents. Traditional game theoretic approaches that have worked\nwell for two-player zero-sum games struggle in the context of negotiation due\nto their inability to learn human-compatible strategies. On the other hand,\napproaches that only use human data tend to be domain-specific and lack the\ntheoretical guarantees provided by strategies grounded in game theory.\nMotivated by the notion of fairness as a criterion for optimality in general\nsum games, we propose a negotiation framework called FDHC which incorporates\nfairness into both the reward design and search to learn human-compatible\nnegotiation strategies. Our method includes a novel, RL+search technique called\nLGM-Zero which leverages a pre-trained language model to retrieve\nhuman-compatible offers from large action spaces. Our results show that our\nmethod is able to achieve more egalitarian negotiation outcomes and improve\nnegotiation quality.\n","authors":["Ryan Shea","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18335v1.pdf","comment":"EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2404.05892v4","updated":"2024-09-26T22:39:08Z","published":"2024-04-08T22:20:59Z","title":"Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence","summary":" We present Eagle (RWKV-5) and Finch (RWKV-6), sequence models improving upon\nthe RWKV (RWKV-4) architecture. Our architectural design advancements include\nmulti-headed matrix-valued states and a dynamic recurrence mechanism that\nimprove expressivity while maintaining the inference efficiency characteristics\nof RNNs. We introduce a new multilingual corpus with 1.12 trillion tokens and a\nfast tokenizer based on greedy matching for enhanced multilinguality. We\ntrained four Eagle models, ranging from 0.46 to 7.5 billion parameters, and two\nFinch models with 1.6 and 3.1 billion parameters and find that they achieve\ncompetitive performance across a wide variety of benchmarks. We release all our\nmodels on HuggingFace under the Apache 2.0 license. Models at:\nhttps://huggingface.co/RWKV Training code at: https://github.com/RWKV/RWKV-LM\nInference code at: https://github.com/RWKV/ChatRWKV Time-parallel training code\nat: https://github.com/RWKV/RWKV-infctx-trainer\n","authors":["Bo Peng","Daniel Goldstein","Quentin Anthony","Alon Albalak","Eric Alcaide","Stella Biderman","Eugene Cheah","Xingjian Du","Teddy Ferdinan","Haowen Hou","Przemysław Kazienko","Kranthi Kiran GV","Jan Kocoń","Bartłomiej Koptyra","Satyapriya Krishna","Ronald McClelland Jr.","Jiaju Lin","Niklas Muennighoff","Fares Obeid","Atsushi Saito","Guangyu Song","Haoqin Tu","Cahya Wirawan","Stanisław Woźniak","Ruichong Zhang","Bingchen Zhao","Qihang Zhao","Peng Zhou","Jian Zhu","Rui-Jie Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.05892v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13994v2","updated":"2024-09-26T22:24:13Z","published":"2024-09-21T03:09:10Z","title":"Contrastive Learning for Knowledge-Based Question Generation in Large\n Language Models","summary":" With the rapid development of artificial intelligence technology, especially\nthe increasingly widespread application of question-and-answer systems,\nhigh-quality question generation has become a key component in supporting the\ndevelopment of these systems. This article focuses on knowledge-based question\ngeneration technology, which aims to enable computers to simulate the human\nquestioning process based on understanding specific texts or knowledge bases.\nIn light of the issues of hallucination and knowledge gaps present in\nlarge-scale language models when applied to knowledge-intensive tasks, this\npaper proposes an enhanced question generation method that incorporates\ncontrastive learning. This method utilizes multiple models to jointly mine\ndomain knowledge and uses contrastive learning to guide the model in reducing\nnoise and hallucinations in generation. Experimental results show that by\ndesigning prompts containing contrasting examples, the model's performance in\nquestion generation improves considerably, particularly when contrasting\ninstructions and examples are used simultaneously, leading to the highest\nquality of generated questions and improved accuracy. These results demonstrate\nthat the method proposed in this study, which combines contrasting context and\nchain-of-thought prompts, can effectively improve both the quality and the\npracticality of question generation.\n","authors":["Zhenhong Zhang","Jiajing Chen","Weiyan Shi","Lingjie Yi","Chihang Wang","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2409.13994v2.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.18319v1","updated":"2024-09-26T21:59:11Z","published":"2024-09-26T21:59:11Z","title":"Cross-Institutional Structured Radiology Reporting for Lung Cancer\n Screening Using a Dynamic Template-Constrained Large Language Model","summary":" Structured radiology reporting is advantageous for optimizing clinical\nworkflows and patient outcomes. Current LLMs in creating structured reports\nface the challenges of formatting errors, content hallucinations, and privacy\nleakage concerns when uploaded to external servers. We aim to develop an\nenhanced open-source LLM for creating structured and standardized LCS reports\nfrom free-text descriptions. After institutional IRB approvals, 5,442\nde-identified LCS reports from two institutions were retrospectively analyzed.\n500 reports were randomly selected from the two institutions evenly and then\nmanually labeled for evaluation. Two radiologists from the two institutions\ndeveloped a standardized template including 29 features for lung nodule\nreporting. We proposed template-constrained decoding to enhance\nstate-of-the-art open-source LLMs, including LLAMA, Qwen, and Mistral. The LLM\nperformance was extensively evaluated in terms of F1 score, confidence\ninterval, McNemar test, and z-test. Based on the structured reports created\nfrom the large-scale dataset, a nodule-level retrieval system was prototyped\nand an automatic statistical analysis was performed. Our software,\nvLLM-structure, is publicly available for local deployment with enhanced LLMs.\nOur template-constrained decoding approach consistently enhanced the LLM\nperformance on multi-institutional datasets, with neither formatting errors nor\ncontent hallucinations. Our method improved the best open-source LLAMA-3.1 405B\nby up to 10.42%, and outperformed GPT-4o by 17.19%. A novel nodule retrieval\nsystem was successfully prototyped and demonstrated on a large-scale multimodal\ndatabase using our enhanced LLM technologies. The automatically derived\nstatistical distributions were closely consistent with the prior findings in\nterms of nodule type, location, size, status, and Lung-RADS.\n","authors":["Chuang Niu","Parisa Kaviani","Qing Lyu","Mannudeep K. Kalra","Christopher T. Whitlow","Ge Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12830v2","updated":"2024-09-26T21:46:00Z","published":"2024-06-18T17:51:24Z","title":"What Are the Odds? Language Models Are Capable of Probabilistic\n Reasoning","summary":" Language models (LM) are capable of remarkably complex linguistic tasks;\nhowever, numerical reasoning is an area in which they frequently struggle. An\nimportant but rarely evaluated form of reasoning is understanding probability\ndistributions. In this paper, we focus on evaluating the probabilistic\nreasoning capabilities of LMs using idealized and real-world statistical\ndistributions. We perform a systematic evaluation of state-of-the-art LMs on\nthree tasks: estimating percentiles, drawing samples, and calculating\nprobabilities. We evaluate three ways to provide context to LMs 1) anchoring\nexamples from within a distribution or family of distributions, 2) real-world\ncontext, 3) summary statistics on which to base a Normal approximation. Models\ncan make inferences about distributions, and can be further aided by the\nincorporation of real-world context, example shots and simplified assumptions,\neven if these assumptions are incorrect or misspecified. To conduct this work,\nwe developed a comprehensive benchmark distribution dataset with associated\nquestion-answer pairs that we will release publicly.\n","authors":["Akshay Paruchuri","Jake Garrison","Shun Liao","John Hernandez","Jacob Sunshine","Tim Althoff","Xin Liu","Daniel McDuff"],"pdf_url":"https://arxiv.org/pdf/2406.12830v2.pdf","comment":"21 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.18314v1","updated":"2024-09-26T21:44:20Z","published":"2024-09-26T21:44:20Z","title":"Realistic Evaluation of Model Merging for Compositional Generalization","summary":" Merging has become a widespread way to cheaply combine individual models into\na single model that inherits their capabilities and attains better performance.\nThis popularity has spurred rapid development of many new merging methods,\nwhich are typically validated in disparate experimental settings and frequently\ndiffer in the assumptions made about model architecture, data availability, and\ncomputational budget. In this work, we characterize the relative merits of\ndifferent merging methods by evaluating them in a shared experimental setting\nand precisely identifying the practical requirements of each method.\nSpecifically, our setting focuses on using merging for compositional\ngeneralization of capabilities in image classification, image generation, and\nnatural language processing. Additionally, we measure the computational costs\nof different merging methods as well as how they perform when scaling the\nnumber of models being merged. Taken together, our results clarify the state of\nthe field of model merging and provide a comprehensive and rigorous\nexperimental setup to test new methods.\n","authors":["Derek Tam","Yash Kant","Brian Lester","Igor Gilitschenski","Colin Raffel"],"pdf_url":"https://arxiv.org/pdf/2409.18314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18286v1","updated":"2024-09-26T20:58:11Z","published":"2024-09-26T20:58:11Z","title":"Advancing Object Detection in Transportation with Multimodal Large\n Language Models (MLLMs): A Comprehensive Review and Empirical Testing","summary":" This study aims to comprehensively review and empirically evaluate the\napplication of multimodal large language models (MLLMs) and Large Vision Models\n(VLMs) in object detection for transportation systems. In the first fold, we\nprovide a background about the potential benefits of MLLMs in transportation\napplications and conduct a comprehensive review of current MLLM technologies in\nprevious studies. We highlight their effectiveness and limitations in object\ndetection within various transportation scenarios. The second fold involves\nproviding an overview of the taxonomy of end-to-end object detection in\ntransportation applications and future directions. Building on this, we\nproposed empirical analysis for testing MLLMs on three real-world\ntransportation problems that include object detection tasks namely, road safety\nattributes extraction, safety-critical event detection, and visual reasoning of\nthermal images. Our findings provide a detailed assessment of MLLM performance,\nuncovering both strengths and areas for improvement. Finally, we discuss\npractical limitations and challenges of MLLMs in enhancing object detection in\ntransportation, thereby offering a roadmap for future research and development\nin this critical area.\n","authors":["Huthaifa I. Ashqar","Ahmed Jaber","Taqwa I. Alhadidi","Mohammed Elhenawy"],"pdf_url":"https://arxiv.org/pdf/2409.18286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16383v2","updated":"2024-09-26T20:26:24Z","published":"2024-09-24T18:35:09Z","title":"RISCORE: Enhancing In-Context Riddle Solving in Language Models through\n Context-Reconstructed Example Augmentation","summary":" Riddle-solving requires advanced reasoning skills, pushing LLMs to engage in\nabstract thinking and creative problem-solving, often revealing limitations in\ntheir cognitive abilities. In this paper, we examine the riddle-solving\ncapabilities of LLMs using a multiple-choice format, exploring how different\nprompting techniques impact performance on riddles that demand diverse\nreasoning skills. To enhance results, we introduce RISCORE (RIddle Solving with\nCOntext REcontruciton) a novel fully automated prompting method that generates\nand utilizes contextually reconstructed sentence-based puzzles in conjunction\nwith the original examples to create few-shot exemplars. Our experiments\ndemonstrate that RISCORE significantly improves the performance of language\nmodels in both vertical and lateral thinking tasks, surpassing traditional\nexemplar selection strategies across a variety of few-shot settings.\n","authors":["Ioannis Panagiotopoulos","Giorgos Filandrianos","Maria Lymperaiou","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2409.16383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18263v1","updated":"2024-09-26T20:15:46Z","published":"2024-09-26T20:15:46Z","title":"DisGeM: Distractor Generation for Multiple Choice Questions with Span\n Masking","summary":" Recent advancements in Natural Language Processing (NLP) have impacted\nnumerous sub-fields such as natural language generation, natural language\ninference, question answering, and more. However, in the field of question\ngeneration, the creation of distractors for multiple-choice questions (MCQ)\nremains a challenging task. In this work, we present a simple, generic\nframework for distractor generation using readily available Pre-trained\nLanguage Models (PLMs). Unlike previous methods, our framework relies solely on\npre-trained language models and does not require additional training on\nspecific datasets. Building upon previous research, we introduce a two-stage\nframework consisting of candidate generation and candidate selection. Our\nproposed distractor generation framework outperforms previous methods without\nthe need for training or fine-tuning. Human evaluations confirm that our\napproach produces more effective and engaging distractors. The related codebase\nis publicly available at https://github.com/obss/disgem.\n","authors":["Devrim Cavusoglu","Secil Sen","Ulas Sert"],"pdf_url":"https://arxiv.org/pdf/2409.18263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02369v2","updated":"2024-09-26T19:07:30Z","published":"2024-01-04T17:23:44Z","title":"SPEER: Sentence-Level Planning of Long Clinical Summaries via Embedded\n Entity Retrieval","summary":" Clinician must write a lengthy summary each time a patient is discharged from\nthe hospital. This task is time-consuming due to the sheer number of unique\nclinical concepts covered in the admission. Identifying and covering salient\nentities is vital for the summary to be clinically useful. We fine-tune\nopen-source LLMs (Mistral-7B-Instruct and Zephyr-7B-beta) on the task and find\nthat they generate incomplete and unfaithful summaries. To increase entity\ncoverage, we train a smaller, encoder-only model to predict salient entities,\nwhich are treated as content-plans to guide the LLM. To encourage the LLM to\nfocus on specific mentions in the source notes, we propose SPEER:\nSentence-level Planning via Embedded Entity Retrieval. Specifically, we mark\neach salient entity span with special \"{{ }}\" boundary tags and instruct the\nLLM to retrieve marked spans before generating each sentence. Sentence-level\nplanning acts as a form of state tracking in that the model is explicitly\nrecording the entities it uses. We fine-tune Mistral and Zephyr variants on a\nlarge-scale, diverse dataset of ~167k in-patient hospital admissions and\nevaluate on 3 datasets. SPEER shows gains in both coverage and faithfulness\nmetrics over non-guided and guided baselines.\n","authors":["Griffin Adams","Jason Zucker","Noémie Elhadad"],"pdf_url":"https://arxiv.org/pdf/2401.02369v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2409.18216v1","updated":"2024-09-26T18:51:46Z","published":"2024-09-26T18:51:46Z","title":"MMMT-IF: A Challenging Multimodal Multi-Turn Instruction Following\n Benchmark","summary":" Evaluating instruction following capabilities for multimodal, multi-turn\ndialogue is challenging. With potentially multiple instructions in the input\nmodel context, the task is time-consuming for human raters and we show LLM\nbased judges are biased towards answers from the same model. We propose\nMMMT-IF, an image based multi-turn Q$\\&$A evaluation set with added global\ninstructions between questions, constraining the answer format. This challenges\nmodels to retrieve instructions dispersed across long dialogues and reason\nunder instruction constraints. All instructions are objectively verifiable\nthrough code execution. We introduce the Programmatic Instruction Following\n($\\operatorname{PIF}$) metric to measure the fraction of the instructions that\nare correctly followed while performing a reasoning task. The\n$\\operatorname{PIF-N-K}$ set of metrics further evaluates robustness by\nmeasuring the fraction of samples in a corpus where, for each sample, at least\nK out of N generated model responses achieve a $\\operatorname{PIF}$ score of\none. The $\\operatorname{PIF}$ metric aligns with human instruction following\nratings, showing 60 percent correlation. Experiments show Gemini 1.5 Pro,\nGPT-4o, and Claude 3.5 Sonnet, have a $\\operatorname{PIF}$ metric that drops\nfrom 0.81 on average at turn 1 across the models, to 0.64 at turn 20. Across\nall turns, when each response is repeated 4 times ($\\operatorname{PIF-4-4}$),\nGPT-4o and Gemini successfully follow all instructions only $11\\%$ of the time.\nWhen all the instructions are also appended to the end of the model input\ncontext, the $\\operatorname{PIF}$ metric improves by 22.3 points on average,\nshowing that the challenge with the task lies not only in following the\ninstructions, but also in retrieving the instructions spread out in the model\ncontext. We plan to open source the MMMT-IF dataset and metric computation\ncode.\n","authors":["Elliot L. Epstein","Kaisheng Yao","Jing Li","Xinyi Bai","Hamid Palangi"],"pdf_url":"https://arxiv.org/pdf/2409.18216v1.pdf","comment":"24 pages, 16 figures"},{"id":"http://arxiv.org/abs/2311.12015v4","updated":"2024-09-26T18:35:52Z","published":"2023-11-20T18:54:39Z","title":"GPT-4V(ision) for Robotics: Multimodal Task Planning from Human\n Demonstration","summary":" We introduce a pipeline that enhances a general-purpose Vision Language\nModel, GPT-4V(ision), to facilitate one-shot visual teaching for robotic\nmanipulation. This system analyzes videos of humans performing tasks and\noutputs executable robot programs that incorporate insights into affordances.\nThe process begins with GPT-4V analyzing the videos to obtain textual\nexplanations of environmental and action details. A GPT-4-based task planner\nthen encodes these details into a symbolic task plan. Subsequently, vision\nsystems spatially and temporally ground the task plan in the videos. Objects\nare identified using an open-vocabulary object detector, and hand-object\ninteractions are analyzed to pinpoint moments of grasping and releasing. This\nspatiotemporal grounding allows for the gathering of affordance information\n(e.g., grasp types, waypoints, and body postures) critical for robot execution.\nExperiments across various scenarios demonstrate the method's efficacy in\nenabling real robots to operate from one-shot human demonstrations. Meanwhile,\nquantitative tests have revealed instances of hallucination in GPT-4V,\nhighlighting the importance of incorporating human supervision within the\npipeline. The prompts of GPT-4V/GPT-4 are available at this project page:\nhttps://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.12015v4.pdf","comment":"8 pages, 10 figures, 3 tables. Published in IEEE Robotics and\n Automation Letters (RA-L) (in press). Last updated on September 26th, 2024"},{"id":"http://arxiv.org/abs/2409.18203v1","updated":"2024-09-26T18:34:16Z","published":"2024-09-26T18:34:16Z","title":"AI Policy Projector: Grounding LLM Policy Design in Iterative Mapmaking","summary":" Whether a large language model policy is an explicit constitution or an\nimplicit reward model, it is challenging to assess coverage over the unbounded\nset of real-world situations that a policy must contend with. We introduce an\nAI policy design process inspired by mapmaking, which has developed tactics for\nvisualizing and iterating on maps even when full coverage is not possible. With\nPolicy Projector, policy designers can survey the landscape of model\ninput-output pairs, define custom regions (e.g., \"violence\"), and navigate\nthese regions with rules that can be applied to LLM outputs (e.g., if output\ncontains \"violence\" and \"graphic details,\" then rewrite without \"graphic\ndetails\"). Policy Projector supports interactive policy authoring using LLM\nclassification and steering and a map visualization reflecting the policy\ndesigner's work. In an evaluation with 12 AI safety experts, our system helps\npolicy designers to address problematic model behaviors extending beyond an\nexisting, comprehensive harm taxonomy.\n","authors":["Michelle S. Lam","Fred Hohman","Dominik Moritz","Jeffrey P. Bigham","Kenneth Holstein","Mary Beth Kery"],"pdf_url":"https://arxiv.org/pdf/2409.18203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18199v1","updated":"2024-09-26T18:29:10Z","published":"2024-09-26T18:29:10Z","title":"LangSAMP: Language-Script Aware Multilingual Pretraining","summary":" Recent multilingual pretrained language models (mPLMs) often avoid using\nlanguage embeddings -- learnable vectors assigned to different languages. These\nembeddings are discarded for two main reasons: (1) mPLMs are expected to have a\nsingle, unified parameter set across all languages, and (2) they need to\nfunction seamlessly as universal text encoders without requiring language IDs\nas input. However, this removal increases the burden on token embeddings to\nencode all language-specific information, which may hinder the model's ability\nto produce more language-neutral representations. To address this challenge, we\npropose Language-Script Aware Multilingual Pretraining (LangSAMP), a method\nthat incorporates both language and script embeddings to enhance representation\nlearning while maintaining a simple architecture. Specifically, we integrate\nthese embeddings into the output of the transformer blocks before passing the\nfinal representations to the language modeling head for prediction. We apply\nLangSAMP to the continual pretraining of XLM-R on a highly multilingual corpus\ncovering more than 500 languages. The resulting model consistently outperforms\nthe baseline. Extensive analysis further shows that language/script embeddings\nencode language/script-specific information, which improves the selection of\nsource languages for crosslingual transfer. We make our code and models\npublicly available at \\url{https://github.com/cisnlp/LangSAMP}.\n","authors":["Yihong Liu","Haotian Ye","Chunlan Ma","Mingyang Wang","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2409.18199v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.18193v1","updated":"2024-09-26T18:10:26Z","published":"2024-09-26T18:10:26Z","title":"LowREm: A Repository of Word Embeddings for 87 Low-Resource Languages\n Enhanced with Multilingual Graph Knowledge","summary":" Contextualized embeddings based on large language models (LLMs) are available\nfor various languages, but their coverage is often limited for lower resourced\nlanguages. Training LLMs for such languages is often difficult due to\ninsufficient data and high computational cost. Especially for very low resource\nlanguages, static word embeddings thus still offer a viable alternative. There\nis, however, a notable lack of comprehensive repositories with such embeddings\nfor diverse languages. To address this, we present LowREm, a centralized\nrepository of static embeddings for 87 low-resource languages. We also propose\na novel method to enhance GloVe-based embeddings by integrating multilingual\ngraph knowledge, utilizing another source of knowledge. We demonstrate the\nsuperior performance of our enhanced embeddings as compared to contextualized\nembeddings extracted from XLM-R on sentiment analysis. Our code and data are\npublicly available under https://huggingface.co/DFKI.\n","authors":["Daniil Gurgurov","Rishu Kumar","Simon Ostermann"],"pdf_url":"https://arxiv.org/pdf/2409.18193v1.pdf","comment":"Short paper, preview"},{"id":"http://arxiv.org/abs/2409.18170v1","updated":"2024-09-26T17:58:26Z","published":"2024-09-26T17:58:26Z","title":"Evaluation of Large Language Models for Summarization Tasks in the\n Medical Domain: A Narrative Review","summary":" Large Language Models have advanced clinical Natural Language Generation,\ncreating opportunities to manage the volume of medical text. However, the\nhigh-stakes nature of medicine requires reliable evaluation, which remains a\nchallenge. In this narrative review, we assess the current evaluation state for\nclinical summarization tasks and propose future directions to address the\nresource constraints of expert human evaluation.\n","authors":["Emma Croxford","Yanjun Gao","Nicholas Pellegrino","Karen K. Wong","Graham Wills","Elliot First","Frank J. Liao","Cherodeep Goswami","Brian Patterson","Majid Afshar"],"pdf_url":"https://arxiv.org/pdf/2409.18170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18164v1","updated":"2024-09-26T17:30:28Z","published":"2024-09-26T17:30:28Z","title":"Data-Prep-Kit: getting your data ready for LLM application development","summary":" Data preparation is the first and a very important step towards any Large\nLanguage Model (LLM) development. This paper introduces an easy-to-use,\nextensible, and scale-flexible open-source data preparation toolkit called Data\nPrep Kit (DPK). DPK is architected and designed to enable users to scale their\ndata preparation to their needs. With DPK they can prepare data on a local\nmachine or effortlessly scale to run on a cluster with thousands of CPU Cores.\nDPK comes with a highly scalable, yet extensible set of modules that transform\nnatural language and code data. If the user needs additional transforms, they\ncan be easily developed using extensive DPK support for transform creation.\nThese modules can be used independently or pipelined to perform a series of\noperations. In this paper, we describe DPK architecture and show its\nperformance from a small scale to a very large number of CPUs. The modules from\nDPK have been used for the preparation of Granite Models [1] [2]. We believe\nDPK is a valuable contribution to the AI community to easily prepare data to\nenhance the performance of their LLM models or to fine-tune models with\nRetrieval-Augmented Generation (RAG).\n","authors":["David Wood","Boris Lublinsky","Alexy Roytman","Shivdeep Singh","Abdulhamid Adebayo","Revital Eres","Mohammad Nassar","Hima Patel","Yousaf Shah","Constantin Adam","Petros Zerfos","Nirmit Desai","Daiki Tsuzuku","Takuya Goto","Michele Dolfi","Saptha Surendran","Paramesvaran Selvam","Sungeun An","Yuan Chi Chang","Dhiraj Joshi","Hajar Emami-Gohari","Xuan-Hong Dang","Yan Koyfman","Shahrokh Daijavad"],"pdf_url":"https://arxiv.org/pdf/2409.18164v1.pdf","comment":"10 pages, 7 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.18128v1","updated":"2024-09-26T17:59:51Z","published":"2024-09-26T17:59:51Z","title":"FlowTurbo: Towards Real-time Flow-Based Image Generation with Velocity\n Refiner","summary":" Building on the success of diffusion models in visual generation, flow-based\nmodels reemerge as another prominent family of generative models that have\nachieved competitive or better performance in terms of both visual quality and\ninference speed. By learning the velocity field through flow-matching,\nflow-based models tend to produce a straighter sampling trajectory, which is\nadvantageous during the sampling process. However, unlike diffusion models for\nwhich fast samplers are well-developed, efficient sampling of flow-based\ngenerative models has been rarely explored. In this paper, we propose a\nframework called FlowTurbo to accelerate the sampling of flow-based models\nwhile still enhancing the sampling quality. Our primary observation is that the\nvelocity predictor's outputs in the flow-based models will become stable during\nthe sampling, enabling the estimation of velocity via a lightweight velocity\nrefiner. Additionally, we introduce several techniques including a pseudo\ncorrector and sample-aware compilation to further reduce inference time. Since\nFlowTurbo does not change the multi-step sampling paradigm, it can be\neffectively applied for various tasks such as image editing, inpainting, etc.\nBy integrating FlowTurbo into different flow-based models, we obtain an\nacceleration ratio of 53.1%$\\sim$58.3% on class-conditional generation and\n29.8%$\\sim$38.5% on text-to-image generation. Notably, FlowTurbo reaches an FID\nof 2.12 on ImageNet with 100 (ms / img) and FID of 3.93 with 38 (ms / img),\nachieving the real-time image generation and establishing the new\nstate-of-the-art. Code is available at https://github.com/shiml20/FlowTurbo.\n","authors":["Wenliang Zhao","Minglei Shi","Xumin Yu","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18128v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18127v1","updated":"2024-09-26T17:59:31Z","published":"2024-09-26T17:59:31Z","title":"EgoLM: Multi-Modal Language Model of Egocentric Motions","summary":" As the prevalence of wearable devices, learning egocentric motions becomes\nessential to develop contextual AI. In this work, we present EgoLM, a versatile\nframework that tracks and understands egocentric motions from multi-modal\ninputs, e.g., egocentric videos and motion sensors. EgoLM exploits rich\ncontexts for the disambiguation of egomotion tracking and understanding, which\nare ill-posed under single modality conditions. To facilitate the versatile and\nmulti-modal framework, our key insight is to model the joint distribution of\negocentric motions and natural languages using large language models (LLM).\nMulti-modal sensor inputs are encoded and projected to the joint latent space\nof language models, and used to prompt motion generation or text generation for\negomotion tracking or understanding, respectively. Extensive experiments on\nlarge-scale multi-modal human motion dataset validate the effectiveness of\nEgoLM as a generalist model for universal egocentric learning.\n","authors":["Fangzhou Hong","Vladimir Guzov","Hyo Jin Kim","Yuting Ye","Richard Newcombe","Ziwei Liu","Lingni Ma"],"pdf_url":"https://arxiv.org/pdf/2409.18127v1.pdf","comment":"Project Page: https://hongfz16.github.io/projects/EgoLM"},{"id":"http://arxiv.org/abs/2409.18125v1","updated":"2024-09-26T17:59:11Z","published":"2024-09-26T17:59:11Z","title":"LLaVA-3D: A Simple yet Effective Pathway to Empowering LMMs with\n 3D-awareness","summary":" Recent advancements in Large Multimodal Models (LMMs) have greatly enhanced\ntheir proficiency in 2D visual understanding tasks, enabling them to\neffectively process and understand images and videos. However, the development\nof LMMs with 3D-awareness for 3D scene understanding has been hindered by the\nlack of large-scale 3D vision-language datasets and powerful 3D encoders. In\nthis paper, we introduce a simple yet effective framework called LLaVA-3D.\nLeveraging the strong 2D understanding priors from LLaVA, our LLaVA-3D\nefficiently adapts LLaVA for 3D scene understanding without compromising 2D\nunderstanding capabilities. To achieve this, we employ a simple yet effective\nrepresentation, 3D Patch, which connects 2D CLIP patch features with their\ncorresponding positions in 3D space. By integrating the 3D Patches into 2D LMMs\nand employing joint 2D and 3D vision-language instruction tuning, we establish\na unified architecture for both 2D image understanding and 3D scene\nunderstanding. Experimental results show that LLaVA-3D converges 3.5x faster\nthan existing 3D LMMs when trained on 3D vision-language datasets. Moreover,\nLLaVA-3D not only achieves state-of-the-art performance across various 3D tasks\nbut also maintains comparable 2D image understanding and vision-language\nconversation capabilities with LLaVA.\n","authors":["Chenming Zhu","Tai Wang","Wenwei Zhang","Jiangmiao Pang","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18125v1.pdf","comment":"Project page: https://zcmax.github.io/projects/LLaVA-3D/"},{"id":"http://arxiv.org/abs/2409.18124v1","updated":"2024-09-26T17:58:55Z","published":"2024-09-26T17:58:55Z","title":"Lotus: Diffusion-based Visual Foundation Model for High-quality Dense\n Prediction","summary":" Leveraging the visual priors of pre-trained text-to-image diffusion models\noffers a promising solution to enhance zero-shot generalization in dense\nprediction tasks. However, existing methods often uncritically use the original\ndiffusion formulation, which may not be optimal due to the fundamental\ndifferences between dense prediction and image generation. In this paper, we\nprovide a systemic analysis of the diffusion formulation for the dense\nprediction, focusing on both quality and efficiency. And we find that the\noriginal parameterization type for image generation, which learns to predict\nnoise, is harmful for dense prediction; the multi-step noising/denoising\ndiffusion process is also unnecessary and challenging to optimize. Based on\nthese insights, we introduce Lotus, a diffusion-based visual foundation model\nwith a simple yet effective adaptation protocol for dense prediction.\nSpecifically, Lotus is trained to directly predict annotations instead of\nnoise, thereby avoiding harmful variance. We also reformulate the diffusion\nprocess into a single-step procedure, simplifying optimization and\nsignificantly boosting inference speed. Additionally, we introduce a novel\ntuning strategy called detail preserver, which achieves more accurate and\nfine-grained predictions. Without scaling up the training data or model\ncapacity, Lotus achieves SoTA performance in zero-shot depth and normal\nestimation across various datasets. It also significantly enhances efficiency,\nbeing hundreds of times faster than most existing diffusion-based methods.\n","authors":["Jing He","Haodong Li","Wei Yin","Yixun Liang","Leheng Li","Kaiqiang Zhou","Hongbo Liu","Bingbing Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18124v1.pdf","comment":"Project page: https://lotus3d.github.io/"},{"id":"http://arxiv.org/abs/2409.18121v1","updated":"2024-09-26T17:57:16Z","published":"2024-09-26T17:57:16Z","title":"Robot See Robot Do: Imitating Articulated Object Manipulation with\n Monocular 4D Reconstruction","summary":" Humans can learn to manipulate new objects by simply watching others;\nproviding robots with the ability to learn from such demonstrations would\nenable a natural interface specifying new behaviors. This work develops Robot\nSee Robot Do (RSRD), a method for imitating articulated object manipulation\nfrom a single monocular RGB human demonstration given a single static\nmulti-view object scan. We first propose 4D Differentiable Part Models\n(4D-DPM), a method for recovering 3D part motion from a monocular video with\ndifferentiable rendering. This analysis-by-synthesis approach uses part-centric\nfeature fields in an iterative optimization which enables the use of geometric\nregularizers to recover 3D motions from only a single video. Given this 4D\nreconstruction, the robot replicates object trajectories by planning bimanual\narm motions that induce the demonstrated object part motion. By representing\ndemonstrations as part-centric trajectories, RSRD focuses on replicating the\ndemonstration's intended behavior while considering the robot's own\nmorphological limits, rather than attempting to reproduce the hand's motion. We\nevaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part\ntrajectories and RSRD's physical execution performance on 9 objects across 10\ntrials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of\n87% success rate, for a total end-to-end success rate of 60% across 90 trials.\nNotably, this is accomplished using only feature fields distilled from large\npretrained vision models -- without any task-specific training, fine-tuning,\ndataset collection, or annotation. Project page:\nhttps://robot-see-robot-do.github.io\n","authors":["Justin Kerr","Chung Min Kim","Mingxuan Wu","Brent Yi","Qianqian Wang","Ken Goldberg","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2409.18121v1.pdf","comment":"CoRL 2024, Project page: https://robot-see-robot-do.github.io"},{"id":"http://arxiv.org/abs/2409.18120v1","updated":"2024-09-26T17:57:15Z","published":"2024-09-26T17:57:15Z","title":"EvMAPPER: High Altitude Orthomapping with Event Cameras","summary":" Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to\ncollect images about the world below. One of the most successful applications\nof UAVs is to generate orthomosaics or orthomaps, in which a series of images\nare integrated together to develop a larger map. However, the use of CMOS-based\ncameras with global or rolling shutters mean that orthomaps are vulnerable to\nchallenging light conditions, motion blur, and high-speed motion of\nindependently moving objects under the camera. Event cameras are less sensitive\nto these issues, as their pixels are able to trigger asynchronously on\nbrightness changes. This work introduces the first orthomosaic approach using\nevent cameras. In contrast to existing methods relying only on CMOS cameras,\nour approach enables map generation even in challenging light conditions,\nincluding direct sunlight and after sunset.\n","authors":["Fernando Cladera","Kenneth Chaney","M. Ani Hsieh","Camillo J. Taylor","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.18120v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2409.18114v1","updated":"2024-09-26T17:55:02Z","published":"2024-09-26T17:55:02Z","title":"EdgeRunner: Auto-regressive Auto-encoder for Artistic Mesh Generation","summary":" Current auto-regressive mesh generation methods suffer from issues such as\nincompleteness, insufficient detail, and poor generalization. In this paper, we\npropose an Auto-regressive Auto-encoder (ArAE) model capable of generating\nhigh-quality 3D meshes with up to 4,000 faces at a spatial resolution of\n$512^3$. We introduce a novel mesh tokenization algorithm that efficiently\ncompresses triangular meshes into 1D token sequences, significantly enhancing\ntraining efficiency. Furthermore, our model compresses variable-length\ntriangular meshes into a fixed-length latent space, enabling training latent\ndiffusion models for better generalization. Extensive experiments demonstrate\nthe superior quality, diversity, and generalization capabilities of our model\nin both point cloud and image-conditioned mesh generation tasks.\n","authors":["Jiaxiang Tang","Zhaoshuo Li","Zekun Hao","Xian Liu","Gang Zeng","Ming-Yu Liu","Qinsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18114v1.pdf","comment":"Project Page: https://research.nvidia.com/labs/dir/edgerunner/"},{"id":"http://arxiv.org/abs/2409.18111v1","updated":"2024-09-26T17:53:04Z","published":"2024-09-26T17:53:04Z","title":"E.T. Bench: Towards Open-Ended Event-Level Video-Language Understanding","summary":" Recent advances in Video Large Language Models (Video-LLMs) have demonstrated\ntheir great potential in general-purpose video understanding. To verify the\nsignificance of these models, a number of benchmarks have been proposed to\ndiagnose their capabilities in different scenarios. However, existing\nbenchmarks merely evaluate models through video-level question-answering,\nlacking fine-grained event-level assessment and task diversity. To fill this\ngap, we introduce E.T. Bench (Event-Level & Time-Sensitive Video Understanding\nBenchmark), a large-scale and high-quality benchmark for open-ended event-level\nvideo understanding. Categorized within a 3-level task taxonomy, E.T. Bench\nencompasses 7.3K samples under 12 tasks with 7K videos (251.4h total length)\nunder 8 domains, providing comprehensive evaluations. We extensively evaluated\n8 Image-LLMs and 12 Video-LLMs on our benchmark, and the results reveal that\nstate-of-the-art models for coarse-level (video-level) understanding struggle\nto solve our fine-grained tasks, e.g., grounding event-of-interests within\nvideos, largely due to the short video context length, improper time\nrepresentations, and lack of multi-event training data. Focusing on these\nissues, we further propose a strong baseline model, E.T. Chat, together with an\ninstruction-tuning dataset E.T. Instruct 164K tailored for fine-grained\nevent-level understanding. Our simple but effective solution demonstrates\nsuperior performance in multiple scenarios.\n","authors":["Ye Liu","Zongyang Ma","Zhongang Qi","Yang Wu","Ying Shan","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18111v1.pdf","comment":"Accepted to NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18102v1","updated":"2024-09-26T17:45:10Z","published":"2024-09-26T17:45:10Z","title":"MALPOLON: A Framework for Deep Species Distribution Modeling","summary":" This paper describes a deep-SDM framework, MALPOLON. Written in Python and\nbuilt upon the PyTorch library, this framework aims to facilitate training and\ninferences of deep species distribution models (deep-SDM) and sharing for users\nwith only general Python language skills (e.g., modeling ecologists) who are\ninterested in testing deep learning approaches to build new SDMs. More advanced\nusers can also benefit from the framework's modularity to run more specific\nexperiments by overriding existing classes while taking advantage of\npress-button examples to train neural networks on multiple classification tasks\nusing custom or provided raw and pre-processed datasets. The framework is\nopen-sourced on GitHub and PyPi along with extensive documentation and examples\nof use in various scenarios. MALPOLON offers straightforward installation,\nYAML-based configuration, parallel computing, multi-GPU utilization, baseline\nand foundational models for benchmarking, and extensive\ntutorials/documentation, aiming to enhance accessibility and performance\nscalability for ecologists and researchers.\n","authors":["Theo Larcher","Lukas Picek","Benjamin Deneu","Titouan Lorieul","Maximilien Servajean","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2409.18102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18101v1","updated":"2024-09-26T17:44:52Z","published":"2024-09-26T17:44:52Z","title":"AI-Powered Augmented Reality for Satellite Assembly, Integration and\n Test","summary":" The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is\nset to transform satellite Assembly, Integration, and Testing (AIT) processes\nby enhancing precision, minimizing human error, and improving operational\nefficiency in cleanroom environments. This paper presents a technical\ndescription of the European Space Agency's (ESA) project \"AI for AR in\nSatellite AIT,\" which combines real-time computer vision and AR systems to\nassist technicians during satellite assembly. Leveraging Microsoft HoloLens 2\nas the AR interface, the system delivers context-aware instructions and\nreal-time feedback, tackling the complexities of object recognition and 6D pose\nestimation in AIT workflows. All AI models demonstrated over 70% accuracy, with\nthe detection model exceeding 95% accuracy, indicating a high level of\nperformance and reliability. A key contribution of this work lies in the\neffective use of synthetic data for training AI models in AR applications,\naddressing the significant challenges of obtaining real-world datasets in\nhighly dynamic satellite environments, as well as the creation of the Segmented\nAnything Model for Automatic Labelling (SAMAL), which facilitates the automatic\nannotation of real data, achieving speeds up to 20 times faster than manual\nhuman annotation. The findings demonstrate the efficacy of AI-driven AR systems\nin automating critical satellite assembly tasks, setting a foundation for\nfuture innovations in the space industry.\n","authors":["Alvaro Patricio","Joao Valente","Atabak Dehban","Ines Cadilha","Daniel Reis","Rodrigo Ventura"],"pdf_url":"https://arxiv.org/pdf/2409.18101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18100v1","updated":"2024-09-26T17:44:29Z","published":"2024-09-26T17:44:29Z","title":"Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine\n Segmentation","summary":" Self-supervised pretraining (SSP) has shown promising results in learning\nfrom large unlabeled datasets and, thus, could be useful for automated\ncardiovascular magnetic resonance (CMR) short-axis cine segmentation. However,\ninconsistent reports of the benefits of SSP for segmentation have made it\ndifficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP\nmethods for CMR cine segmentation.\n To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were\nused for unlabeled pretraining with four SSP methods; SimCLR, positional\ncontrastive learning, DINO, and masked image modeling (MIM). Subsets of varying\nnumbers of subjects were used for supervised fine-tuning of 2D models for each\nSSP method, as well as to train a 2D baseline model from scratch. The\nfine-tuned models were compared to the baseline using the 3D Dice similarity\ncoefficient (DSC) in a test dataset of 140 subjects.\n The SSP methods showed no performance gains with the largest supervised\nfine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects\n(231 2D slices) are available for supervised training, SSP using MIM (DSC =\n0.86) improves over training from scratch (DSC = 0.82).\n This study found that SSP is valuable for CMR cine segmentation when labeled\ntraining data is scarce, but does not aid state-of-the-art deep learning\nmethods when ample labeled data is available. Moreover, the choice of SSP\nmethod is important. The code is publicly available at:\nhttps://github.com/q-cardIA/ssp-cmr-cine-segmentation\n","authors":["Rob A. J. de Mooij","Josien P. W. Pluim","Cian M. Scannell"],"pdf_url":"https://arxiv.org/pdf/2409.18100v1.pdf","comment":"Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at\n MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.18099v1","updated":"2024-09-26T17:44:20Z","published":"2024-09-26T17:44:20Z","title":"EfficientCrackNet: A Lightweight Model for Crack Segmentation","summary":" Crack detection, particularly from pavement images, presents a formidable\nchallenge in the domain of computer vision due to several inherent complexities\nsuch as intensity inhomogeneity, intricate topologies, low contrast, and noisy\nbackgrounds. Automated crack detection is crucial for maintaining the\nstructural integrity of essential infrastructures, including buildings,\npavements, and bridges. Existing lightweight methods often face challenges\nincluding computational inefficiency, complex crack patterns, and difficult\nbackgrounds, leading to inaccurate detection and impracticality for real-world\napplications. To address these limitations, we propose EfficientCrackNet, a\nlightweight hybrid model combining Convolutional Neural Networks (CNNs) and\ntransformers for precise crack segmentation. EfficientCrackNet integrates\ndepthwise separable convolutions (DSC) layers and MobileViT block to capture\nboth global and local features. The model employs an Edge Extraction Method\n(EEM) and for efficient crack edge detection without pretraining, and\nUltra-Lightweight Subspace Attention Module (ULSAM) to enhance feature\nextraction. Extensive experiments on three benchmark datasets Crack500,\nDeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior\nperformance compared to existing lightweight models, while requiring only 0.26M\nparameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance\nbetween accuracy and computational efficiency, outperforming state-of-the-art\nlightweight models, and providing a robust and adaptable solution for\nreal-world crack segmentation.\n","authors":["Abid Hasan Zim","Aquib Iqbal","Zaid Al-Huda","Asad Malik","Minoru Kuribayash"],"pdf_url":"https://arxiv.org/pdf/2409.18099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18092v1","updated":"2024-09-26T17:39:05Z","published":"2024-09-26T17:39:05Z","title":"DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion\n Probabilistic Models","summary":" Perception systems play a crucial role in autonomous driving, incorporating\nmultiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors\nare widely used to capture sparse point clouds of the vehicle's surroundings.\nHowever, such systems struggle to perceive occluded areas and gaps in the scene\ndue to the sparsity of these point clouds and their lack of semantics. To\naddress these challenges, Semantic Scene Completion (SSC) jointly predicts\nunobserved geometry and semantics in the scene given raw LiDAR measurements,\naiming for a more complete scene representation. Building on promising results\nof diffusion models in image generation and super-resolution tasks, we propose\ntheir extension to SSC by implementing the noising and denoising diffusion\nprocesses in the point and semantic spaces individually. To control the\ngeneration, we employ semantic LiDAR point clouds as conditional input and\ndesign local and global regularization losses to stabilize the denoising\nprocess. We evaluate our approach on autonomous driving datasets and our\napproach outperforms the state-of-the-art for SSC.\n","authors":["Helin Cao","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2409.18092v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.16147v2","updated":"2024-09-26T17:31:35Z","published":"2024-09-23T00:11:30Z","title":"Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with\n Enhanced Generalization and Personalization Abilities","summary":" Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant\npotential for modeling 3D head avatars, providing greater flexibility than\nmesh-based methods and more efficient rendering compared to NeRF-based\napproaches. Despite these advancements, the creation of controllable 3DGS-based\nhead avatars remains time-intensive, often requiring tens of minutes to hours.\nTo expedite this process, we here introduce the ``Gaussian D\\'ej\\`a-vu\"\nframework, which first obtains a generalized model of the head avatar and then\npersonalizes the result. The generalized model is trained on large 2D\n(synthetic and real) image datasets. This model provides a well-initialized 3D\nGaussian head that is further refined using a monocular video to achieve the\npersonalized head avatar. For personalizing, we propose learnable\nexpression-aware rectification blendmaps to correct the initial 3D Gaussians,\nensuring rapid convergence without the reliance on neural networks. Experiments\ndemonstrate that the proposed method meets its objectives. It outperforms\nstate-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as\nwell as reduces training time consumption to at least a quarter of the existing\nmethods, producing the avatar in minutes.\n","authors":["Peizhi Yan","Rabab Ward","Qiang Tang","Shan Du"],"pdf_url":"https://arxiv.org/pdf/2409.16147v2.pdf","comment":"11 pages, Accepted by WACV 2025 in Round 1"},{"id":"http://arxiv.org/abs/2409.18083v1","updated":"2024-09-26T17:26:18Z","published":"2024-09-26T17:26:18Z","title":"Stable Video Portraits","summary":" Rapid advances in the field of generative AI and text-to-image methods in\nparticular have transformed the way we interact with and perceive\ncomputer-generated imagery today. In parallel, much progress has been made in\n3D face reconstruction, using 3D Morphable Models (3DMM). In this paper, we\npresent SVP, a novel hybrid 2D/3D generation method that outputs photorealistic\nvideos of talking faces leveraging a large pre-trained text-to-image prior\n(2D), controlled via a 3DMM (3D). Specifically, we introduce a person-specific\nfine-tuning of a general 2D stable diffusion model which we lift to a video\nmodel by providing temporal 3DMM sequences as conditioning and by introducing a\ntemporal denoising procedure. As an output, this model generates temporally\nsmooth imagery of a person with 3DMM-based controls, i.e., a person-specific\navatar. The facial appearance of this person-specific avatar can be edited and\nmorphed to text-defined celebrities, without any fine-tuning at test time. The\nmethod is analyzed quantitatively and qualitatively, and we show that our\nmethod outperforms state-of-the-art monocular head avatar methods.\n","authors":["Mirela Ostrek","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2409.18083v1.pdf","comment":"Accepted at ECCV 2024, Project: https://svp.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2409.18082v1","updated":"2024-09-26T17:26:16Z","published":"2024-09-26T17:26:16Z","title":"SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language\n Models for Robotic Garment Manipulation","summary":" Automating garment manipulation poses a significant challenge for assistive\nrobotics due to the diverse and deformable nature of garments. Traditional\napproaches typically require separate models for each garment type, which\nlimits scalability and adaptability. In contrast, this paper presents a unified\napproach using vision-language models (VLMs) to improve keypoint prediction\nacross various garment categories. By interpreting both visual and semantic\ninformation, our model enables robots to manage different garment states with a\nsingle model. We created a large-scale synthetic dataset using advanced\nsimulation techniques, allowing scalable training without extensive real-world\ndata. Experimental results indicate that the VLM-based method significantly\nenhances keypoint detection accuracy and task success rates, providing a more\nflexible and general solution for robotic garment manipulation. In addition,\nthis research also underscores the potential of VLMs to unify various garment\nmanipulation tasks within a single framework, paving the way for broader\napplications in home automation and assistive robotics for future.\n","authors":["Xin Li","Siyuan Huang","Qiaojun Yu","Zhengkai Jiang","Ce Hao","Yimeng Zhu","Hongsheng Li","Peng Gao","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18071v1","updated":"2024-09-26T17:18:39Z","published":"2024-09-26T17:18:39Z","title":"FreeEdit: Mask-free Reference-based Image Editing with Multi-modal\n Instruction","summary":" Introducing user-specified visual concepts in image editing is highly\npractical as these concepts convey the user's intent more precisely than\ntext-based descriptions. We propose FreeEdit, a novel approach for achieving\nsuch reference-based image editing, which can accurately reproduce the visual\nconcept from the reference image based on user-friendly language instructions.\nOur approach leverages the multi-modal instruction encoder to encode language\ninstructions to guide the editing process. This implicit way of locating the\nediting area eliminates the need for manual editing masks. To enhance the\nreconstruction of reference details, we introduce the Decoupled Residual\nReferAttention (DRRA) module. This module is designed to integrate fine-grained\nreference features extracted by a detail extractor into the image editing\nprocess in a residual way without interfering with the original self-attention.\nGiven that existing datasets are unsuitable for reference-based image editing\ntasks, particularly due to the difficulty in constructing image triplets that\ninclude a reference image, we curate a high-quality dataset, FreeBench, using a\nnewly developed twice-repainting scheme. FreeBench comprises the images before\nand after editing, detailed editing instructions, as well as a reference image\nthat maintains the identity of the edited object, encompassing tasks such as\nobject addition, replacement, and deletion. By conducting phased training on\nFreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot\nediting through convenient language instructions. We conduct extensive\nexperiments to evaluate the effectiveness of FreeEdit across multiple task\ntypes, demonstrating its superiority over existing methods. The code will be\navailable at: https://freeedit.github.io/.\n","authors":["Runze He","Kai Ma","Linjiang Huang","Shaofei Huang","Jialin Gao","Xiaoming Wei","Jiao Dai","Jizhong Han","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18071v1.pdf","comment":"14 pages, 14 figures, project website: https://freeedit.github.io/"},{"id":"http://arxiv.org/abs/2409.18057v1","updated":"2024-09-26T17:00:02Z","published":"2024-09-26T17:00:02Z","title":"LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field","summary":" Recent works have shown that neural radiance fields (NeRFs) on top of\nparametric models have reached SOTA quality to build photorealistic head\navatars from a monocular video. However, one major limitation of the NeRF-based\navatars is the slow rendering speed due to the dense point sampling of NeRF,\npreventing them from broader utility on resource-constrained devices. We\nintroduce LightAvatar, the first head avatar model based on neural light fields\n(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose\nvia a single network forward pass, without using mesh or volume rendering. The\nproposed approach, while being conceptually appealing, poses a significant\nchallenge towards real-time efficiency and training stability. To resolve them,\nwe introduce dedicated network designs to obtain proper representations for the\nNeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a\ndistillation-based training strategy that uses a pretrained avatar model as\nteacher to synthesize abundant pseudo data for training. A warping field\nnetwork is introduced to correct the fitting error in the real data so that the\nmodel can learn better. Extensive experiments suggest that our method can\nachieve new SOTA image quality quantitatively or qualitatively, while being\nsignificantly faster than the counterparts, reporting 174.1 FPS (512x512\nresolution) on a consumer-grade GPU (RTX3090) with no customized optimization.\n","authors":["Huan Wang","Feitong Tan","Ziqian Bai","Yinda Zhang","Shichen Liu","Qiangeng Xu","Menglei Chai","Anish Prabhu","Rohit Pandey","Sean Fanello","Zeng Huang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2409.18057v1.pdf","comment":"Appear in ECCV'24 CADL Workshop. Code:\n https://github.com/MingSun-Tse/LightAvatar-TensorFlow"},{"id":"http://arxiv.org/abs/2409.18055v1","updated":"2024-09-26T16:59:01Z","published":"2024-09-26T16:59:01Z","title":"Visual Data Diagnosis and Debiasing with Concept Graphs","summary":" The widespread success of deep learning models today is owed to the curation\nof extensive datasets significant in size and complexity. However, such models\nfrequently pick up inherent biases in the data during the training process,\nleading to unreliable predictions. Diagnosing and debiasing datasets is thus a\nnecessity to ensure reliable model performance. In this paper, we present\nCONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence\nBiases in visual datasets. CONBIAS represents visual datasets as knowledge\ngraphs of concepts, enabling meticulous analysis of spurious concept\nco-occurrences to uncover concept imbalances across the whole dataset.\nMoreover, we show that by employing a novel clique-based concept balancing\nstrategy, we can mitigate these imbalances, leading to enhanced performance on\ndownstream tasks. Extensive experiments show that data augmentation based on a\nbalanced concept distribution augmented by CONBIAS improves generalization\nperformance across multiple datasets compared to state-of-the-art methods. We\nwill make our code and data publicly available.\n","authors":["Rwiddhi Chakraborty","Yinong Wang","Jialu Gao","Runkai Zheng","Cheng Zhang","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2409.18055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08168v3","updated":"2024-09-26T16:51:37Z","published":"2023-12-13T14:27:45Z","title":"Chat-Scene: Bridging 3D Scene and Large Language Models with Object\n Identifiers","summary":" Recent advancements in 3D Large Language Models (LLMs) have demonstrated\npromising capabilities for 3D scene understanding. However, previous methods\nexhibit deficiencies in general referencing and grounding capabilities for\nintricate scene comprehension. In this paper, we introduce the use of object\nidentifiers and object-centric representations to interact with scenes at the\nobject level. Specifically, we decompose the input 3D scene into a set of\nobject proposals, each assigned a unique identifier token, which enables\nefficient object referencing and grounding during user-assistant interactions.\nGiven the scarcity of scene-language data, we model the scene embeddings as a\nsequence of explicit object-level embeddings, derived from semantic-rich 2D or\n3D representations. By employing object identifiers, we transform diverse 3D\nscene-language tasks into a unified question-answering format, facilitating\njoint training without the need for additional task-specific heads. With\nminimal fine-tuning on all downstream tasks, our model significantly\noutperforms existing methods on benchmarks including ScanRefer, Multi3DRefer,\nScan2Cap, ScanQA, and SQA3D.\n","authors":["Haifeng Huang","Yilun Chen","Zehan Wang","Rongjie Huang","Runsen Xu","Tai Wang","Luping Liu","Xize Cheng","Yang Zhao","Jiangmiao Pang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.08168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18042v1","updated":"2024-09-26T16:44:02Z","published":"2024-09-26T16:44:02Z","title":"EMOVA: Empowering Language Models to See, Hear and Speak with Vivid\n Emotions","summary":" GPT-4o, an omni-modal model that enables vocal conversations with diverse\nemotions and tones, marks a milestone for omni-modal foundation models.\nHowever, empowering Large Language Models to perceive and generate images,\ntexts, and speeches end-to-end with publicly available data remains challenging\nin the open-source community. Existing vision-language models rely on external\ntools for the speech processing, while speech-language models still suffer from\nlimited or even without vision-understanding abilities. To address this gap, we\npropose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large\nLanguage Models with end-to-end speech capabilities while maintaining the\nleading vision-language performance. With a semantic-acoustic disentangled\nspeech tokenizer, we notice surprisingly that omni-modal alignment can further\nenhance vision-language and speech abilities compared with the corresponding\nbi-modal aligned counterparts. Moreover, a lightweight style module is proposed\nfor flexible speech style controls (e.g., emotions and pitches). For the first\ntime, EMOVA achieves state-of-the-art performance on both the vision-language\nand speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue\nwith vivid emotions.\n","authors":["Kai Chen","Yunhao Gou","Runhui Huang","Zhili Liu","Daxin Tan","Jing Xu","Chunwei Wang","Yi Zhu","Yihan Zeng","Kuo Yang","Dingdong Wang","Kun Xiang","Haoyuan Li","Haoli Bai","Jianhua Han","Xiaohui Li","Weike Jin","Nian Xie","Yu Zhang","James T. Kwok","Hengshuang Zhao","Xiaodan Liang","Dit-Yan Yeung","Xiao Chen","Zhenguo Li","Wei Zhang","Qun Liu","Lanqing Hong","Lu Hou","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18042v1.pdf","comment":"Project Page: https://emova-ollm.github.io/"},{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.18026v1","updated":"2024-09-26T16:33:16Z","published":"2024-09-26T16:33:16Z","title":"ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty\n Learning","summary":" Vision-centric semantic occupancy prediction plays a crucial role in\nautonomous driving, which requires accurate and reliable predictions from\nlow-cost sensors. Although having notably narrowed the accuracy gap with LiDAR,\nthere is still few research effort to explore the reliability in predicting\nsemantic occupancy from camera. In this paper, we conduct a comprehensive\nevaluation of existing semantic occupancy prediction models from a reliability\nperspective for the first time. Despite the gradual alignment of camera-based\nmodels with LiDAR in term of accuracy, a significant reliability gap persists.\nTo addresses this concern, we propose ReliOcc, a method designed to enhance the\nreliability of camera-based occupancy networks. ReliOcc provides a\nplug-and-play scheme for existing models, which integrates hybrid uncertainty\nfrom individual voxels with sampling-based noise and relative voxels through\nmix-up learning. Besides, an uncertainty-aware calibration strategy is devised\nto further enhance model reliability in offline mode. Extensive experiments\nunder various settings demonstrate that ReliOcc significantly enhances model\nreliability while maintaining the accuracy of both geometric and semantic\npredictions. Importantly, our proposed approach exhibits robustness to sensor\nfailures and out of domain noises during inference.\n","authors":["Song Wang","Zhongdao Wang","Jiawei Yu","Wentong Li","Bailan Feng","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.18026v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2409.18017v1","updated":"2024-09-26T16:25:48Z","published":"2024-09-26T16:25:48Z","title":"Transferring disentangled representations: bridging the gap between\n synthetic and real images","summary":" Developing meaningful and efficient representations that separate the\nfundamental structure of the data generation mechanism is crucial in\nrepresentation learning. However, Disentangled Representation Learning has not\nfully shown its potential on real images, because of correlated generative\nfactors, their resolution and limited access to ground truth labels.\nSpecifically on the latter, we investigate the possibility of leveraging\nsynthetic data to learn general-purpose disentangled representations applicable\nto real data, discussing the effect of fine-tuning and what properties of\ndisentanglement are preserved after the transfer. We provide an extensive\nempirical study to address these issues. In addition, we propose a new\ninterpretable intervention-based metric, to measure the quality of factors\nencoding in the representation. Our results indicate that some level of\ndisentanglement, transferring a representation from synthetic to real data, is\npossible and effective.\n","authors":["Jacopo Dapueto","Nicoletta Noceti","Francesca Odone"],"pdf_url":"https://arxiv.org/pdf/2409.18017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14579v2","updated":"2024-09-26T16:25:33Z","published":"2023-12-22T10:15:15Z","title":"Synthesizing Environment-Specific People in Photographs","summary":" We present ESP, a novel method for context-aware full-body generation, that\nenables photo-realistic synthesis and inpainting of people wearing clothing\nthat is semantically appropriate for the scene depicted in an input photograph.\nESP is conditioned on a 2D pose and contextual cues that are extracted from the\nphotograph of the scene and integrated into the generation process, where the\nclothing is modeled explicitly with human parsing masks (HPM). Generated HPMs\nare used as tight guiding masks for inpainting, such that no changes are made\nto the original background. Our models are trained on a dataset containing a\nset of in-the-wild photographs of people covering a wide range of different\nenvironments. The method is analyzed quantitatively and qualitatively, and we\nshow that ESP outperforms the state-of-the-art on the task of contextual\nfull-body generation.\n","authors":["Mirela Ostrek","Carol O'Sullivan","Michael J. Black","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2312.14579v2.pdf","comment":"Accepted at ECCV 2024, Project: https://esp.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2406.08113v3","updated":"2024-09-26T16:14:54Z","published":"2024-06-12T11:50:51Z","title":"Valeo4Cast: A Modular Approach to End-to-End Forecasting","summary":" Motion forecasting is crucial in autonomous driving systems to anticipate the\nfuture trajectories of surrounding agents such as pedestrians, vehicles, and\ntraffic signals. In end-to-end forecasting, the model must jointly detect and\ntrack from sensor data (cameras or LiDARs) the past trajectories of the\ndifferent elements of the scene and predict their future locations. We depart\nfrom the current trend of tackling this task via end-to-end training from\nperception to forecasting, and instead use a modular approach. We individually\nbuild and train detection, tracking and forecasting modules. We then only use\nconsecutive finetuning steps to integrate the modules better and alleviate\ncompounding errors. We conduct an in-depth study on the finetuning strategies\nand it reveals that our simple yet effective approach significantly improves\nperformance on the end-to-end forecasting benchmark. Consequently, our solution\nranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82\nmAPf. We surpass forecasting results by +17.1 points over last year's winner\nand by +13.3 points over this year's runner-up. This remarkable performance in\nforecasting can be explained by our modular paradigm, which integrates\nfinetuning strategies and significantly outperforms the end-to-end-trained\ncounterparts. The code, model weights and results are made available\nhttps://github.com/valeoai/valeo4cast.\n","authors":["Yihong Xu","Éloi Zablocki","Alexandre Boulch","Gilles Puy","Mickael Chen","Florent Bartoccioni","Nermin Samet","Oriane Siméoni","Spyros Gidaris","Tuan-Hung Vu","Andrei Bursuc","Eduardo Valle","Renaud Marlet","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2406.08113v3.pdf","comment":"Winning solution of the Argoverse 2 \"Unified Detection, Tracking, and\n Forecasting\" challenge; work accepted at Road++ ECCVW 2024"},{"id":"http://arxiv.org/abs/2312.05295v2","updated":"2024-09-26T16:11:37Z","published":"2023-12-08T18:43:12Z","title":"Disentangled Clothed Avatar Generation from Text Descriptions","summary":" In this paper, we introduce a novel text-to-avatar generation method that\nseparately generates the human body and the clothes and allows high-quality\nanimation on the generated avatar. While recent advancements in text-to-avatar\ngeneration have yielded diverse human avatars from text prompts, these methods\ntypically combine all elements-clothes, hair, and body-into a single 3D\nrepresentation. Such an entangled approach poses challenges for downstream\ntasks like editing or animation. To overcome these limitations, we propose a\nnovel disentangled 3D avatar representation named Sequentially Offset-SMPL\n(SO-SMPL), building upon the SMPL model. SO-SMPL represents the human body and\nclothes with two separate meshes but associates them with offsets to ensure the\nphysical alignment between the body and the clothes. Then, we design a Score\nDistillation Sampling (SDS)-based distillation framework to generate the\nproposed SO-SMPL representation from text prompts. Our approach not only\nachieves higher texture and geometry quality and better semantic alignment with\ntext prompts, but also significantly improves the visual quality of character\nanimation, virtual try-on, and avatar editing. Project page:\nhttps://shanemankiw.github.io/SO-SMPL/.\n","authors":["Jionghao Wang","Yuan Liu","Zhiyang Dou","Zhengming Yu","Yongqing Liang","Cheng Lin","Xin Li","Wenping Wang","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2312.05295v2.pdf","comment":"Project page: https://shanemankiw.github.io/SO-SMPL/"},{"id":"http://arxiv.org/abs/2409.17996v1","updated":"2024-09-26T16:07:24Z","published":"2024-09-26T16:07:24Z","title":"PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless\n Imaging","summary":" Lensless cameras offer significant advantages in size, weight, and cost\ncompared to traditional lens-based systems. Without a focusing lens, lensless\ncameras rely on computational algorithms to recover the scenes from multiplexed\nmeasurements. However, current algorithms struggle with inaccurate forward\nimaging models and insufficient priors to reconstruct high-quality images. To\novercome these limitations, we introduce a novel two-stage approach for\nconsistent and photorealistic lensless image reconstruction. The first stage of\nour approach ensures data consistency by focusing on accurately reconstructing\nthe low-frequency content with a spatially varying deconvolution method that\nadjusts to changes in the Point Spread Function (PSF) across the camera's field\nof view. The second stage enhances photorealism by incorporating a generative\nprior from pre-trained diffusion models. By conditioning on the low-frequency\ncontent retrieved in the first stage, the diffusion model effectively\nreconstructs the high-frequency details that are typically lost in the lensless\nimaging process, while also maintaining image fidelity. Our method achieves a\nsuperior balance between data fidelity and visual quality compared to existing\nmethods, as demonstrated with two popular lensless systems, PhlatCam and\nDiffuserCam. Project website: https://phocolens.github.io/.\n","authors":["Xin Cai","Zhiyuan You","Hailong Zhang","Wentao Liu","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.17996v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17993v1","updated":"2024-09-26T16:04:31Z","published":"2024-09-26T16:04:31Z","title":"InterNet: Unsupervised Cross-modal Homography Estimation Based on\n Interleaved Modality Transfer and Self-supervised Homography Prediction","summary":" We propose a novel unsupervised cross-modal homography estimation framework,\nbased on interleaved modality transfer and self-supervised homography\nprediction, named InterNet. InterNet integrates modality transfer and\nself-supervised homography estimation, introducing an innovative interleaved\noptimization framework to alternately promote both components. The modality\ntransfer gradually narrows the modality gaps, facilitating the self-supervised\nhomography estimation to fully leverage the synthetic intra-modal data. The\nself-supervised homography estimation progressively achieves reliable\npredictions, thereby providing robust cross-modal supervision for the modality\ntransfer. To further boost the estimation accuracy, we also formulate a\nfine-grained homography feature loss to improve the connection between two\ncomponents. Furthermore, we employ a simple yet effective distillation training\ntechnique to reduce model parameters and improve cross-domain generalization\nability while maintaining comparable performance. Experiments reveal that\nInterNet achieves the state-of-the-art (SOTA) performance among unsupervised\nmethods, and even outperforms many supervised methods such as MHN and\nLocalTrans.\n","authors":["Junchen Yu","Si-Yuan Cao","Runmin Zhang","Chenghao Zhang","Jianxin Hu","Zhu Yu","Hui-liang Shen"],"pdf_url":"https://arxiv.org/pdf/2409.17993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17988v1","updated":"2024-09-26T15:57:20Z","published":"2024-09-26T15:57:20Z","title":"Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or\n Low-light Conditions","summary":" The stark contrast in the design philosophy of an event camera makes it\nparticularly ideal for operating under high-speed, high dynamic range and\nlow-light conditions, where standard cameras underperform. Nonetheless, event\ncameras still suffer from some amount of motion blur, especially under these\nchallenging conditions, in contrary to what most think. This is attributed to\nthe limited bandwidth of the event sensor pixel, which is mostly proportional\nto the light intensity. Thus, to ensure that event cameras can truly excel in\nsuch conditions where it has an edge over standard cameras, it is crucial to\naccount for event motion blur in downstream applications, especially\nreconstruction. However, none of the recent works on reconstructing Neural\nRadiance Fields (NeRFs) from events, nor event simulators, have considered the\nfull effects of event motion blur. To this end, we propose, Deblur e-NeRF, a\nnovel method to directly and effectively reconstruct blur-minimal NeRFs from\nmotion-blurred events generated under high-speed motion or low-light\nconditions. The core component of this work is a physically-accurate pixel\nbandwidth model proposed to account for event motion blur under arbitrary speed\nand lighting conditions. We also introduce a novel threshold-normalized total\nvariation loss to improve the regularization of large textureless patches.\nExperiments on real and novel realistically simulated sequences verify our\neffectiveness. Our code, event simulator and synthetic event dataset will be\nopen-sourced.\n","authors":["Weng Fei Low","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17988v1.pdf","comment":"Accepted to ECCV 2024. Project website is accessible at\n https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with\n arXiv:2006.07722 by other authors"},{"id":"http://arxiv.org/abs/2409.17987v1","updated":"2024-09-26T15:57:08Z","published":"2024-09-26T15:57:08Z","title":"LLM4Brain: Training a Large Language Model for Brain Video Understanding","summary":" Decoding visual-semantic information from brain signals, such as functional\nMRI (fMRI), across different subjects poses significant challenges, including\nlow signal-to-noise ratio, limited data availability, and cross-subject\nvariability. Recent advancements in large language models (LLMs) show\nremarkable effectiveness in processing multimodal information. In this study,\nwe introduce an LLM-based approach for reconstructing visual-semantic\ninformation from fMRI signals elicited by video stimuli. Specifically, we\nemploy fine-tuning techniques on an fMRI encoder equipped with adaptors to\ntransform brain responses into latent representations aligned with the video\nstimuli. Subsequently, these representations are mapped to textual modality by\nLLM. In particular, we integrate self-supervised domain adaptation methods to\nenhance the alignment between visual-semantic information and brain responses.\nOur proposed method achieves good results using various quantitative semantic\nmetrics, while yielding similarity with ground-truth information.\n","authors":["Ruizhe Zheng","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2409.17987v1.pdf","comment":"ECCV2024 Workshop"},{"id":"http://arxiv.org/abs/2409.17981v1","updated":"2024-09-26T15:54:18Z","published":"2024-09-26T15:54:18Z","title":"BlinkTrack: Feature Tracking over 100 FPS via Events and Images","summary":" Feature tracking is crucial for, structure from motion (SFM), simultaneous\nlocalization and mapping (SLAM), object tracking and various computer vision\ntasks. Event cameras, known for their high temporal resolution and ability to\ncapture asynchronous changes, have gained significant attention for their\npotential in feature tracking, especially in challenging conditions. However,\nevent cameras lack the fine-grained texture information that conventional\ncameras provide, leading to error accumulation in tracking. To address this, we\npropose a novel framework, BlinkTrack, which integrates event data with RGB\nimages for high-frequency feature tracking. Our method extends the traditional\nKalman filter into a learning-based framework, utilizing differentiable Kalman\nfilters in both event and image branches. This approach improves\nsingle-modality tracking, resolves ambiguities, and supports asynchronous data\nfusion. We also introduce new synthetic and augmented datasets to better\nevaluate our model. Experimental results indicate that BlinkTrack significantly\noutperforms existing event-based methods, exceeding 100 FPS with preprocessed\nevent data and 80 FPS with multi-modality data.\n","authors":["Yichen Shen","Yijin Li","Shuo Chen","Guanglin Li","Zhaoyang Huang","Hujun Bao","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17977v1","updated":"2024-09-26T15:52:34Z","published":"2024-09-26T15:52:34Z","title":"Cross-Modality Attack Boosted by Gradient-Evolutionary Multiform\n Optimization","summary":" In recent years, despite significant advancements in adversarial attack\nresearch, the security challenges in cross-modal scenarios, such as the\ntransferability of adversarial attacks between infrared, thermal, and RGB\nimages, have been overlooked. These heterogeneous image modalities collected by\ndifferent hardware devices are widely prevalent in practical applications, and\nthe substantial differences between modalities pose significant challenges to\nattack transferability. In this work, we explore a novel cross-modal\nadversarial attack strategy, termed multiform attack. We propose a dual-layer\noptimization framework based on gradient-evolution, facilitating efficient\nperturbation transfer between modalities. In the first layer of optimization,\nthe framework utilizes image gradients to learn universal perturbations within\neach modality and employs evolutionary algorithms to search for shared\nperturbations with transferability across different modalities through\nsecondary optimization. Through extensive testing on multiple heterogeneous\ndatasets, we demonstrate the superiority and robustness of Multiform Attack\ncompared to existing techniques. This work not only enhances the\ntransferability of cross-modal adversarial attacks but also provides a new\nperspective for understanding security vulnerabilities in cross-modal systems.\n","authors":["Yunpeng Gong","Qingyuan Zeng","Dejun Xu","Zhenzhong Wang","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.17977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17963v1","updated":"2024-09-26T15:41:18Z","published":"2024-09-26T15:41:18Z","title":"CNCA: Toward Customizable and Natural Generation of Adversarial\n Camouflage for Vehicle Detectors","summary":" Prior works on physical adversarial camouflage against vehicle detectors\nmainly focus on the effectiveness and robustness of the attack. The current\nmost successful methods optimize 3D vehicle texture at a pixel level. However,\nthis results in conspicuous and attention-grabbing patterns in the generated\ncamouflage, which humans can easily identify. To address this issue, we propose\na Customizable and Natural Camouflage Attack (CNCA) method by leveraging an\noff-the-shelf pre-trained diffusion model. By sampling the optimal texture\nimage from the diffusion model with a user-specific text prompt, our method can\ngenerate natural and customizable adversarial camouflage while maintaining high\nattack performance. With extensive experiments on the digital and physical\nworlds and user studies, the results demonstrate that our proposed method can\ngenerate significantly more natural-looking camouflage than the\nstate-of-the-art baselines while achieving competitive attack performance. Our\ncode is available at\n\\href{https://anonymous.4open.science/r/CNCA-1D54}{https://anonymous.4open.science/r/CNCA-1D54}\n","authors":["Linye Lyu","Jiawei Zhou","Daojing He","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2409.17963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10814v3","updated":"2024-09-26T15:37:58Z","published":"2023-08-21T16:03:35Z","title":"Jumping through Local Minima: Quantization in the Loss Landscape of\n Vision Transformers","summary":" Quantization scale and bit-width are the most important parameters when\nconsidering how to quantize a neural network. Prior work focuses on optimizing\nquantization scales in a global manner through gradient methods (gradient\ndescent \\& Hessian analysis). Yet, when applying perturbations to quantization\nscales, we observe a very jagged, highly non-smooth test loss landscape. In\nfact, small perturbations in quantization scale can greatly affect accuracy,\nyielding a $0.5-0.8\\%$ accuracy boost in 4-bit quantized vision transformers\n(ViTs). In this regime, gradient methods break down, since they cannot reliably\nreach local minima. In our work, dubbed Evol-Q, we use evolutionary search to\neffectively traverse the non-smooth landscape. Additionally, we propose using\nan infoNCE loss, which not only helps combat overfitting on the small\ncalibration dataset ($1,000$ images) but also makes traversing such a highly\nnon-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully\nquantized ViT-Base by $10.30\\%$, $0.78\\%$, and $0.15\\%$ for $3$-bit, $4$-bit,\nand $8$-bit weight quantization levels. Extensive experiments on a variety of\nCNN and ViT architectures further demonstrate its robustness in extreme\nquantization scenarios. Our code is available at\nhttps://github.com/enyac-group/evol-q\n","authors":["Natalia Frumkin","Dibakar Gope","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2308.10814v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2211.09643"},{"id":"http://arxiv.org/abs/2409.17958v1","updated":"2024-09-26T15:36:10Z","published":"2024-09-26T15:36:10Z","title":"The Hard Positive Truth about Vision-Language Compositionality","summary":" Several benchmarks have concluded that our best vision-language models (e.g.,\nCLIP) are lacking in compositionality. Given an image, these benchmarks probe a\nmodel's ability to identify its associated caption amongst a set of\ncompositional distractors. In response, a surge of recent proposals show\nimprovements by finetuning CLIP with distractors as hard negatives. Our\ninvestigations reveal that these improvements have, in fact, been significantly\noverstated -- because existing benchmarks do not probe whether finetuned\nvision-language models remain invariant to hard positives. By curating an\nevaluation dataset with 112,382 hard negatives and hard positives, we uncover\nthat including hard positives decreases CLIP's performance by 12.9%, while\nhumans perform effortlessly at 99%. CLIP finetuned with hard negatives results\nin an even larger decrease, up to 38.7%. With this finding, we then produce a\n1,775,259 image-text training set with both hard negative and hard positive\ncaptions. By training with both, we see improvements on existing benchmarks\nwhile simultaneously improving performance on hard positives, indicating a more\nrobust improvement in compositionality. Our work suggests the need for future\nresearch to rigorously test and improve CLIP's understanding of semantic\nrelationships between related \"positive\" concepts.\n","authors":["Amita Kamath","Cheng-Yu Hsieh","Kai-Wei Chang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2409.17958v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2312.14115v4","updated":"2024-09-26T15:30:00Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Visual Question Answering for Autonomous Driving","summary":" We introduce LingoQA, a novel dataset and benchmark for visual question\nanswering in autonomous driving. The dataset contains 28K unique short video\nscenarios, and 419K annotations. Evaluating state-of-the-art vision-language\nmodels on our benchmark shows that their performance is below human\ncapabilities, with GPT-4V responding truthfully to 59.6% of the questions\ncompared to 96.6% for humans. For evaluation, we propose a truthfulness\nclassifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation\ncoefficient to human evaluations, surpassing existing techniques like METEOR,\nBLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run\nextensive ablation studies to understand its performance. We release our\ndataset and benchmark as an evaluation platform for vision-language models in\nautonomous driving.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v4.pdf","comment":"Accepted to ECCV 2024. Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2409.17951v1","updated":"2024-09-26T15:28:25Z","published":"2024-09-26T15:28:25Z","title":"Spatial Hierarchy and Temporal Attention Guided Cross Masking for\n Self-supervised Skeleton-based Action Recognition","summary":" In self-supervised skeleton-based action recognition, the mask reconstruction\nparadigm is gaining interest in enhancing model refinement and robustness\nthrough effective masking. However, previous works primarily relied on a single\nmasking criterion, resulting in the model overfitting specific features and\noverlooking other effective information. In this paper, we introduce a\nhierarchy and attention guided cross-masking framework (HA-CM) that applies\nmasking to skeleton sequences from both spatial and temporal perspectives.\nSpecifically, in spatial graphs, we utilize hyperbolic space to maintain joint\ndistinctions and effectively preserve the hierarchical structure of\nhigh-dimensional skeletons, employing joint hierarchy as the masking criterion.\nIn temporal flows, we substitute traditional distance metrics with the global\nattention of joints for masking, addressing the convergence of distances in\nhigh-dimensional space and the lack of a global perspective. Additionally, we\nincorporate cross-contrast loss based on the cross-masking framework into the\nloss function to enhance the model's learning of instance-level features. HA-CM\nshows efficiency and universality on three public large-scale datasets, NTU-60,\nNTU-120, and PKU-MMD. The source code of our HA-CM is available at\nhttps://github.com/YinxPeng/HA-CM-main.\n","authors":["Xinpeng Yin","Wenming Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17951v1.pdf","comment":"12 pages,6 figures,IEEE Trans"},{"id":"http://arxiv.org/abs/2409.17941v1","updated":"2024-09-26T15:16:32Z","published":"2024-09-26T15:16:32Z","title":"Perturb, Attend, Detect and Localize (PADL): Robust Proactive Image\n Defense","summary":" Image manipulation detection and localization have received considerable\nattention from the research community given the blooming of Generative Models\n(GMs). Detection methods that follow a passive approach may overfit to specific\nGMs, limiting their application in real-world scenarios, due to the growing\ndiversity of generative models. Recently, approaches based on a proactive\nframework have shown the possibility of dealing with this limitation. However,\nthese methods suffer from two main limitations, which raises concerns about\npotential vulnerabilities: i) the manipulation detector is not robust to noise\nand hence can be easily fooled; ii) the fact that they rely on fixed\nperturbations for image protection offers a predictable exploit for malicious\nattackers, enabling them to reverse-engineer and evade detection. To overcome\nthis issue we propose PADL, a new solution able to generate image-specific\nperturbations using a symmetric scheme of encoding and decoding based on\ncross-attention, which drastically reduces the possibility of reverse\nengineering, even when evaluated with adaptive attack [31]. Additionally, PADL\nis able to pinpoint manipulated areas, facilitating the identification of\nspecific regions that have undergone alterations, and has more generalization\npower than prior art on held-out generative models. Indeed, although being\ntrained only on an attribute manipulation GAN model [15], our method\ngeneralizes to a range of unseen models with diverse architectural designs,\nsuch as StarGANv2, BlendGAN, DiffAE, StableDiffusion and StableDiffusionXL.\nAdditionally, we introduce a novel evaluation protocol, which offers a fair\nevaluation of localisation performance in function of detection accuracy and\nbetter captures real-world scenarios.\n","authors":["Filippo Bartolucci","Iacopo Masi","Giuseppe Lisanti"],"pdf_url":"https://arxiv.org/pdf/2409.17941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15268v4","updated":"2024-09-26T15:10:58Z","published":"2023-12-23T14:36:27Z","title":"Manydepth2: Motion-Aware Self-Supervised Monocular Depth Estimation in\n Dynamic Scenes","summary":" Despite advancements in self-supervised monocular depth estimation,\nchallenges persist in dynamic scenarios due to the dependence on assumptions\nabout a static world. In this paper, we present Manydepth2, a Motion-Guided\nCost Volume Depth Net, to achieve precise depth estimation for both dynamic\nobjects and static backgrounds, all while maintaining computational efficiency.\nTo tackle the challenges posed by dynamic content, we incorporate optical flow\nand coarse monocular depth to create a novel static reference frame. This frame\nis then utilized to build a motion-guided cost volume in collaboration with the\ntarget frame. Additionally, to enhance the accuracy and resilience of the\nnetwork structure, we introduce an attention-based depth net architecture to\neffectively integrate information from feature maps with varying resolutions.\nCompared to methods with similar computational costs, Manydepth2 achieves a\nsignificant reduction of approximately five percent in root-mean-square error\nfor self-supervised monocular depth estimation on the KITTI-2015 dataset. The\ncode could be found: https://github.com/kaichen-z/Manydepth2\n","authors":["Kaichen Zhou","Jia-Wang Bian","Qian Xie","Jian-Qing Zheng","Niki Trigoni","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2312.15268v4.pdf","comment":"Monocular Depth Estimation, Self-Supervised, Optical Flow"},{"id":"http://arxiv.org/abs/2409.07714v2","updated":"2024-09-26T15:05:43Z","published":"2024-09-12T02:50:04Z","title":"CollaMamba: Efficient Collaborative Perception with Cross-Agent\n Spatial-Temporal State Space Model","summary":" By sharing complementary perceptual information, multi-agent collaborative\nperception fosters a deeper understanding of the environment. Recent studies on\ncollaborative perception mostly utilize CNNs or Transformers to learn feature\nrepresentation and fusion in the spatial dimension, which struggle to handle\nlong-range spatial-temporal features under limited computing and communication\nresources. Holistically modeling the dependencies over extensive spatial areas\nand extended temporal frames is crucial to enhancing feature quality. To this\nend, we propose a resource efficient cross-agent spatial-temporal collaborative\nstate space model (SSM), named CollaMamba. Initially, we construct a\nfoundational backbone network based on spatial SSM. This backbone adeptly\ncaptures positional causal dependencies from both single-agent and cross-agent\nviews, yielding compact and comprehensive intermediate features while\nmaintaining linear complexity. Furthermore, we devise a history-aware feature\nboosting module based on temporal SSM, extracting contextual cues from extended\nhistorical frames to refine vague features while preserving low overhead.\nExtensive experiments across several datasets demonstrate that CollaMamba\noutperforms state-of-the-art methods, achieving higher model accuracy while\nreducing computational and communication overhead by up to 71.9% and 1/64,\nrespectively. This work pioneers the exploration of the Mamba's potential in\ncollaborative perception. The source code will be made available.\n","authors":["Yang Li","Quan Yuan","Guiyang Luo","Xiaoyuan Fu","Xuanhan Zhu","Yujia Yang","Rui Pan","Jinglin Li"],"pdf_url":"https://arxiv.org/pdf/2409.07714v2.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17924v1","updated":"2024-09-26T15:05:29Z","published":"2024-09-26T15:05:29Z","title":"Neural Light Spheres for Implicit Image Stitching and View Synthesis","summary":" Challenging to capture, and challenging to display on a cellphone screen, the\npanorama paradoxically remains both a staple and underused feature of modern\nmobile camera applications. In this work we address both of these challenges\nwith a spherical neural light field model for implicit panoramic image\nstitching and re-rendering; able to accommodate for depth parallax,\nview-dependent lighting, and local scene motion and color changes during\ncapture. Fit during test-time to an arbitrary path panoramic video capture --\nvertical, horizontal, random-walk -- these neural light spheres jointly\nestimate the camera path and a high-resolution scene reconstruction to produce\nnovel wide field-of-view projections of the environment. Our single-layer model\navoids expensive volumetric sampling, and decomposes the scene into compact\nview-dependent ray offset and color components, with a total model size of 80\nMB per scene, and real-time (50 FPS) rendering at 1080p resolution. We\ndemonstrate improved reconstruction quality over traditional image stitching\nand radiance field methods, with significantly higher tolerance to scene motion\nand non-ideal capture settings.\n","authors":["Ilya Chugunov","Amogh Joshi","Kiran Murthy","Francois Bleibel","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2409.17924v1.pdf","comment":"Project site: https://light.princeton.edu/publication/neuls/"},{"id":"http://arxiv.org/abs/2409.17920v1","updated":"2024-09-26T15:04:13Z","published":"2024-09-26T15:04:13Z","title":"Resolving Multi-Condition Confusion for Finetuning-Free Personalized\n Image Generation","summary":" Personalized text-to-image generation methods can generate customized images\nbased on the reference images, which have garnered wide research interest.\nRecent methods propose a finetuning-free approach with a decoupled\ncross-attention mechanism to generate personalized images requiring no\ntest-time finetuning. However, when multiple reference images are provided, the\ncurrent decoupled cross-attention mechanism encounters the object confusion\nproblem and fails to map each reference image to its corresponding object,\nthereby seriously limiting its scope of application. To address the object\nconfusion problem, in this work we investigate the relevance of different\npositions of the latent image features to the target object in diffusion model,\nand accordingly propose a weighted-merge method to merge multiple reference\nimage features into the corresponding objects. Next, we integrate this\nweighted-merge method into existing pre-trained models and continue to train\nthe model on a multi-object dataset constructed from the open-sourced SA-1B\ndataset. To mitigate object confusion and reduce training costs, we propose an\nobject quality score to estimate the image quality for the selection of\nhigh-quality training samples. Furthermore, our weighted-merge training\nframework can be employed on single-object generation when a single object has\nmultiple reference images. The experiments verify that our method achieves\nsuperior performance to the state-of-the-arts on the Concept101 dataset and\nDreamBooth dataset of multi-object personalized image generation, and\nremarkably improves the performance on single-object personalized image\ngeneration. Our code is available at https://github.com/hqhQAQ/MIP-Adapter.\n","authors":["Qihan Huang","Siming Fu","Jinlong Liu","Hao Jiang","Yipeng Yu","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2409.17920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17917v1","updated":"2024-09-26T15:02:50Z","published":"2024-09-26T15:02:50Z","title":"WaSt-3D: Wasserstein-2 Distance for Scene-to-Scene Stylization on 3D\n Gaussians","summary":" While style transfer techniques have been well-developed for 2D image\nstylization, the extension of these methods to 3D scenes remains relatively\nunexplored. Existing approaches demonstrate proficiency in transferring colors\nand textures but often struggle with replicating the geometry of the scenes. In\nour work, we leverage an explicit Gaussian Splatting (GS) representation and\ndirectly match the distributions of Gaussians between style and content scenes\nusing the Earth Mover's Distance (EMD). By employing the entropy-regularized\nWasserstein-2 distance, we ensure that the transformation maintains spatial\nsmoothness. Additionally, we decompose the scene stylization problem into\nsmaller chunks to enhance efficiency. This paradigm shift reframes stylization\nfrom a pure generative process driven by latent space losses to an explicit\nmatching of distributions between two Gaussian representations. Our method\nachieves high-resolution 3D stylization by faithfully transferring details from\n3D style scenes onto the content scene. Furthermore, WaSt-3D consistently\ndelivers results across diverse content and style scenes without necessitating\nany training, as it relies solely on optimization-based techniques. See our\nproject page for additional results and source code:\n$\\href{https://compvis.github.io/wast3d/}{https://compvis.github.io/wast3d/}$.\n","authors":["Dmytro Kotovenko","Olga Grebenkova","Nikolaos Sarafianos","Avinash Paliwal","Pingchuan Ma","Omid Poursaeed","Sreyas Mohan","Yuchen Fan","Yilei Li","Rakesh Ranjan","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2409.17917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01895v2","updated":"2024-09-26T14:57:13Z","published":"2024-08-04T01:34:22Z","title":"Computational Trichromacy Reconstruction: Empowering the Color-Vision\n Deficient to Recognize Colors Using Augmented Reality","summary":" We propose an assistive technology that helps individuals with Color Vision\nDeficiencies (CVD) to recognize/name colors. A dichromat's color perception is\na reduced two-dimensional (2D) subset of a normal trichromat's three\ndimensional color (3D) perception, leading to confusion when visual stimuli\nthat appear identical to the dichromat are referred to by different color\nnames. Using our proposed system, CVD individuals can interactively induce\ndistinct perceptual changes to originally confusing colors via a computational\ncolor space transformation. By combining their original 2D precepts for colors\nwith the discriminative changes, a three dimensional color space is\nreconstructed, where the dichromat can learn to resolve color name confusions\nand accurately recognize colors. Our system is implemented as an Augmented\nReality (AR) interface on smartphones, where users interactively control the\nrotation through swipe gestures and observe the induced color shifts in the\ncamera view or in a displayed image. Through psychophysical experiments and a\nlongitudinal user study, we demonstrate that such rotational color shifts have\ndiscriminative power (initially confusing colors become distinct under\nrotation) and exhibit structured perceptual shifts dichromats can learn with\nmodest training. The AR App is also evaluated in two real-world scenarios\n(building with lego blocks and interpreting artistic works); users all report\npositive experience in using the App to recognize object colors that they\notherwise could not.\n","authors":["Yuhao Zhu","Ethan Chen","Colin Hascup","Yukang Yan","Gaurav Sharma"],"pdf_url":"https://arxiv.org/pdf/2408.01895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17908v1","updated":"2024-09-26T14:52:55Z","published":"2024-09-26T14:52:55Z","title":"LKA-ReID:Vehicle Re-Identification with Large Kernel Attention","summary":" With the rapid development of intelligent transportation systems and the\npopularity of smart city infrastructure, Vehicle Re-ID technology has become an\nimportant research field. The vehicle Re-ID task faces an important challenge,\nwhich is the high similarity between different vehicles. Existing methods use\nadditional detection or segmentation models to extract differentiated local\nfeatures. However, these methods either rely on additional annotations or\ngreatly increase the computational cost. Using attention mechanism to capture\nglobal and local features is crucial to solve the challenge of high similarity\nbetween classes in vehicle Re-ID tasks. In this paper, we propose LKA-ReID with\nlarge kernel attention. Specifically, the large kernel attention (LKA) utilizes\nthe advantages of self-attention and also benefits from the advantages of\nconvolution, which can extract the global and local features of the vehicle\nmore comprehensively. We also introduce hybrid channel attention (HCA) combines\nchannel attention with spatial information, so that the model can better focus\non channels and feature regions, and ignore background and other disturbing\ninformation. Experiments on VeRi-776 dataset demonstrated the effectiveness of\nLKA-ReID, with mAP reaches 86.65% and Rank-1 reaches 98.03%.\n","authors":["Xuezhi Xiang","Zhushan Ma","Lei Zhang","Denis Ombati","Himaloy Himu","Xiantong Zhen"],"pdf_url":"https://arxiv.org/pdf/2409.17908v1.pdf","comment":"The paper is under consideration at 2025 IEEE International\n Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2409.17895v1","updated":"2024-09-26T14:44:41Z","published":"2024-09-26T14:44:41Z","title":"Self-supervised Monocular Depth Estimation with Large Kernel Attention","summary":" Self-supervised monocular depth estimation has emerged as a promising\napproach since it does not rely on labeled training data. Most methods combine\nconvolution and Transformer to model long-distance dependencies to estimate\ndepth accurately. However, Transformer treats 2D image features as 1D\nsequences, and positional encoding somewhat mitigates the loss of spatial\ninformation between different feature blocks, tending to overlook channel\nfeatures, which limit the performance of depth estimation. In this paper, we\npropose a self-supervised monocular depth estimation network to get finer\ndetails. Specifically, we propose a decoder based on large kernel attention,\nwhich can model long-distance dependencies without compromising the\ntwo-dimension structure of features while maintaining feature channel\nadaptivity. In addition, we introduce a up-sampling module to accurately\nrecover the fine details in the depth map. Our method achieves competitive\nresults on the KITTI dataset.\n","authors":["Xuezhi Xiang","Yao Wang","Lei Zhang","Denis Ombati","Himaloy Himu","Xiantong Zhen"],"pdf_url":"https://arxiv.org/pdf/2409.17895v1.pdf","comment":"The paper is under consideration at 2025 IEEE International\n Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2409.17886v1","updated":"2024-09-26T14:35:06Z","published":"2024-09-26T14:35:06Z","title":"Upper-Body Pose-based Gaze Estimation for Privacy-Preserving 3D Gaze\n Target Detection","summary":" Gaze Target Detection (GTD), i.e., determining where a person is looking\nwithin a scene from an external viewpoint, is a challenging task, particularly\nin 3D space. Existing approaches heavily rely on analyzing the person's\nappearance, primarily focusing on their face to predict the gaze target. This\npaper presents a novel approach to tackle this problem by utilizing the\nperson's upper-body pose and available depth maps to extract a 3D gaze\ndirection and employing a multi-stage or an end-to-end pipeline to predict the\ngazed target. When predicted accurately, the human body pose can provide\nvaluable information about the head pose, which is a good approximation of the\ngaze direction, as well as the position of the arms and hands, which are linked\nto the activity the person is performing and the objects they are likely\nfocusing on. Consequently, in addition to performing gaze estimation in 3D, we\nare also able to perform GTD simultaneously. We demonstrate state-of-the-art\nresults on the most comprehensive publicly accessible 3D gaze target detection\ndataset without requiring images of the person's face, thus promoting privacy\npreservation in various application contexts. The code is available at\nhttps://github.com/intelligolabs/privacy-gtd-3D.\n","authors":["Andrea Toaiari","Vittorio Murino","Marco Cristani","Cigdem Beyan"],"pdf_url":"https://arxiv.org/pdf/2409.17886v1.pdf","comment":"Accepted in the T-CAP workshop at ECCV 2024"},{"id":"http://arxiv.org/abs/2312.04564v3","updated":"2024-09-26T14:33:24Z","published":"2023-12-07T18:59:55Z","title":"EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS","summary":" Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view\nscene synthesis. It addresses the challenges of lengthy training times and slow\nrendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid,\ndifferentiable rasterization of 3D Gaussians, 3D-GS achieves real-time\nrendering and accelerated training. They, however, demand substantial memory\nresources for both training and storage, as they require millions of Gaussians\nin their point cloud representation for each scene. We present a technique\nutilizing quantized embeddings to significantly reduce per-point memory storage\nrequirements and a coarse-to-fine training strategy for a faster and more\nstable optimization of the Gaussian point clouds. Our approach develops a\npruning stage which results in scene representations with fewer Gaussians,\nleading to faster training times and rendering speeds for real-time rendering\nof high resolution scenes. We reduce storage memory by more than an order of\nmagnitude all while preserving the reconstruction quality. We validate the\neffectiveness of our approach on a variety of datasets and scenes preserving\nthe visual quality while consuming 10-20x lesser memory and faster\ntraining/inference speed. Project page and code is available\nhttps://efficientgaussian.github.io\n","authors":["Sharath Girish","Kamal Gupta","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.04564v3.pdf","comment":"Website: https://efficientgaussian.github.io Code:\n https://github.com/Sharath-girish/efficientgaussian"},{"id":"http://arxiv.org/abs/2409.17880v1","updated":"2024-09-26T14:27:55Z","published":"2024-09-26T14:27:55Z","title":"Self-Distilled Depth Refinement with Noisy Poisson Fusion","summary":" Depth refinement aims to infer high-resolution depth with fine-grained edges\nand details, refining low-resolution results of depth estimation models. The\nprevailing methods adopt tile-based manners by merging numerous patches, which\nlacks efficiency and produces inconsistency. Besides, prior arts suffer from\nfuzzy depth boundaries and limited generalizability. Analyzing the fundamental\nreasons for these limitations, we model depth refinement as a noisy Poisson\nfusion problem with local inconsistency and edge deformation noises. We propose\nthe Self-distilled Depth Refinement (SDDR) framework to enforce robustness\nagainst the noises, which mainly consists of depth edge representation and\nedge-based guidance. With noisy depth predictions as input, SDDR generates\nlow-noise depth edge representations as pseudo-labels by coarse-to-fine\nself-distillation. Edge-based guidance with edge-guided gradient loss and\nedge-based fusion loss serves as the optimization objective equivalent to\nPoisson fusion. When depth maps are better refined, the labels also become more\nnoise-free. Our model can acquire strong robustness to the noises, achieving\nsignificant improvements in accuracy, edge quality, efficiency, and\ngeneralizability on five different benchmarks. Moreover, directly training\nanother model with edge labels produced by SDDR brings improvements, suggesting\nthat our method could help with training robust refinement models in future\nworks.\n","authors":["Jiaqi Li","Yiran Wang","Jinghong Zheng","Zihao Huang","Ke Xian","Zhiguo Cao","Jianming Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17880v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.09946v2","updated":"2024-09-26T14:27:23Z","published":"2024-07-13T17:03:16Z","title":"Low-Rank Interconnected Adaptation across Layers","summary":" Low-rank adaptation (LoRA) is a powerful parameter-efficient fine-tuning\nmethod that utilizes low-rank projectors $A$ and $B$ to learn weight updates\n$\\Delta W$ for adaptation targets $W$. Previous research has shown that LoRA is\nessentially a gradient compressor, performing random projections on the\ngradient using a fixed projection matrix $A_0$. However, this setup restricts\nthe overall weight update to be low-rank, which limits the adaptation\nperformance. In this paper, we propose low-rank interconnected adaptation\nacross layers (Lily). Specifically, we employ a hierarchical framework where\nlow-dimensional projectors (LPs) retained for downward projection at a\nparticular level, while globally-shared high-dimensional projector (HP) experts\nperform upward projection across all levels of layers. Lily uniquely connects\neach LP to all HP experts, therefore the gradient projections are no longer\ndominated by fixed projection matrices, but rather by selective combinations of\nall the projectors, thereby breaking the low-rank constraint of LoRA.\nFurthermore, Lily's cross-layer connections facilitate the capture of intricate\ninformation and dependencies across different layers, thereby enhancing the\nmodel's representational capabilities. Experiments across various modalities,\narchitectures, and model sizes underscore Lily's great performance and\nefficiency. Code is available on github https://github.com/yibozhong/lily.\n","authors":["Yibo Zhong","Yao Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.09946v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2409.17854v1","updated":"2024-09-26T14:00:00Z","published":"2024-09-26T14:00:00Z","title":"Visualization of Age Distributions as Elements of Medical Data-Stories","summary":" In various fields, including medicine, age distributions are crucial. Despite\nwidespread media coverage of health topics, there remains a need to enhance\nhealth communication. Narrative medical visualization is promising for\nimproving information comprehension and retention. This study explores the most\neffective ways to present age distributions of diseases through narrative\nvisualizations. We conducted a thorough analysis of existing visualizations,\nheld workshops with a broad audience, and reviewed relevant literature. From\nthis, we identified design choices focusing on comprehension, aesthetics,\nengagement, and memorability. We specifically tested three pictogram variants:\npictograms as bars, stacked pictograms, and annotations. After evaluating 18\nvisualizations with 72 participants and three expert reviews, we determined\nthat annotations were most effective for comprehension and aesthetics. However,\ntraditional bar charts were preferred for engagement, and other variants were\nmore memorable. The study provides a set of design recommendations based on\nthese insights.\n","authors":["Sophia Dowlatabadi","Bernhard Preim","Monique Meuschke"],"pdf_url":"https://arxiv.org/pdf/2409.17854v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17851v1","updated":"2024-09-26T13:57:05Z","published":"2024-09-26T13:57:05Z","title":"A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts","summary":" Monocular depth estimation is a critical task for autonomous driving and many\nother computer vision applications. While significant progress has been made in\nthis field, the effects of viewpoint shifts on depth estimation models remain\nlargely underexplored. This paper introduces a novel dataset and evaluation\nmethodology to quantify the impact of different camera positions and\norientations on monocular depth estimation performance. We propose a ground\ntruth strategy based on homography estimation and object detection, eliminating\nthe need for expensive lidar sensors. We collect a diverse dataset of road\nscenes from multiple viewpoints and use it to assess the robustness of a modern\ndepth estimation model to geometric shifts. After assessing the validity of our\nstrategy on a public dataset, we provide valuable insights into the limitations\nof current models and highlight the importance of considering viewpoint\nvariations in real-world applications.\n","authors":["Aurel Pjetri","Stefano Caprasecca","Leonardo Taccari","Matteo Simoncini","Henrique Piñeiro Monteagudo","Walter Wallace","Douglas Coimbra de Andrade","Francesco Sambo","Andrew David Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2409.17851v1.pdf","comment":"17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on\n Vision-Centric Autonomous Driving (VCAD)"},{"id":"http://arxiv.org/abs/2404.04693v2","updated":"2024-09-26T13:53:33Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A Colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, fusing data from these two types\nof sensors is poorly performed in many existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v2.pdf","comment":"2024 IEEE International Conference on Robotics and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2403.10542v2","updated":"2024-09-26T13:38:48Z","published":"2024-03-08T23:04:14Z","title":"SF-MMCN: Low-Power Sever Flow Multi-Mode Diffusion Model Accelerator","summary":" Generative Artificial Intelligence (AI) has become incredibly popular in\nrecent years, and the significance of traditional accelerators in dealing with\nlarge-scale parameters is urgent. With the diffusion model's parallel\nstructure, the hardware design challenge has skyrocketed because of the\nmultiple layers operating simultaneously. Convolution Neural Network (CNN)\naccelerators have been designed and developed rapidly, especially for\nhigh-speed inference. Often, CNN models with parallel structures are deployed.\nIn these CNN accelerators, many Processing Elements (PE) are required to\nperform parallel computations, mainly the multiply and accumulation (MAC)\noperation, resulting in high power consumption and a large silicon area. In\nthis work, a Server Flow Multi-Mode CNN Unit (SF-MMCN) is proposed to reduce\nthe number of PE while improving the operation efficiency of the CNN\naccelerator. The pipelining technique is introduced into Server Flow to process\nparallel computations. The proposed SF-MMCN is implemented with TSMC 90-nm CMOS\ntechnology. It is evaluated with VGG-16, ResNet-18, and U-net. The evaluation\nresults show that the proposed SF-MMCN can reduce the power consumption by 92%,\nand the silicon area by 70%, while improving the efficiency of operation by\nnearly 81 times. A new FoM, area efficiency (GOPs/mm^2) is also introduced to\nevaluate the performance of the accelerator in terms of the ratio throughput\n(GOPs) and silicon area (mm^2). In this FoM, SF-MMCN improves area efficiency\nby 18 times (18.42).\n","authors":["Huan-Ke Hsu","I-Chyn Wey","T. Hui Teo"],"pdf_url":"https://arxiv.org/pdf/2403.10542v2.pdf","comment":"16 pages, 16 figures; extend the CNN to process Diffusion Model\n (possible this is the first reported hardware Diffusion Model implementation)"},{"id":"http://arxiv.org/abs/2407.17380v2","updated":"2024-09-26T13:37:04Z","published":"2024-07-24T16:04:18Z","title":"2D and 3D Deep Learning Models for MRI-based Parkinson's Disease\n Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold\n Networks, Convolutional Neural Networks, and Graph Convolutional Networks","summary":" Parkinson's Disease (PD) diagnosis remains challenging. This study applies\nConvolutional Kolmogorov-Arnold Networks (ConvKANs), integrating learnable\nspline-based activation functions into convolutional layers, for PD\nclassification using structural MRI. The first 3D implementation of ConvKANs\nfor medical imaging is presented, comparing their performance to Convolutional\nNeural Networks (CNNs) and Graph Convolutional Networks (GCNs) across three\nopen-source datasets. Isolated analyses assessed performance within individual\ndatasets, using cross-validation techniques. Holdout analyses evaluated\ncross-dataset generalizability by training models on two datasets and testing\non the third, mirroring real-world clinical scenarios. In isolated analyses, 2D\nConvKANs achieved the highest AUC of 0.99 (95% CI: 0.98-0.99) on the PPMI\ndataset, outperforming 2D CNNs (AUC: 0.97, p = 0.0092). 3D models showed\npromise, with 3D CNN and 3D ConvKAN reaching an AUC of 0.85 on PPMI. In holdout\nanalyses, 3D ConvKAN demonstrated superior generalization, achieving an AUC of\n0.85 on early-stage PD data. GCNs underperformed in 2D but improved in 3D\nimplementations. These findings highlight ConvKANs' potential for PD detection,\nemphasize the importance of 3D analysis in capturing subtle brain changes, and\nunderscore cross-dataset generalization challenges. This study advances\nAI-assisted PD diagnosis using structural MRI and emphasizes the need for\nlarger-scale validation.\n","authors":["Salil B Patel","Vicky Goh","James F FitzGerald","Chrystalina A Antoniades"],"pdf_url":"https://arxiv.org/pdf/2407.17380v2.pdf","comment":"7 figures"},{"id":"http://arxiv.org/abs/2406.04769v2","updated":"2024-09-26T13:31:40Z","published":"2024-06-07T09:15:29Z","title":"Diffusion-based Generative Image Outpainting for Recovery of\n FOV-Truncated CT Images","summary":" Field-of-view (FOV) recovery of truncated chest CT scans is crucial for\naccurate body composition analysis, which involves quantifying skeletal muscle\nand subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables\ndisease prognostication. Here, we present a method for recovering truncated CT\nslices using generative image outpainting. We train a diffusion model and apply\nit to truncated CT slices generated by simulating a small FOV. Our model\nreliably recovers the truncated anatomy and outperforms the previous\nstate-of-the-art despite being trained on 87% less data.\n","authors":["Michelle Espranita Liman","Daniel Rueckert","Florian J. Fintelmann","Philip Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04769v2.pdf","comment":"Shared last authorship: Florian J. Fintelmann and Philip M\\\"uller"},{"id":"http://arxiv.org/abs/2409.17830v1","updated":"2024-09-26T13:29:40Z","published":"2024-09-26T13:29:40Z","title":"Unsupervised Learning Based Multi-Scale Exposure Fusion","summary":" Unsupervised learning based multi-scale exposure fusion (ULMEF) is efficient\nfor fusing differently exposed low dynamic range (LDR) images into a higher\nquality LDR image for a high dynamic range (HDR) scene. Unlike supervised\nlearning, loss functions play a crucial role in the ULMEF. In this paper, novel\nloss functions are proposed for the ULMEF and they are defined by using all the\nimages to be fused and other differently exposed images from the same HDR\nscene. The proposed loss functions can guide the proposed ULMEF to learn more\nreliable information from the HDR scene than existing loss functions which are\ndefined by only using the set of images to be fused. As such, the quality of\nthe fused image is significantly improved. The proposed ULMEF also adopts a\nmulti-scale strategy that includes a multi-scale attention module to\neffectively preserve the scene depth and local contrast in the fused image.\nMeanwhile, the proposed ULMEF can be adopted to achieve exposure interpolation\nand exposure extrapolation. Extensive experiments show that the proposed ULMEF\nalgorithm outperforms state-of-the-art exposure fusion algorithms.\n","authors":["Chaobing Zheng","Shiqian Wu","Zhenggguo Li"],"pdf_url":"https://arxiv.org/pdf/2409.17830v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.17823v1","updated":"2024-09-26T13:21:02Z","published":"2024-09-26T13:21:02Z","title":"Kendall's $τ$ Coefficient for Logits Distillation","summary":" Knowledge distillation typically employs the Kullback-Leibler (KL) divergence\nto constrain the student model's output to match the soft labels provided by\nthe teacher model exactly. However, sometimes the optimization direction of the\nKL divergence loss is not always aligned with the task loss, where a smaller KL\ndivergence could lead to erroneous predictions that diverge from the soft\nlabels. This limitation often results in suboptimal optimization for the\nstudent. Moreover, even under temperature scaling, the KL divergence loss\nfunction tends to overly focus on the larger-valued channels in the logits,\ndisregarding the rich inter-class information provided by the multitude of\nsmaller-valued channels. This hard constraint proves too challenging for\nlightweight students, hindering further knowledge distillation. To address this\nissue, we propose a plug-and-play ranking loss based on Kendall's $\\tau$\ncoefficient, called Rank-Kendall Knowledge Distillation (RKKD). RKKD balances\nthe attention to smaller-valued channels by constraining the order of channel\nvalues in student logits, providing more inter-class relational information.\nThe rank constraint on the top-valued channels helps avoid suboptimal traps\nduring optimization. We also discuss different differentiable forms of\nKendall's $\\tau$ coefficient and demonstrate that the proposed ranking loss\nfunction shares a consistent optimization objective with the KL divergence.\nExtensive experiments on the CIFAR-100 and ImageNet datasets show that our RKKD\ncan enhance the performance of various knowledge distillation baselines and\noffer broad improvements across multiple teacher-student architecture\ncombinations.\n","authors":["Yuchen Guan","Runxi Cheng","Kang Liu","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2409.17823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16042v2","updated":"2024-09-26T13:18:24Z","published":"2024-09-24T12:44:27Z","title":"Enhanced Unsupervised Image-to-Image Translation Using Contrastive\n Learning and Histogram of Oriented Gradients","summary":" Image-to-Image Translation is a vital area of computer vision that focuses on\ntransforming images from one visual domain to another while preserving their\ncore content and structure. However, this field faces two major challenges:\nfirst, the data from the two domains are often unpaired, making it difficult to\ntrain generative adversarial networks effectively; second, existing methods\ntend to produce artifacts or hallucinations during image generation, leading to\na decline in image quality. To address these issues, this paper proposes an\nenhanced unsupervised image-to-image translation method based on the\nContrastive Unpaired Translation (CUT) model, incorporating Histogram of\nOriented Gradients (HOG) features. This novel approach ensures the preservation\nof the semantic structure of images, even without semantic labels, by\nminimizing the loss between the HOG features of input and generated images. The\nmethod was tested on translating synthetic game environments from GTA5 dataset\nto realistic urban scenes in cityscapes dataset, demonstrating significant\nimprovements in reducing hallucinations and enhancing image quality.\n","authors":["Wanchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.16042v2.pdf","comment":"Critical Errors in Data or Analysis"},{"id":"http://arxiv.org/abs/2409.17805v1","updated":"2024-09-26T12:58:01Z","published":"2024-09-26T12:58:01Z","title":"Cascade Prompt Learning for Vision-Language Model Adaptation","summary":" Prompt learning has surfaced as an effective approach to enhance the\nperformance of Vision-Language Models (VLMs) like CLIP when applied to\ndownstream tasks. However, current learnable prompt tokens are primarily used\nfor the single phase of adapting to tasks (i.e., adapting prompt), easily\nleading to overfitting risks. In this work, we propose a novel Cascade Prompt\nLearning CasPL framework to enable prompt learning to serve both generic and\nspecific expertise (i.e., boosting and adapting prompt) simultaneously.\nSpecifically, CasPL is a new learning paradigm comprising two distinct phases\nof learnable prompts: the first boosting prompt is crafted to extract\ndomain-general knowledge from a senior larger CLIP teacher model by aligning\ntheir predicted logits using extensive unlabeled domain images. The second\nadapting prompt is then cascaded with the frozen first set to fine-tune the\ndownstream tasks, following the approaches employed in prior research. In this\nmanner, CasPL can effectively capture both domain-general and task-specific\nrepresentations into explicitly different gradual groups of prompts, thus\npotentially alleviating overfitting issues in the target domain. It's worth\nnoting that CasPL serves as a plug-and-play module that can seamlessly\nintegrate into any existing prompt learning approach. CasPL achieves a\nsignificantly better balance between performance and inference speed, which is\nespecially beneficial for deploying smaller VLM models in resource-constrained\nenvironments. Compared to the previous state-of-the-art method PromptSRC, CasPL\nshows an average improvement of 1.85% for base classes, 3.44% for novel\nclasses, and 2.72% for the harmonic mean over 11 image classification datasets.\nCode is publicly available at: https://github.com/megvii-research/CasPL.\n","authors":["Ge Wu","Xin Zhang","Zheng Li","Zhaowei Chen","Jiajun Liang","Jian Yang","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17805v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17792v1","updated":"2024-09-26T12:37:50Z","published":"2024-09-26T12:37:50Z","title":"Reblurring-Guided Single Image Defocus Deblurring: A Learning Framework\n with Misaligned Training Pairs","summary":" For single image defocus deblurring, acquiring well-aligned training pairs\n(or training triplets), i.e., a defocus blurry image, an all-in-focus sharp\nimage (and a defocus blur map), is an intricate task for the development of\ndeblurring models. Existing image defocus deblurring methods typically rely on\ntraining data collected by specialized imaging equipment, presupposing that\nthese pairs or triplets are perfectly aligned. However, in practical scenarios\ninvolving the collection of real-world data, direct acquisition of training\ntriplets is infeasible, and training pairs inevitably encounter spatial\nmisalignment issues. In this work, we introduce a reblurring-guided learning\nframework for single image defocus deblurring, enabling the learning of a\ndeblurring network even with misaligned training pairs. Specifically, we first\npropose a baseline defocus deblurring network that utilizes spatially varying\ndefocus blur map as degradation prior to enhance the deblurring performance.\nThen, to effectively learn the baseline defocus deblurring network with\nmisaligned training pairs, our reblurring module ensures spatial consistency\nbetween the deblurred image, the reblurred image and the input blurry image by\nreconstructing spatially variant isotropic blur kernels. Moreover, the\nspatially variant blur derived from the reblurring module can serve as pseudo\nsupervision for defocus blur map during training, interestingly transforming\ntraining pairs into training triplets. Additionally, we have collected a new\ndataset specifically for single image defocus deblurring (SDD) with typical\nmisalignments, which not only substantiates our proposed method but also serves\nas a benchmark for future research.\n","authors":["Xinya Shu","Yu Li","Dongwei Ren","Xiaohe Wu","Jin Li","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2409.17792v1.pdf","comment":"The source code and dataset are available at\n https://github.com/ssscrystal/Reblurring-guided-JDRL"},{"id":"http://arxiv.org/abs/2409.17790v1","updated":"2024-09-26T12:37:22Z","published":"2024-09-26T12:37:22Z","title":"CASPFormer: Trajectory Prediction from BEV Images with Deformable\n Attention","summary":" Motion prediction is an important aspect for Autonomous Driving (AD) and\nAdvance Driver Assistance Systems (ADAS). Current state-of-the-art motion\nprediction methods rely on High Definition (HD) maps for capturing the\nsurrounding context of the ego vehicle. Such systems lack scalability in\nreal-world deployment as HD maps are expensive to produce and update in\nreal-time. To overcome this issue, we propose Context Aware Scene Prediction\nTransformer (CASPFormer), which can perform multi-modal motion prediction from\nrasterized Bird-Eye-View (BEV) images. Our system can be integrated with any\nupstream perception module that is capable of generating BEV images. Moreover,\nCASPFormer directly decodes vectorized trajectories without any postprocessing.\nTrajectories are decoded recurrently using deformable attention, as it is\ncomputationally efficient and provides the network with the ability to focus\nits attention on the important spatial locations of the BEV images. In\naddition, we also address the issue of mode collapse for generating multiple\nscene-consistent trajectories by incorporating learnable mode queries. We\nevaluate our model on the nuScenes dataset and show that it reaches\nstate-of-the-art across multiple metrics\n","authors":["Harsh Yadav","Maximilian Schaefer","Kun Zhao","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.17790v1.pdf","comment":"Under Review at ICPR 2024, Kolkata"},{"id":"http://arxiv.org/abs/2405.07865v4","updated":"2024-09-26T12:18:49Z","published":"2024-05-13T15:53:18Z","title":"AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous\n Driving","summary":" The scale-up of autonomous vehicles depends heavily on their ability to deal\nwith anomalies, such as rare objects on the road. In order to handle such\nsituations, it is necessary to detect anomalies in the first place. Anomaly\ndetection for autonomous driving has made great progress in the past years but\nsuffers from poorly designed benchmarks with a strong focus on camera data. In\nthis work, we propose AnoVox, the largest benchmark for ANOmaly detection in\nautonomous driving to date. AnoVox incorporates large-scale multimodal sensor\ndata and spatial VOXel ground truth, allowing for the comparison of methods\nindependent of their used sensor. We propose a formal definition of normality\nand provide a compliant training dataset. AnoVox is the first benchmark to\ncontain both content and temporal anomalies.\n","authors":["Daniel Bogdoll","Iramm Hamdard","Lukas Namgyu Rößler","Felix Geisler","Muhammed Bayram","Felix Wang","Jan Imhof","Miguel de Campos","Anushervon Tabarov","Yitian Yang","Hanno Gottschalk","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2405.07865v4.pdf","comment":"Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\\\"o{\\ss}ler\n contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop"},{"id":"http://arxiv.org/abs/2409.17778v1","updated":"2024-09-26T12:16:11Z","published":"2024-09-26T12:16:11Z","title":"Taming Diffusion Prior for Image Super-Resolution with Domain Shift SDEs","summary":" Diffusion-based image super-resolution (SR) models have attracted substantial\ninterest due to their powerful image restoration capabilities. However,\nprevailing diffusion models often struggle to strike an optimal balance between\nefficiency and performance. Typically, they either neglect to exploit the\npotential of existing extensive pretrained models, limiting their generative\ncapacity, or they necessitate a dozens of forward passes starting from random\nnoises, compromising inference efficiency. In this paper, we present DoSSR, a\nDomain Shift diffusion-based SR model that capitalizes on the generative powers\nof pretrained diffusion models while significantly enhancing efficiency by\ninitiating the diffusion process with low-resolution (LR) images. At the core\nof our approach is a domain shift equation that integrates seamlessly with\nexisting diffusion models. This integration not only improves the use of\ndiffusion prior but also boosts inference efficiency. Moreover, we advance our\nmethod by transitioning the discrete shift process to a continuous formulation,\ntermed as DoS-SDEs. This advancement leads to the fast and customized solvers\nthat further enhance sampling efficiency. Empirical results demonstrate that\nour proposed method achieves state-of-the-art performance on synthetic and\nreal-world datasets, while notably requiring only 5 sampling steps. Compared to\nprevious diffusion prior based methods, our approach achieves a remarkable\nspeedup of 5-7 times, demonstrating its superior efficiency. Code:\nhttps://github.com/QinpengCui/DoSSR.\n","authors":["Qinpeng Cui","Yixuan Liu","Xinyi Zhang","Qiqi Bao","Zhongdao Wang","Qingmin Liao","Li Wang","Tian Lu","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2409.17778v1.pdf","comment":"This paper is accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17777v1","updated":"2024-09-26T12:15:13Z","published":"2024-09-26T12:15:13Z","title":"Harnessing Shared Relations via Multimodal Mixup Contrastive Learning\n for Multimodal Classification","summary":" Deep multimodal learning has shown remarkable success by leveraging\ncontrastive learning to capture explicit one-to-one relations across\nmodalities. However, real-world data often exhibits shared relations beyond\nsimple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive\nLearning approach to capture nuanced shared relations inherent in multimodal\ndata. Our key contribution is a Mixup-based contrastive loss that learns robust\nrepresentations by aligning mixed samples from one modality with their\ncorresponding samples from other modalities thereby capturing shared relations\nbetween them. For multimodal classification tasks, we introduce a framework\nthat integrates a fusion module with unimodal prediction modules for auxiliary\nsupervision during training, complemented by our proposed Mixup-based\ncontrastive loss. Through extensive experiments on diverse datasets (N24News,\nROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures\nshared multimodal relations and generalizes across domains. It outperforms\nstate-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving\ncomparable performance on Food-101. Our work highlights the significance of\nlearning shared relations for robust multimodal learning, opening up promising\navenues for future research.\n","authors":["Raja Kumar","Raghav Singhal","Pranamya Kulkarni","Deval Mehta","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2409.17777v1.pdf","comment":"RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9\n Tables"},{"id":"http://arxiv.org/abs/2409.17775v1","updated":"2024-09-26T12:13:52Z","published":"2024-09-26T12:13:52Z","title":"UNICORN: A Deep Learning Model for Integrating Multi-Stain Data in\n Histopathology","summary":" Background: The integration of multi-stain histopathology images through deep\nlearning poses a significant challenge in digital histopathology. Current\nmulti-modal approaches struggle with data heterogeneity and missing data. This\nstudy aims to overcome these limitations by developing a novel transformer\nmodel for multi-stain integration that can handle missing data during training\nas well as inference. Methods: We propose UNICORN (UNiversal modality\nIntegration Network for CORonary classificatioN) a multi-modal transformer\ncapable of processing multi-stain histopathology for atherosclerosis severity\nclass prediction. The architecture comprises a two-stage, end-to-end trainable\nmodel with specialized modules utilizing transformer self-attention blocks. The\ninitial stage employs domain-specific expert modules to extract features from\neach modality. In the subsequent stage, an aggregation expert module integrates\nthese features by learning the interactions between the different data\nmodalities. Results: Evaluation was performed using a multi-class dataset of\natherosclerotic lesions from the Munich Cardiovascular Studies Biobank\n(MISSION), using over 4,000 paired multi-stain whole slide images (WSIs) from\n170 deceased individuals on 7 prespecified segments of the coronary tree, each\nstained according to four histopathological protocols. UNICORN achieved a\nclassification accuracy of 0.67, outperforming other state-of-the-art models.\nThe model effectively identifies relevant tissue phenotypes across stainings\nand implicitly models disease progression. Conclusion: Our proposed multi-modal\ntransformer model addresses key challenges in medical data analysis, including\ndata heterogeneity and missing modalities. Explainability and the model's\neffectiveness in predicting atherosclerosis progression underscores its\npotential for broader applications in medical research.\n","authors":["Valentin Koch","Sabine Bauer","Valerio Luppberger","Michael Joner","Heribert Schunkert","Julia A. Schnabel","Moritz von Scheidt","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2409.17775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17759v1","updated":"2024-09-26T11:53:25Z","published":"2024-09-26T11:53:25Z","title":"LGFN: Lightweight Light Field Image Super-Resolution using Local\n Convolution Modulation and Global Attention Feature Extraction","summary":" Capturing different intensity and directions of light rays at the same scene\nLight field (LF) can encode the 3D scene cues into a 4D LF image which has a\nwide range of applications (i.e. post-capture refocusing and depth sensing). LF\nimage super-resolution (SR) aims to improve the image resolution limited by the\nperformance of LF camera sensor. Although existing methods have achieved\npromising results the practical application of these models is limited because\nthey are not lightweight enough. In this paper we propose a lightweight model\nnamed LGFN which integrates the local and global features of different views\nand the features of different channels for LF image SR. Specifically owing to\nneighboring regions of the same pixel position in different sub-aperture images\nexhibit similar structural relationships we design a lightweight CNN-based\nfeature extraction module (namely DGCE) to extract local features better\nthrough feature modulation. Meanwhile as the position beyond the boundaries in\nthe LF image presents a large disparity we propose an efficient spatial\nattention module (namely ESAM) which uses decomposable large-kernel convolution\nto obtain an enlarged receptive field and an efficient channel attention module\n(namely ECAM). Compared with the existing LF image SR models with large\nparameter our model has a parameter of 0.45M and a FLOPs of 19.33G which has\nachieved a competitive effect. Extensive experiments with ablation studies\ndemonstrate the effectiveness of our proposed method which ranked the second\nplace in the Track 2 Fidelity & Efficiency of NTIRE2024 Light Field Super\nResolution Challenge and the seventh place in the Track 1 Fidelity.\n","authors":["Zhongxin Yu","Liang Chen","Zhiyun Zeng","Kunping Yang","Shaofei Luo","Shaorui Chen","Cheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.17759v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.09369v3","updated":"2024-09-26T11:38:12Z","published":"2024-09-14T08:47:45Z","title":"Interpretable Vision-Language Survival Analysis with Ordinal Inductive\n Bias for Computational Pathology","summary":" Histopathology Whole-Slide Images (WSIs) provide an important tool to assess\ncancer prognosis in computational pathology (CPATH). While existing survival\nanalysis (SA) approaches have made exciting progress, they are generally\nlimited to adopting highly-expressive architectures and only coarse-grained\npatient-level labels to learn prognostic visual representations from gigapixel\nWSIs. Such learning paradigm suffers from important performance bottlenecks,\nwhen facing present scarce training data and standard multi-instance learning\n(MIL) framework in CPATH. To overcome it, this paper, for the first time,\nproposes a new Vision-Language-based SA (VLSA) paradigm. Concretely, (1) VLSA\nis driven by pathology VL foundation models. It no longer relies on\nhigh-capability networks and shows the advantage of data efficiency. (2) In\nvision-end, VLSA encodes prognostic language prior and then employs it as\nauxiliary signals to guide the aggregating of prognostic visual features at\ninstance level, thereby compensating for the weak supervision in MIL. Moreover,\ngiven the characteristics of SA, we propose i) ordinal survival prompt learning\nto transform continuous survival labels into textual prompts; and ii) ordinal\nincidence function as prediction target to make SA compatible with VL-based\nprediction. Notably, VLSA's predictions can be interpreted intuitively by our\nShapley values-based method. The extensive experiments on five datasets confirm\nthe effectiveness of our scheme. Our VLSA could pave a new way for SA in CPATH\nby offering weakly-supervised MIL an effective means to learn valuable\nprognostic clues from gigapixel WSIs. Our source code is available at\nhttps://github.com/liupei101/VLSA.\n","authors":["Pei Liu","Luping Ji","Jiaxiang Gou","Bo Fu","Mao Ye"],"pdf_url":"https://arxiv.org/pdf/2409.09369v3.pdf","comment":"24 pages, 11 tables, 6 figures"},{"id":"http://arxiv.org/abs/2401.01008v3","updated":"2024-09-26T11:35:22Z","published":"2023-12-13T17:05:37Z","title":"Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models","summary":" Text-to-image diffusion models have demonstrated unprecedented capabilities\nfor flexible and realistic image synthesis. Nevertheless, these models rely on\na time-consuming sampling procedure, which has motivated attempts to reduce\ntheir latency. When improving efficiency, researchers often use the original\ndiffusion model to train an additional network designed specifically for fast\nimage generation. In contrast, our approach seeks to reduce latency directly,\nwithout any retraining, fine-tuning, or knowledge distillation. In particular,\nwe find the repeated calculation of attention maps to be costly yet redundant,\nand instead suggest reusing them during sampling. Our specific reuse strategies\nare based on ODE theory, which implies that the later a map is reused, the\nsmaller the distortion in the final image. We empirically compare these reuse\nstrategies with few-step sampling procedures of comparable latency, finding\nthat reuse generates images that are closer to those produced by the original\nhigh-latency diffusion model.\n","authors":["Rosco Hunter","Łukasz Dudziak","Mohamed S. Abdelfattah","Abhinav Mehrotra","Sourav Bhattacharya","Hongkai Wen"],"pdf_url":"https://arxiv.org/pdf/2401.01008v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12844v2","updated":"2024-09-26T11:29:04Z","published":"2024-02-20T09:13:15Z","title":"ICON: Improving Inter-Report Consistency in Radiology Report Generation\n via Lesion-aware Mixup Augmentation","summary":" Previous research on radiology report generation has made significant\nprogress in terms of increasing the clinical accuracy of generated reports. In\nthis paper, we emphasize another crucial quality that it should possess, i.e.,\ninter-report consistency, which refers to the capability of generating\nconsistent reports for semantically equivalent radiographs. This quality is\neven of greater significance than the overall report accuracy in terms of\nensuring the system's credibility, as a system prone to providing conflicting\nresults would severely erode users' trust. Regrettably, existing approaches\nstruggle to maintain inter-report consistency, exhibiting biases towards common\npatterns and susceptibility to lesion variants. To address this issue, we\npropose ICON, which improves the inter-report consistency of radiology report\ngeneration. Aiming to enhance the system's ability to capture similarities in\nsemantically equivalent lesions, our approach first involves extracting lesions\nfrom input images and examining their characteristics. Then, we introduce a\nlesion-aware mixup technique to ensure that the representations of the\nsemantically equivalent lesions align with the same attributes, achieved\nthrough a linear combination during the training phase. Extensive experiments\non three publicly available chest X-ray datasets verify the effectiveness of\nour approach, both in terms of improving the consistency and accuracy of the\ngenerated reports.\n","authors":["Wenjun Hou","Yi Cheng","Kaishuai Xu","Yan Hu","Wenjie Li","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17747v1","updated":"2024-09-26T11:23:59Z","published":"2024-09-26T11:23:59Z","title":"Text Image Generation for Low-Resource Languages with Dual Translation\n Learning","summary":" Scene text recognition in low-resource languages frequently faces challenges\ndue to the limited availability of training datasets derived from real-world\nscenes. This study proposes a novel approach that generates text images in\nlow-resource languages by emulating the style of real text images from\nhigh-resource languages. Our approach utilizes a diffusion model that is\nconditioned on binary states: ``synthetic'' and ``real.'' The training of this\nmodel involves dual translation tasks, where it transforms plain text images\ninto either synthetic or real text images, based on the binary states. This\napproach not only effectively differentiates between the two domains but also\nfacilitates the model's explicit recognition of characters in the target\nlanguage. Furthermore, to enhance the accuracy and variety of generated text\nimages, we introduce two guidance techniques: Fidelity-Diversity Balancing\nGuidance and Fidelity Enhancement Guidance. Our experimental results\ndemonstrate that the text images generated by our proposed framework can\nsignificantly improve the performance of scene text recognition models for\nlow-resource languages.\n","authors":["Chihiro Noguchi","Shun Fukuda","Shoichiro Mihara","Masao Yamanaka"],"pdf_url":"https://arxiv.org/pdf/2409.17747v1.pdf","comment":"23 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.06945v2","updated":"2024-09-26T11:21:27Z","published":"2024-05-11T07:56:19Z","title":"Direct Learning of Mesh and Appearance via 3D Gaussian Splatting","summary":" Accurately reconstructing a 3D scene including explicit geometry information\nis both attractive and challenging. Geometry reconstruction can benefit from\nincorporating differentiable appearance models, such as Neural Radiance Fields\nand 3D Gaussian Splatting (3DGS). However, existing methods encounter\nefficiency issues due to indirect geometry learning and the paradigm of\nseparately modeling geometry and surface appearance. In this work, we propose a\nlearnable scene model that incorporates 3DGS with an explicit geometry\nrepresentation, namely a mesh. Our model learns the mesh and appearance in an\nend-to-end manner, where we bind 3D Gaussians to the mesh faces and perform\ndifferentiable rendering of 3DGS to obtain photometric supervision. The model\ncreates an effective information pathway to supervise the learning of both 3DGS\nand mesh. Experimental results demonstrate that the learned scene model not\nonly achieves state-of-the-art efficiency and rendering quality but also\nsupports manipulation using the explicit mesh. In addition, our model has a\nunique advantage in adapting to scene updates, thanks to the end-to-end\nlearning of both mesh and appearance.\n","authors":["Ancheng Lin","Jun Li"],"pdf_url":"https://arxiv.org/pdf/2405.06945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17740v1","updated":"2024-09-26T11:15:15Z","published":"2024-09-26T11:15:15Z","title":"AnyLogo: Symbiotic Subject-Driven Diffusion System with Gemini Status","summary":" Diffusion models have made compelling progress on facilitating\nhigh-throughput daily production. Nevertheless, the appealing customized\nrequirements are remain suffered from instance-level finetuning for authentic\nfidelity. Prior zero-shot customization works achieve the semantic consistence\nthrough the condensed injection of identity features, while addressing detailed\nlow-level signatures through complex model configurations and subject-specific\nfabrications, which significantly break the statistical coherence within the\noverall system and limit the applicability across various scenarios. To\nfacilitate the generic signature concentration with rectified efficiency, we\npresent \\textbf{AnyLogo}, a zero-shot region customizer with remarkable detail\nconsistency, building upon the symbiotic diffusion system with eliminated\ncumbersome designs. Streamlined as vanilla image generation, we discern that\nthe rigorous signature extraction and creative content generation are\npromisingly compatible and can be systematically recycled within a single\ndenoising model. In place of the external configurations, the gemini status of\nthe denoising model promote the reinforced subject transmission efficiency and\ndisentangled semantic-signature space with continuous signature decoration.\nMoreover, the sparse recycling paradigm is adopted to prevent the duplicated\nrisk with compressed transmission quota for diversified signature stimulation.\nExtensive experiments on constructed logo-level benchmarks demonstrate the\neffectiveness and practicability of our methods.\n","authors":["Jinghao Zhang","Wen Qian","Hao Luo","Fan Wang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.17740v1.pdf","comment":"13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17729v1","updated":"2024-09-26T10:58:31Z","published":"2024-09-26T10:58:31Z","title":"Neural Implicit Representation for Highly Dynamic LiDAR Mapping and\n Odometry","summary":" Recent advancements in Simultaneous Localization and Mapping (SLAM) have\nincreasingly highlighted the robustness of LiDAR-based techniques. At the same\ntime, Neural Radiance Fields (NeRF) have introduced new possibilities for 3D\nscene reconstruction, exemplified by SLAM systems. Among these, NeRF-LOAM has\nshown notable performance in NeRF-based SLAM applications. However, despite its\nstrengths, these systems often encounter difficulties in dynamic outdoor\nenvironments due to their inherent static assumptions. To address these\nlimitations, this paper proposes a novel method designed to improve\nreconstruction in highly dynamic outdoor scenes. Based on NeRF-LOAM, the\nproposed approach consists of two primary components. First, we separate the\nscene into static background and dynamic foreground. By identifying and\nexcluding dynamic elements from the mapping process, this segmentation enables\nthe creation of a dense 3D map that accurately represents the static background\nonly. The second component extends the octree structure to support\nmulti-resolution representation. This extension not only enhances\nreconstruction quality but also aids in the removal of dynamic objects\nidentified by the first module. Additionally, Fourier feature encoding is\napplied to the sampled points, capturing high-frequency information and leading\nto more complete reconstruction results. Evaluations on various datasets\ndemonstrate that our method achieves more competitive results compared to\ncurrent state-of-the-art approaches.\n","authors":["Qi Zhang","He Wang","Ru Li","Wenbin Li"],"pdf_url":"https://arxiv.org/pdf/2409.17729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17728v1","updated":"2024-09-26T10:57:02Z","published":"2024-09-26T10:57:02Z","title":"AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with\n Alternative Modality Masking","summary":" Camera-LiDAR fusion models significantly enhance perception performance in\nautonomous driving. The fusion mechanism leverages the strengths of each\nmodality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR\nfusion models utilize pre-trained backbones for efficient training. However, we\nargue that directly loading single-modal pre-trained camera and LiDAR backbones\ninto camera-LiDAR fusion models introduces similar feature redundancy across\nmodalities due to the nature of the fusion mechanism. Unfortunately, existing\npruning methods are developed explicitly for single-modal models, and thus,\nthey struggle to effectively identify these specific redundant parameters in\ncamera-LiDAR fusion models. In this paper, to address the issue above on\ncamera-LiDAR fusion models, we propose a novelty pruning framework Alternative\nModality Masking Pruning (AlterMOMA), which employs alternative masking on each\nmodality and identifies the redundant parameters. Specifically, when one\nmodality parameters are masked (deactivated), the absence of features from the\nmasked backbone compels the model to reactivate previous redundant features of\nthe other modality backbone. Therefore, these redundant features and relevant\nredundant parameters can be identified via the reactivation process. The\nredundant parameters can be pruned by our proposed importance score evaluation\nfunction, Alternative Evaluation (AlterEva), which is based on the observation\nof the loss changes when certain modality parameters are activated and\ndeactivated. Extensive experiments on the nuScene and KITTI datasets\nencompassing diverse tasks, baseline models, and pruning algorithms showcase\nthat AlterMOMA outperforms existing pruning methods, attaining state-of-the-art\nperformance.\n","authors":["Shiqi Sun","Yantao Lu","Ning Liu","Bo Jiang","JinChao Chen","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17728v1.pdf","comment":"17 pages, 3 figures, Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17727v1","updated":"2024-09-26T10:56:35Z","published":"2024-09-26T10:56:35Z","title":"Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications","summary":" Vision language models have played a key role in extracting meaningful\nfeatures for various robotic applications. Among these, Contrastive\nLanguage-Image Pretraining (CLIP) is widely used in robotic tasks that require\nboth vision and natural language understanding. However, CLIP was trained\nsolely on static images paired with text prompts and has not yet been fully\nadapted for robotic tasks involving dynamic actions. In this paper, we\nintroduce Robotic-CLIP to enhance robotic perception capabilities. We first\ngather and label large-scale action data, and then build our Robotic-CLIP by\nfine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using\ncontrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's\nstrong image performance while gaining the ability to understand actions in\nrobotic contexts. Intensive experiments show that our Robotic-CLIP outperforms\nother CLIP-based models across various language-driven robotic tasks.\nAdditionally, we demonstrate the practical effectiveness of Robotic-CLIP in\nreal-world grasping applications.\n","authors":["Nghia Nguyen","Minh Nhat Vu","Tung D. Ta","Baoru Huang","Thieu Vo","Ngan Le","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.17727v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.17720v1","updated":"2024-09-26T10:43:09Z","published":"2024-09-26T10:43:09Z","title":"Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations\n Between Initial and Final Scenes","summary":" With robots increasingly collaborating with humans in everyday tasks, it is\nimportant to take steps toward robotic systems capable of understanding the\nenvironment. This work focuses on scene understanding to detect pick and place\ntasks given initial and final images from the scene. To this end, a dataset is\ncollected for object detection and pick and place task detection. A YOLOv5\nnetwork is subsequently trained to detect the objects in the initial and final\nscenes. Given the detected objects and their bounding boxes, two methods are\nproposed to detect the pick and place tasks which transform the initial scene\ninto the final scene. A geometric method is proposed which tracks objects'\nmovements in the two scenes and works based on the intersection of the bounding\nboxes which moved within scenes. Contrarily, the CNN-based method utilizes a\nConvolutional Neural Network to classify objects with intersected bounding\nboxes into 5 classes, showing the spatial relationship between the involved\nobjects. The performed pick and place tasks are then derived from analyzing the\nexperiments with both scenes. Results show that the CNN-based method, using a\nVGG16 backbone, outscores the geometric method by roughly 12 percentage points\nin certain scenarios, with an overall success rate of 84.3%.\n","authors":["Seraj Ghasemi","Hamed Hosseini","MohammadHossein Koosheshi","Mehdi Tale Masouleh","Ahmad Kalhor"],"pdf_url":"https://arxiv.org/pdf/2409.17720v1.pdf","comment":"Conference Paper, ICEE 2024, 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17717v1","updated":"2024-09-26T10:40:23Z","published":"2024-09-26T10:40:23Z","title":"Behaviour4All: in-the-wild Facial Behaviour Analysis Toolkit","summary":" In this paper, we introduce Behavior4All, a comprehensive, open-source\ntoolkit for in-the-wild facial behavior analysis, integrating Face\nLocalization, Valence-Arousal Estimation, Basic Expression Recognition and\nAction Unit Detection, all within a single framework. Available in both\nCPU-only and GPU-accelerated versions, Behavior4All leverages 12 large-scale,\nin-the-wild datasets consisting of over 5 million images from diverse\ndemographic groups. It introduces a novel framework that leverages distribution\nmatching and label co-annotation to address tasks with non-overlapping\nannotations, encoding prior knowledge of their relatedness. In the largest\nstudy of its kind, Behavior4All outperforms both state-of-the-art and toolkits\nin overall performance as well as fairness across all databases and tasks. It\nalso demonstrates superior generalizability on unseen databases and on compound\nexpression recognition. Finally, Behavior4All is way times faster than other\ntoolkits.\n","authors":["Dimitrios Kollias","Chunchang Shao","Odysseus Kaloidas","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2409.17717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00230v3","updated":"2024-09-26T10:27:58Z","published":"2024-03-30T03:19:50Z","title":"Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space","summary":" Watermarking is a tool for actively identifying and attributing the images\ngenerated by latent diffusion models. Existing methods face the dilemma of\nimage quality and watermark robustness. Watermarks with superior image quality\nusually have inferior robustness against attacks such as blurring and JPEG\ncompression, while watermarks with superior robustness usually significantly\ndamage image quality. This dilemma stems from the traditional paradigm where\nwatermarks are injected and detected in pixel space, relying on pixel\nperturbation for watermark detection and resilience against attacks. In this\npaper, we highlight that an effective solution to the problem is to both inject\nand detect watermarks in the latent diffusion space, and propose Latent\nWatermark with a progressive training strategy. It weakens the direct\nconnection between quality and robustness and thus alleviates their\ncontradiction. We conduct evaluations on two datasets and against 10 watermark\nattacks. Six metrics measure the image quality and watermark robustness.\nResults show that compared to the recently proposed methods such as\nStableSignature, StegaStamp, RoSteALS, LaWa, TreeRing, and DiffuseTrace, LW not\nonly surpasses them in terms of robustness but also offers superior image\nquality. Our code will be available at\nhttps://github.com/RichardSunnyMeng/LatentWatermark.\n","authors":["Zheling Meng","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.00230v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05024v2","updated":"2024-09-26T09:55:49Z","published":"2024-09-08T08:33:32Z","title":"Deep Self-Cleansing for Medical Image Segmentation with Noisy Labels","summary":" Medical image segmentation is crucial in the field of medical imaging, aiding\nin disease diagnosis and surgical planning. Most established segmentation\nmethods rely on supervised deep learning, in which clean and precise labels are\nessential for supervision and significantly impact the performance of models.\nHowever, manually delineated labels often contain noise, such as missing labels\nand inaccurate boundary delineation, which can hinder networks from correctly\nmodeling target characteristics. In this paper, we propose a deep\nself-cleansing segmentation framework that can preserve clean labels while\ncleansing noisy ones in the training phase. To achieve this, we devise a\ngaussian mixture model-based label filtering module that distinguishes noisy\nlabels from clean labels. Additionally, we develop a label cleansing module to\ngenerate pseudo low-noise labels for identified noisy samples. The preserved\nclean labels and pseudo-labels are then used jointly to supervise the network.\nValidated on a clinical liver tumor dataset and a public cardiac diagnosis\ndataset, our method can effectively suppress the interference from noisy labels\nand achieve prominent segmentation performance.\n","authors":["Jiahua Dong","Yue Zhang","Qiuli Wang","Ruofeng Tong","Shihong Ying","Shaolin Gong","Xuanpu Zhang","Lanfen Lin","Yen-Wei Chen","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.05024v2.pdf","comment":"31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17686v1","updated":"2024-09-26T09:51:11Z","published":"2024-09-26T09:51:11Z","title":"MoGenTS: Motion Generation based on Spatial-Temporal Joint Modeling","summary":" Motion generation from discrete quantization offers many advantages over\ncontinuous regression, but at the cost of inevitable approximation errors.\nPrevious methods usually quantize the entire body pose into one code, which not\nonly faces the difficulty in encoding all joints within one vector but also\nloses the spatial relationship between different joints. Differently, in this\nwork we quantize each individual joint into one vector, which i) simplifies the\nquantization process as the complexity associated with a single joint is\nmarkedly lower than that of the entire pose; ii) maintains a spatial-temporal\nstructure that preserves both the spatial relationships among joints and the\ntemporal movement patterns; iii) yields a 2D token map, which enables the\napplication of various 2D operations widely used in 2D images. Grounded in the\n2D motion quantization, we build a spatial-temporal modeling framework, where\n2D joint VQVAE, temporal-spatial 2D masking technique, and spatial-temporal 2D\nattention are proposed to take advantage of spatial-temporal signals among the\n2D tokens. Extensive experiments demonstrate that our method significantly\noutperforms previous methods across different datasets, with a $26.6\\%$\ndecrease of FID on HumanML3D and a $29.9\\%$ decrease on KIT-ML.\n","authors":["Weihao Yuan","Weichao Shen","Yisheng He","Yuan Dong","Xiaodong Gu","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17686v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17682v1","updated":"2024-09-26T09:48:24Z","published":"2024-09-26T09:48:24Z","title":"Dark Miner: Defend against unsafe generation for text-to-image diffusion\n models","summary":" Text-to-image diffusion models have been demonstrated with unsafe generation\ndue to unfiltered large-scale training data, such as violent, sexual, and\nshocking images, necessitating the erasure of unsafe concepts. Most existing\nmethods focus on modifying the generation probabilities conditioned on the\ntexts containing unsafe descriptions. However, they fail to guarantee safe\ngeneration for unseen texts in the training phase, especially for the prompts\nfrom adversarial attacks. In this paper, we re-analyze the erasure task and\npoint out that existing methods cannot guarantee the minimization of the total\nprobabilities of unsafe generation. To tackle this problem, we propose Dark\nMiner. It entails a recurring three-stage process that comprises mining,\nverifying, and circumventing. It greedily mines embeddings with maximum\ngeneration probabilities of unsafe concepts and reduces unsafe generation more\neffectively. In the experiments, we evaluate its performance on two\ninappropriate concepts, two objects, and two styles. Compared with 6 previous\nstate-of-the-art methods, our method achieves better erasure and defense\nresults in most cases, especially under 4 state-of-the-art attacks, while\npreserving the model's native generation capability. Our code will be available\non GitHub.\n","authors":["Zheling Meng","Bo Peng","Xiaochuan Jin","Yue Jiang","Jing Dong","Wei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2409.17682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17680v1","updated":"2024-09-26T09:43:50Z","published":"2024-09-26T09:43:50Z","title":"Event-based Stereo Depth Estimation: A Survey","summary":" Stereopsis has widespread appeal in robotics as it is the predominant way by\nwhich living beings perceive depth to navigate our 3D world. Event cameras are\nnovel bio-inspired sensors that detect per-pixel brightness changes\nasynchronously, with very high temporal resolution and high dynamic range,\nenabling machine perception in high-speed motion and broad illumination\nconditions. The high temporal precision also benefits stereo matching, making\ndisparity (depth) estimation a popular research area for event cameras ever\nsince its inception. Over the last 30 years, the field has evolved rapidly,\nfrom low-latency, low-power circuit design to current deep learning (DL)\napproaches driven by the computer vision community. The bibliography is vast\nand difficult to navigate for non-experts due its highly interdisciplinary\nnature. Past surveys have addressed distinct aspects of this topic, in the\ncontext of applications, or focusing only on a specific class of techniques,\nbut have overlooked stereo datasets. This survey provides a comprehensive\noverview, covering both instantaneous stereo and long-term methods suitable for\nsimultaneous localization and mapping (SLAM), along with theoretical and\nempirical comparisons. It is the first to extensively review DL methods as well\nas stereo datasets, even providing practical suggestions for creating new\nbenchmarks to advance the field. The main advantages and challenges faced by\nevent-based stereo depth estimation are also discussed. Despite significant\nprogress, challenges remain in achieving optimal performance in not only\naccuracy but also efficiency, a cornerstone of event-based computing. We\nidentify several gaps and propose future research directions. We hope this\nsurvey inspires future research in this area, by serving as an accessible entry\npoint for newcomers, as well as a practical guide for seasoned researchers in\nthe community.\n","authors":["Suman Ghosh","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2409.17680v1.pdf","comment":"28 pages, 20 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.17675v1","updated":"2024-09-26T09:34:33Z","published":"2024-09-26T09:34:33Z","title":"EM-Net: Efficient Channel and Frequency Learning with Mamba for 3D\n Medical Image Segmentation","summary":" Convolutional neural networks have primarily led 3D medical image\nsegmentation but may be limited by small receptive fields. Transformer models\nexcel in capturing global relationships through self-attention but are\nchallenged by high computational costs at high resolutions. Recently, Mamba, a\nstate space model, has emerged as an effective approach for sequential\nmodeling. Inspired by its success, we introduce a novel Mamba-based 3D medical\nimage segmentation model called EM-Net. It not only efficiently captures\nattentive interaction between regions by integrating and selecting channels,\nbut also effectively utilizes frequency domain to harmonize the learning of\nfeatures across varying scales, while accelerating training speed.\nComprehensive experiments on two challenging multi-organ datasets with other\nstate-of-the-art (SOTA) algorithms show that our method exhibits better\nsegmentation accuracy while requiring nearly half the parameter size of SOTA\nmodels and 2x faster training speed.\n","authors":["Ao Chang","Jiajun Zeng","Ruobing Huang","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2409.17675v1.pdf","comment":"10 pages, 3 figures, accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.17674v1","updated":"2024-09-26T09:33:20Z","published":"2024-09-26T09:33:20Z","title":"Self-Supervised Learning of Deviation in Latent Representation for\n Co-speech Gesture Video Generation","summary":" Gestures are pivotal in enhancing co-speech communication. While recent works\nhave mostly focused on point-level motion transformation or fully supervised\nmotion representations through data-driven approaches, we explore the\nrepresentation of gestures in co-speech, with a focus on self-supervised\nrepresentation and pixel-level motion deviation, utilizing a diffusion model\nwhich incorporates latent motion features. Our approach leverages\nself-supervised deviation in latent representation to facilitate hand gestures\ngeneration, which are crucial for generating realistic gesture videos. Results\nof our first experiment demonstrate that our method enhances the quality of\ngenerated videos, with an improvement from 2.7 to 4.5% for FGD, DIV, and FVD,\nand 8.1% for PSNR, 2.5% for SSIM over the current state-of-the-art methods.\n","authors":["Huan Yang","Jiahui Chen","Chaofan Ding","Runhua Shi","Siyu Xiong","Qingqi Hong","Xiaoqi Mo","Xinhan Di"],"pdf_url":"https://arxiv.org/pdf/2409.17674v1.pdf","comment":"5 pages, 5 figures, conference"},{"id":"http://arxiv.org/abs/2404.09486v2","updated":"2024-09-26T09:31:48Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Benchmarking Multimodal Large Language Models for Code\n Generation with Visually Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/likaixin2000/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Zhiyong Huang","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17671v1","updated":"2024-09-26T09:30:37Z","published":"2024-09-26T09:30:37Z","title":"Leveraging Anthropometric Measurements to Improve Human Mesh Estimation\n and Ensure Consistent Body Shapes","summary":" The basic body shape of a person does not change within a single video.\nHowever, most SOTA human mesh estimation (HME) models output a slightly\ndifferent body shape for each video frame, which results in inconsistent body\nshapes for the same person. In contrast, we leverage anthropometric\nmeasurements like tailors are already obtaining from humans for centuries. We\ncreate a model called A2B that converts such anthropometric measurements to\nbody shape parameters of human mesh models. Moreover, we find that finetuned\nSOTA 3D human pose estimation (HPE) models outperform HME models regarding the\nprecision of the estimated keypoints. We show that applying inverse kinematics\n(IK) to the results of such a 3D HPE model and combining the resulting body\npose with the A2B body shape leads to superior and consistent human meshes for\nchallenging datasets like ASPset or fit3D, where we can lower the MPJPE by over\n30 mm compared to SOTA HME models. Further, replacing HME models estimates of\nthe body shape parameters with A2B model results not only increases the\nperformance of these HME models, but also leads to consistent body shapes.\n","authors":["Katja Ludwig","Julian Lorenz","Daniel Kienzle","Tuan Bui","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2409.17671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2402.18411v3","updated":"2024-09-26T09:00:00Z","published":"2024-02-28T15:31:45Z","title":"Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal\n Transport","summary":" Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images\nsharing the same category across diverse domains without relying on labeled\ndata. Prior approaches have typically decomposed the UCIR problem into two\ndistinct tasks: intra-domain representation learning and cross-domain feature\nalignment. However, these segregated strategies overlook the potential\nsynergies between these tasks. This paper introduces ProtoOT, a novel Optimal\nTransport formulation explicitly tailored for UCIR, which integrates\nintra-domain feature representation learning and cross-domain alignment into a\nunified framework. ProtoOT leverages the strengths of the K-means clustering\nmethod to effectively manage distribution imbalances inherent in UCIR. By\nutilizing K-means for generating initial prototypes and approximating class\nmarginal distributions, we modify the constraints in Optimal Transport\naccordingly, significantly enhancing its performance in UCIR scenarios.\nFurthermore, we incorporate contrastive learning into the ProtoOT framework to\nfurther improve representation learning. This encourages local semantic\nconsistency among features with similar semantics, while also explicitly\nenforcing separation between features and unmatched prototypes, thereby\nenhancing global discriminativeness. ProtoOT surpasses existing\nstate-of-the-art methods by a notable margin across benchmark datasets.\nNotably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 18.17%,\nand on Office-Home, it demonstrates a P@15 improvement of 3.83%.\n","authors":["Bin Li","Ye Shi","Qian Yu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18411v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18576v5","updated":"2024-09-26T08:57:49Z","published":"2023-11-30T14:15:39Z","title":"Fixed-length Dense Descriptor for Efficient Fingerprint Matching","summary":" In fingerprint matching, fixed-length descriptors generally offer greater\nefficiency compared to minutiae set, but the recognition accuracy is not as\ngood as that of the latter. Although much progress has been made in deep\nlearning based fixed-length descriptors recently, they often fall short when\ndealing with incomplete or partial fingerprints, diverse fingerprint poses, and\nsignificant background noise. In this paper, we propose a three-dimensional\nrepresentation called Fixed-length Dense Descriptor (FDD) for efficient\nfingerprint matching. FDD features great spatial properties, enabling it to\ncapture the spatial relationships of the original fingerprints, thereby\nenhancing interpretability and robustness. Our experiments on various\nfingerprint datasets reveal that FDD outperforms other fixed-length\ndescriptors, especially in matching fingerprints of different areas,\ncross-modal fingerprint matching, and fingerprint matching with background\nnoise.\n","authors":["Zhiyu Pan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v5.pdf","comment":"Accepted by WIFS 2024"},{"id":"http://arxiv.org/abs/2409.17649v1","updated":"2024-09-26T08:55:44Z","published":"2024-09-26T08:55:44Z","title":"Provable Performance Guarantees of Copy Detection Patterns","summary":" Copy Detection Patterns (CDPs) are crucial elements in modern security\napplications, playing a vital role in safeguarding industries such as food,\npharmaceuticals, and cosmetics. Current performance evaluations of CDPs\npredominantly rely on empirical setups using simplistic metrics like Hamming\ndistances or Pearson correlation. These methods are often inadequate due to\ntheir sensitivity to distortions, degradation, and their limitations to\nstationary statistics of printing and imaging. Additionally, machine\nlearning-based approaches suffer from distribution biases and fail to\ngeneralize to unseen counterfeit samples. Given the critical importance of CDPs\nin preventing counterfeiting, including the counterfeit vaccines issue\nhighlighted during the COVID-19 pandemic, there is an urgent need for provable\nperformance guarantees across various criteria. This paper aims to establish a\ntheoretical framework to derive optimal criteria for the analysis,\noptimization, and future development of CDP authentication technologies,\nensuring their reliability and effectiveness in diverse security scenarios.\n","authors":["Joakim Tutt","Slava Voloshynovskiy"],"pdf_url":"https://arxiv.org/pdf/2409.17649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17647v1","updated":"2024-09-26T08:51:29Z","published":"2024-09-26T08:51:29Z","title":"MECD: Unlocking Multi-Event Causal Discovery in Video Reasoning","summary":" Video causal reasoning aims to achieve a high-level understanding of video\ncontent from a causal perspective. However, current video reasoning tasks are\nlimited in scope, primarily executed in a question-answering paradigm and\nfocusing on short videos containing only a single event and simple causal\nrelationships, lacking comprehensive and structured causality analysis for\nvideos with multiple events. To fill this gap, we introduce a new task and\ndataset, Multi-Event Causal Discovery (MECD). It aims to uncover the causal\nrelationships between events distributed chronologically across long videos.\nGiven visual segments and textual descriptions of events, MECD requires\nidentifying the causal associations between these events to derive a\ncomprehensive, structured event-level video causal diagram explaining why and\nhow the final result event occurred. To address MECD, we devise a novel\nframework inspired by the Granger Causality method, using an efficient\nmask-based event prediction model to perform an Event Granger Test, which\nestimates causality by comparing the predicted result event when premise events\nare masked versus unmasked. Furthermore, we integrate causal inference\ntechniques such as front-door adjustment and counterfactual inference to\naddress challenges in MECD like causality confounding and illusory causality.\nExperiments validate the effectiveness of our framework in providing causal\nrelationships in multi-event videos, outperforming GPT-4o and VideoLLaVA by\n5.7% and 4.1%, respectively.\n","authors":["Tieyuan Chen","Huabin Liu","Tianyao He","Yihang Chen","Chaofan Gan","Xiao Ma","Cheng Zhong","Yang Zhang","Yingxue Wang","Hui Lin","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2409.17647v1.pdf","comment":"Accepted at NeurIPS 2024 as a spotlight paper"},{"id":"http://arxiv.org/abs/2409.15246v2","updated":"2024-09-26T08:48:03Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems play a crucial role in achieving Sustainable\nDevelopment Goals by collecting and analyzing vital global data through\nsatellite networks. These systems are essential for tasks like mapping,\ndisaster monitoring, and resource management, but they face challenges in\nprocessing and transmitting large volumes of EO data, especially in specialized\nfields such as agriculture and real-time disaster response. Domain-adapted\nLarge Language Models (LLMs) provide a promising solution by facilitating data\nfusion between extensive EO data and semantic EO data. By improving integration\nand interpretation of diverse datasets, LLMs address the challenges of\nprocessing specialized information in agriculture and disaster response\napplications. This fusion enhances the accuracy and relevance of transmitted\ndata. This paper presents a framework for semantic communication in EO\nsatellite networks, aimed at improving data transmission efficiency and overall\nsystem performance through cognitive processing techniques. The proposed system\nemploys Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic\nData Augmentation (SA) to focus on relevant information while minimizing\ncommunication overhead. By integrating cognitive semantic processing and\ninter-satellite links, the framework enhances the analysis and transmission of\nmultispectral satellite imagery, improving object detection, pattern\nrecognition, and real-time decision-making. The introduction of Cognitive\nSemantic Augmentation (CSA) allows satellites to process and transmit semantic\ninformation, boosting adaptability to changing environments and application\nneeds. This end-to-end architecture is tailored for next-generation satellite\nnetworks, such as those supporting 6G, and demonstrates significant\nimprovements in efficiency and accuracy.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v2.pdf","comment":"18 pages, 10 figures, magazine"},{"id":"http://arxiv.org/abs/2409.17634v1","updated":"2024-09-26T08:31:27Z","published":"2024-09-26T08:31:27Z","title":"P4Q: Learning to Prompt for Quantization in Visual-language Models","summary":" Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence\nin various visual and multimodal tasks, yet the deployment of VLMs on\ndownstream application platforms remains challenging due to their prohibitive\nrequirements of training samples and computing resources. Fine-tuning and\nquantization of VLMs can substantially reduce the sample and computation costs,\nwhich are in urgent need. There are two prevailing paradigms in quantization,\nQuantization-Aware Training (QAT) can effectively quantize large-scale VLMs but\nincur a huge training cost, while low-bit Post-Training Quantization (PTQ)\nsuffers from a notable performance drop. We propose a method that balances\nfine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which\nwe design a lightweight architecture to leverage contrastive loss supervision\nto enhance the recognition performance of a PTQ model. Our method can\neffectively reduce the gap between image features and text features caused by\nlow-bit quantization, based on learnable prompts to reorganize textual\nrepresentations and a low-bit adapter to realign the distributions of image and\ntext features. We also introduce a distillation loss based on cosine similarity\npredictions to distill the quantized model using a full-precision teacher.\nExtensive experimental results demonstrate that our P4Q method outperforms\nprior arts, even achieving comparable results to its full-precision\ncounterparts. For instance, our 8-bit P4Q can theoretically compress the\nCLIP-ViT/B-32 by 4 $\\times$ while achieving 66.94\\% Top-1 accuracy,\noutperforming the learnable prompt fine-tuned full-precision model by 2.24\\%\nwith negligible additional parameters on the ImageNet dataset.\n","authors":["Huixin Sun","Runqi Wang","Yanjing Li","Xianbin Cao","Xiaolong Jiang","Yao Hu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16723v2","updated":"2024-09-26T08:28:48Z","published":"2024-09-25T08:22:00Z","title":"EAGLE: Towards Efficient Arbitrary Referring Visual Prompts\n Comprehension for Multimodal Large Language Models","summary":" Recently, Multimodal Large Language Models (MLLMs) have sparked great\nresearch interests owing to their exceptional content-reasoning and\ninstruction-following capabilities. To effectively instruct an MLLM, in\naddition to conventional language expressions, the practice of referring to\nobjects by painting with brushes on images has emerged as a prevalent tool\n(referred to as \"referring visual prompts\") due to its efficacy in aligning the\nuser's intention with specific image regions. To accommodate the most common\nreferring visual prompts, namely points, boxes, and masks, existing approaches\ninitially utilize specialized feature encoding modules to capture the semantics\nof the highlighted areas indicated by these prompts. Subsequently, these\nencoded region features are adapted to MLLMs through fine-tuning on a\nmeticulously curated multimodal instruction dataset. However, such designs\nsuffer from redundancy in architecture. Moreover, they face challenges in\neffectively generalizing when encountering a diverse range of arbitrary\nreferring visual prompts in real-life scenarios. To address the above issues,\nwe propose EAGLE, a novel MLLM that empowers comprehension of arbitrary\nreferring visual prompts with less training efforts than existing approaches.\nSpecifically, our EAGLE maintains the innate format of the referring visual\nprompts as colored patches rendered on the given image for conducting the\ninstruction tuning. Our approach embeds referring visual prompts as spatial\nconcepts conveying specific spatial areas comprehensible to the MLLM, with the\nsemantic comprehension of these regions originating from the MLLM itself.\nBesides, we also propose a Geometry-Agnostic Learning paradigm (GAL) to further\ndisentangle the MLLM's region-level comprehension with the specific formats of\nreferring visual prompts. Extensive experiments are conducted to prove the\neffectiveness of our proposed method.\n","authors":["Jiacheng Zhang","Yang Jiao","Shaoxiang Chen","Jingjing Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.16723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17629v1","updated":"2024-09-26T08:23:04Z","published":"2024-09-26T08:23:04Z","title":"Hand-object reconstruction via interaction-aware graph attention\n mechanism","summary":" Estimating the poses of both a hand and an object has become an important\narea of research due to the growing need for advanced vision computing. The\nprimary challenge involves understanding and reconstructing how hands and\nobjects interact, such as contact and physical plausibility. Existing\napproaches often adopt a graph neural network to incorporate spatial\ninformation of hand and object meshes. However, these approaches have not fully\nexploited the potential of graphs without modification of edges within and\nbetween hand- and object-graphs. We propose a graph-based refinement method\nthat incorporates an interaction-aware graph-attention mechanism to account for\nhand-object interactions. Using edges, we establish connections among closely\ncorrelated nodes, both within individual graphs and across different graphs.\nExperiments demonstrate the effectiveness of our proposed method with notable\nimprovements in the realm of physical plausibility.\n","authors":["Taeyun Woo","Tae-Kyun Kim","Jinah Park"],"pdf_url":"https://arxiv.org/pdf/2409.17629v1.pdf","comment":"7 pages, Accepted by ICIP 2024"},{"id":"http://arxiv.org/abs/2405.17251v2","updated":"2024-09-26T08:22:52Z","published":"2024-05-27T15:07:04Z","title":"GenWarp: Single Image to Novel Views with Semantic-Preserving Generative\n Warping","summary":" Generating novel views from a single image remains a challenging task due to\nthe complexity of 3D scenes and the limited diversity in the existing\nmulti-view datasets to train a model on. Recent research combining large-scale\ntext-to-image (T2I) models with monocular depth estimation (MDE) has shown\npromise in handling in-the-wild images. In these methods, an input view is\ngeometrically warped to novel views with estimated depth maps, then the warped\nimage is inpainted by T2I models. However, they struggle with noisy depth maps\nand loss of semantic details when warping an input view to novel viewpoints. In\nthis paper, we propose a novel approach for single-shot novel view synthesis, a\nsemantic-preserving generative warping framework that enables T2I generative\nmodels to learn where to warp and where to generate, through augmenting\ncross-view attention with self-attention. Our approach addresses the\nlimitations of existing methods by conditioning the generative model on source\nview images and incorporating geometric warping signals. Qualitative and\nquantitative evaluations demonstrate that our model outperforms existing\nmethods in both in-domain and out-of-domain scenarios. Project page is\navailable at https://GenWarp-NVS.github.io/.\n","authors":["Junyoung Seo","Kazumi Fukuda","Takashi Shibuya","Takuya Narihira","Naoki Murata","Shoukang Hu","Chieh-Hsin Lai","Seungryong Kim","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2405.17251v2.pdf","comment":"Accepted to NeurIPS 2024 / Project page:\n https://GenWarp-NVS.github.io"},{"id":"http://arxiv.org/abs/2309.11531v2","updated":"2024-09-26T08:20:59Z","published":"2023-09-20T10:50:28Z","title":"EPTQ: Enhanced Post-Training Quantization via Hessian-guided\n Network-wise Optimization","summary":" Quantization is a key method for deploying deep neural networks on edge\ndevices with limited memory and computation resources. Recent improvements in\nPost-Training Quantization (PTQ) methods were achieved by an additional local\noptimization process for learning the weight quantization rounding policy.\nHowever, a gap exists when employing network-wise optimization with small\nrepresentative datasets. In this paper, we propose a new method for enhanced\nPTQ (EPTQ) that employs a network-wise quantization optimization process, which\nbenefits from considering cross-layer dependencies during optimization. EPTQ\nenables network-wise optimization with a small representative dataset using a\nnovel sample-layer attention score based on a label-free Hessian matrix upper\nbound. The label-free approach makes our method suitable for the PTQ scheme. We\ngive a theoretical analysis for the said bound and use it to construct a\nknowledge distillation loss that guides the optimization to focus on the more\nsensitive layers and samples. In addition, we leverage the Hessian upper bound\nto improve the weight quantization parameters selection by focusing on the more\nsensitive elements in the weight tensors. Empirically, by employing EPTQ we\nachieve state-of-the-art results on various models, tasks, and datasets,\nincluding ImageNet classification, COCO object detection, and Pascal-VOC for\nsemantic segmentation.\n","authors":["Ofir Gordon","Elad Cohen","Hai Victor Habi","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2309.11531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14220v2","updated":"2024-09-26T08:13:43Z","published":"2024-09-21T18:52:07Z","title":"Masks and Boxes: Combining the Best of Both Worlds for Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) involves identifying and consistently tracking\nobjects across video sequences. Traditional tracking-by-detection methods,\nwhile effective, often require extensive tuning and lack generalizability. On\nthe other hand, segmentation mask-based methods are more generic but struggle\nwith tracking management, making them unsuitable for MOT. We propose a novel\napproach, McByte, which incorporates a temporally propagated segmentation mask\nas a strong association cue within a tracking-by-detection framework. By\ncombining bounding box and mask information, McByte enhances robustness and\ngeneralizability without per-sequence tuning. Evaluated on four benchmark\ndatasets - DanceTrack, MOT17, SoccerNet-tracking 2022, and KITTI-tracking -\nMcByte demonstrates performance gain in all cases examined. At the same time,\nit outperforms existing mask-based methods. Implementation code will be\nprovided upon acceptance.\n","authors":["Tomasz Stanczyk","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2409.14220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13818v3","updated":"2024-09-26T08:07:16Z","published":"2024-08-25T12:22:50Z","title":"HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images\n Using Deep Learning","summary":" The current standard for detecting human epidermal growth factor receptor 2\n(HER2) status in breast cancer patients relies on HER2 amplification,\nidentified through fluorescence in situ hybridization (FISH) or\nimmunohistochemistry (IHC). However, hematoxylin and eosin (H\\&E) tumor stains\nare more widely available, and accurately predicting HER2 status using H\\&E\ncould reduce costs and expedite treatment selection. Deep Learning algorithms\nfor H&E have shown effectiveness in predicting various cancer features and\nclinical outcomes, including moderate success in HER2 status prediction. In\nthis work, we employed a customized weak supervision classification technique\ncombined with MoCo-v2 contrastive learning to predict HER2 status. We trained\nour pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The\nCancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale\nSchool of Medicine are publicly available. Our pipeline achieved an Area Under\nthe Curve (AUC) of 0.85 across four different test folds. Additionally, we\ntested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2\nscore of 2+ and included corresponding HER2 status and FISH test results. These\ncases are considered equivocal for IHC, requiring an expensive FISH test on\ntheir IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81\non these challenging H&E slides. Reducing the need for FISH test can have\nsignificant implications in cancer treatment equity for underserved\npopulations.\n","authors":["Ardhendu Sekhar","Vrinda Goel","Garima Jain","Abhijeet Patil","Ravi Kant Gupta","Tripti Bameta","Swapnil Rane","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2408.13818v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v1","updated":"2024-09-26T08:03:19Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic dataset that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06190v2","updated":"2024-09-26T07:56:50Z","published":"2024-08-12T14:40:38Z","title":"FruitNeRF: A Unified Neural Radiance Field based Fruit Counting\n Framework","summary":" We introduce FruitNeRF, a unified novel fruit counting framework that\nleverages state-of-the-art view synthesis methods to count any fruit type\ndirectly in 3D. Our framework takes an unordered set of posed images captured\nby a monocular camera and segments fruit in each image. To make our system\nindependent of the fruit type, we employ a foundation model that generates\nbinary segmentation masks for any fruit. Utilizing both modalities, RGB and\nsemantic, we train a semantic neural radiance field. Through uniform volume\nsampling of the implicit Fruit Field, we obtain fruit-only point clouds. By\napplying cascaded clustering on the extracted point cloud, our approach\nachieves precise fruit count.The use of neural radiance fields provides\nsignificant advantages over conventional methods such as object tracking or\noptical flow, as the counting itself is lifted into 3D. Our method prevents\ndouble counting fruit and avoids counting irrelevant fruit.We evaluate our\nmethodology using both real-world and synthetic datasets. The real-world\ndataset consists of three apple trees with manually counted ground truths, a\nbenchmark apple dataset with one row and ground truth fruit location, while the\nsynthetic dataset comprises various fruit types including apple, plum, lemon,\npear, peach, and mango.Additionally, we assess the performance of fruit\ncounting using the foundation model compared to a U-Net.\n","authors":["Lukas Meyer","Andreas Gilson","Ute Schmid","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2408.06190v2.pdf","comment":"Project Page: https://meyerls.github.io/fruit_nerf/"},{"id":"http://arxiv.org/abs/2409.17610v1","updated":"2024-09-26T07:55:57Z","published":"2024-09-26T07:55:57Z","title":"ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context\n Information in Multi-Turn Multimodal Medical Dialogue","summary":" The rocketing prosperity of large language models (LLMs) in recent years has\nboosted the prevalence of vision-language models (VLMs) in the medical sector.\nIn our online medical consultation scenario, a doctor responds to the texts and\nimages provided by a patient in multiple rounds to diagnose her/his health\ncondition, forming a multi-turn multimodal medical dialogue format. Unlike\nhigh-quality images captured by professional equipment in traditional medical\nvisual question answering (Med-VQA), the images in our case are taken by\npatients' mobile phones. These images have poor quality control, with issues\nsuch as excessive background elements and the lesion area being significantly\noff-center, leading to degradation of vision-language alignment in the model\ntraining phase. In this paper, we propose ZALM3, a Zero-shot strategy to\nimprove vision-language ALignment in Multi-turn Multimodal Medical dialogue.\nSince we observe that the preceding text conversations before an image can\ninfer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to\nsummarize the keywords from the preceding context and a visual grounding model\nto extract the RoIs. The updated images eliminate unnecessary background noise\nand provide more effective vision-language alignment. To better evaluate our\nproposed method, we design a new subjective assessment metric for multi-turn\nunimodal/multimodal medical dialogue to provide a fine-grained performance\ncomparison. Our experiments across three different clinical departments\nremarkably demonstrate the efficacy of ZALM3 with statistical significance.\n","authors":["Zhangpu Li","Changhong Zou","Suxue Ma","Zhicheng Yang","Chen Du","Youbao Tang","Zhenjie Cao","Ning Zhang","Jui-Hsin Lai","Ruei-Sung Lin","Yuan Ni","Xingzhi Sun","Jing Xiao","Kai Zhang","Mei Han"],"pdf_url":"https://arxiv.org/pdf/2409.17610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17608v1","updated":"2024-09-26T07:48:20Z","published":"2024-09-26T07:48:20Z","title":"Appearance Blur-driven AutoEncoder and Motion-guided Memory Module for\n Video Anomaly Detection","summary":" Video anomaly detection (VAD) often learns the distribution of normal samples\nand detects the anomaly through measuring significant deviations, but the\nundesired generalization may reconstruct a few anomalies thus suppressing the\ndeviations. Meanwhile, most VADs cannot cope with cross-dataset validation for\nnew target domains, and few-shot methods must laboriously rely on model-tuning\nfrom the target domain to complete domain adaptation. To address these\nproblems, we propose a novel VAD method with a motion-guided memory module to\nachieve cross-dataset validation with zero-shot. First, we add Gaussian blur to\nthe raw appearance images, thereby constructing the global pseudo-anomaly,\nwhich serves as the input to the network. Then, we propose multi-scale residual\nchannel attention to deblur the pseudo-anomaly in normal samples. Next, memory\nitems are obtained by recording the motion features in the training phase,\nwhich are used to retrieve the motion features from the raw information in the\ntesting phase. Lastly, our method can ignore the blurred real anomaly through\nattention and rely on motion memory items to increase the normality gap between\nnormal and abnormal motion. Extensive experiments on three benchmark datasets\ndemonstrate the effectiveness of the proposed method. Compared with\ncross-domain methods, our method achieves competitive performance without\nadaptation during testing.\n","authors":["Jiahao Lyu","Minghua Zhao","Jing Hu","Xuewen Huang","Shuangli Du","Cheng Shi","Zhiyong Lv"],"pdf_url":"https://arxiv.org/pdf/2409.17608v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.03944v2","updated":"2024-09-26T07:47:50Z","published":"2024-07-22T03:56:27Z","title":"Improving Fast Adversarial Training Paradigm: An Example Taxonomy\n Perspective","summary":" While adversarial training is an effective defense method against adversarial\nattacks, it notably increases the training cost. To this end, fast adversarial\ntraining (FAT) is presented for efficient training and has become a hot\nresearch topic. However, FAT suffers from catastrophic overfitting, which leads\nto a performance drop compared with multi-step adversarial training. However,\nthe cause of catastrophic overfitting remains unclear and lacks exploration. In\nthis paper, we present an example taxonomy in FAT, which identifies that\ncatastrophic overfitting is caused by the imbalance between the inner and outer\noptimization in FAT. Furthermore, we investigated the impact of varying degrees\nof training loss, revealing a correlation between training loss and\ncatastrophic overfitting. Based on these observations, we redesign the loss\nfunction in FAT with the proposed dynamic label relaxation to concentrate the\nloss range and reduce the impact of misclassified examples. Meanwhile, we\nintroduce batch momentum initialization to enhance the diversity to prevent\ncatastrophic overfitting in an efficient manner. Furthermore, we also propose\nCatastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate\ntraining strategy for examples based on their loss degree. Our proposed method,\nnamed example taxonomy aware FAT (ETA), establishes an improved paradigm for\nFAT. Experiment results demonstrate our ETA achieves state-of-the-art\nperformance. Comprehensive experiments on four standard datasets demonstrate\nthe competitiveness of our proposed method.\n","authors":["Jie Gui","Chengze Jiang","Minjing Dong","Kun Tong","Xinli Shi","Yuan Yan Tang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03944v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.17605v1","updated":"2024-09-26T07:43:12Z","published":"2024-09-26T07:43:12Z","title":"Good Data Is All Imitation Learning Needs","summary":" In this paper, we address the limitations of traditional teacher-student\nmodels, imitation learning, and behaviour cloning in the context of\nAutonomous/Automated Driving Systems (ADS), where these methods often struggle\nwith incomplete coverage of real-world scenarios. To enhance the robustness of\nsuch models, we introduce the use of Counterfactual Explanations (CFEs) as a\nnovel data augmentation technique for end-to-end ADS. CFEs, by generating\ntraining samples near decision boundaries through minimal input modifications,\nlead to a more comprehensive representation of expert driver strategies,\nparticularly in safety-critical scenarios. This approach can therefore help\nimprove the model's ability to handle rare and challenging driving events, such\nas anticipating darting out pedestrians, ultimately leading to safer and more\ntrustworthy decision-making for ADS. Our experiments in the CARLA simulator\ndemonstrate that CF-Driver outperforms the current state-of-the-art method,\nachieving a higher driving score and lower infraction rates. Specifically,\nCF-Driver attains a driving score of 84.2, surpassing the previous best model\nby 15.02 percentage points. These results highlight the effectiveness of\nincorporating CFEs in training end-to-end ADS. To foster further research, the\nCF-Driver code is made publicly available.\n","authors":["Amir Samadi","Konstantinos Koufos","Kurt Debattista","Mehrdad Dianati"],"pdf_url":"https://arxiv.org/pdf/2409.17605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17601v1","updated":"2024-09-26T07:35:23Z","published":"2024-09-26T07:35:23Z","title":"TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for\n Multimodal Contrastive Learning","summary":" Pre-trained large models for multimodal contrastive learning, such as CLIP,\nhave been widely recognized in the industry as highly susceptible to\ndata-poisoned backdoor attacks. This poses significant risks to downstream\nmodel training. In response to such potential threats, finetuning offers a\nsimpler and more efficient defense choice compared to retraining large models\nwith augmented data. In the supervised learning domain, fine-tuning defense\nstrategies can achieve excellent defense performance. However, in the\nunsupervised and semi-supervised domain, we find that when CLIP faces some\ncomplex attack techniques, the existing fine-tuning defense strategy,\nCleanCLIP, has some limitations on defense performance. The synonym\nsubstitution of its text-augmentation is insufficient to enhance the text\nfeature space. To compensate for this weakness, we improve it by proposing a\nfine-grained \\textbf{T}ext \\textbf{A}lignment \\textbf{C}leaner (TA-Cleaner) to\ncut off feature connections of backdoor triggers. We randomly select a few\nsamples for positive and negative subtext generation at each epoch of\nCleanCLIP, and align the subtexts to the images to strengthen the text\nself-supervision. We evaluate the effectiveness of our TA-Cleaner against six\nattack algorithms and conduct comprehensive zero-shot classification tests on\nImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves\nstate-of-the-art defensiveness among finetuning-based defense techniques. Even\nwhen faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms\nCleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\\% and 63.88\\%,\nrespectively.\n","authors":["Yuan Xun","Siyuan Liang","Xiaojun Jia","Xinwei Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17597v1","updated":"2024-09-26T07:24:09Z","published":"2024-09-26T07:24:09Z","title":"Unifying Dimensions: A Linear Adaptive Approach to Lightweight Image\n Super-Resolution","summary":" Window-based transformers have demonstrated outstanding performance in\nsuper-resolution tasks due to their adaptive modeling capabilities through\nlocal self-attention (SA). However, they exhibit higher computational\ncomplexity and inference latency than convolutional neural networks. In this\npaper, we first identify that the adaptability of the Transformers is derived\nfrom their adaptive spatial aggregation and advanced structural design, while\ntheir high latency results from the computational costs and memory layout\ntransformations associated with the local SA. To simulate this aggregation\napproach, we propose an effective convolution-based linear focal separable\nattention (FSA), allowing for long-range dynamic modeling with linear\ncomplexity. Additionally, we introduce an effective dual-branch structure\ncombined with an ultra-lightweight information exchange module (IEM) to enhance\nthe aggregation of information by the Token Mixer. Finally, with respect to the\nstructure, we modify the existing spatial-gate-based feedforward neural\nnetworks by incorporating a self-gate mechanism to preserve high-dimensional\nchannel information, enabling the modeling of more complex relationships. With\nthese advancements, we construct a convolution-based Transformer framework\nnamed the linear adaptive mixer network (LAMNet). Extensive experiments\ndemonstrate that LAMNet achieves better performance than existing SA-based\nTransformer methods while maintaining the computational efficiency of\nconvolutional neural networks, which can achieve a \\(3\\times\\) speedup of\ninference time. The code will be publicly available at:\nhttps://github.com/zononhzy/LAMNet.\n","authors":["Zhenyu Hu","Wanjie Sun"],"pdf_url":"https://arxiv.org/pdf/2409.17597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17937v3","updated":"2024-09-26T07:23:49Z","published":"2024-03-26T17:59:58Z","title":"Efficient Video Object Segmentation via Modulated Cross-Attention Memory","summary":" Recently, transformer-based approaches have shown promising results for\nsemi-supervised video object segmentation. However, these approaches typically\nstruggle on long videos due to increased GPU memory demands, as they frequently\nexpand the memory bank every few frames. We propose a transformer-based\napproach, named MAVOS, that introduces an optimized and dynamic long-term\nmodulated cross-attention (MCA) memory to model temporal smoothness without\nrequiring frequent memory expansion. The proposed MCA effectively encodes both\nlocal and global features at various levels of granularity while efficiently\nmaintaining consistent speed regardless of the video length. Extensive\nexperiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017,\ndemonstrate the effectiveness of our proposed contributions leading to\nreal-time inference and markedly reduced memory demands without any degradation\nin segmentation accuracy on long videos. Compared to the best existing\ntransformer-based approach, our MAVOS increases the speed by 7.6x, while\nsignificantly reducing the GPU memory by 87% with comparable segmentation\nperformance on short and long video datasets. Notably on the LVOS dataset, our\nMAVOS achieves a J&F score of 63.3% while operating at 37 frames per second\n(FPS) on a single V100 GPU. Our code and models will be publicly available at:\nhttps://github.com/Amshaker/MAVOS.\n","authors":["Abdelrahman Shaker","Syed Talal Wasim","Martin Danelljan","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17937v3.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2409.17589v1","updated":"2024-09-26T07:12:04Z","published":"2024-09-26T07:12:04Z","title":"Improving Fast Adversarial Training via Self-Knowledge Guidance","summary":" Adversarial training has achieved remarkable advancements in defending\nagainst adversarial attacks. Among them, fast adversarial training (FAT) is\ngaining attention for its ability to achieve competitive robustness with fewer\ncomputing resources. Existing FAT methods typically employ a uniform strategy\nthat optimizes all training data equally without considering the influence of\ndifferent examples, which leads to an imbalanced optimization. However, this\nimbalance remains unexplored in the field of FAT. In this paper, we conduct a\ncomprehensive study of the imbalance issue in FAT and observe an obvious class\ndisparity regarding their performances. This disparity could be embodied from a\nperspective of alignment between clean and robust accuracy. Based on the\nanalysis, we mainly attribute the observed misalignment and disparity to the\nimbalanced optimization in FAT, which motivates us to optimize different\ntraining data adaptively to enhance robustness. Specifically, we take disparity\nand misalignment into consideration. First, we introduce self-knowledge guided\nregularization, which assigns differentiated regularization weights to each\nclass based on its training state, alleviating class disparity. Additionally,\nwe propose self-knowledge guided label relaxation, which adjusts label\nrelaxation according to the training accuracy, alleviating the misalignment and\nimproving robustness. By combining these methods, we formulate the\nSelf-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge\nduring training to enhance the adversarial robustness without compromising\ntraining efficiency. Extensive experiments on four standard datasets\ndemonstrate that the SKG-FAT improves the robustness and preserves competitive\nclean accuracy, outperforming the state-of-the-art methods.\n","authors":["Chengze Jiang","Junkai Wang","Minjing Dong","Jie Gui","Xinli Shi","Yuan Cao","Yuan Yan Tang","James Tin-Yau Kwok"],"pdf_url":"https://arxiv.org/pdf/2409.17589v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2409.17576v1","updated":"2024-09-26T06:46:40Z","published":"2024-09-26T06:46:40Z","title":"ID$^3$: Identity-Preserving-yet-Diversified Diffusion Models for\n Synthetic Face Recognition","summary":" Synthetic face recognition (SFR) aims to generate synthetic face datasets\nthat mimic the distribution of real face data, which allows for training face\nrecognition models in a privacy-preserving manner. Despite the remarkable\npotential of diffusion models in image generation, current diffusion-based SFR\nmodels struggle with generalization to real-world faces. To address this\nlimitation, we outline three key objectives for SFR: (1) promoting diversity\nacross identities (inter-class diversity), (2) ensuring diversity within each\nidentity by injecting various facial attributes (intra-class diversity), and\n(3) maintaining identity consistency within each identity group (intra-class\nidentity preservation). Inspired by these goals, we introduce a\ndiffusion-fueled SFR model termed $\\text{ID}^3$. $\\text{ID}^3$ employs an\nID-preserving loss to generate diverse yet identity-consistent facial\nappearances. Theoretically, we show that minimizing this loss is equivalent to\nmaximizing the lower bound of an adjusted conditional log-likelihood over\nID-preserving data. This equivalence motivates an ID-preserving sampling\nalgorithm, which operates over an adjusted gradient vector field, enabling the\ngeneration of fake face recognition datasets that approximate the distribution\nof real-world faces. Extensive experiments across five challenging benchmarks\nvalidate the advantages of $\\text{ID}^3$.\n","authors":["Shen Li","Jianqing Xu","Jiaying Wu","Miao Xiong","Ailin Deng","Jiazhen Ji","Yuge Huang","Wenjie Feng","Shouhong Ding","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2409.17576v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.12598v2","updated":"2024-09-26T06:31:25Z","published":"2024-08-22T17:59:01Z","title":"ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor\n Reconstruction","summary":" Neural implicit reconstruction via volume rendering has demonstrated its\neffectiveness in recovering dense 3D surfaces. However, it is non-trivial to\nsimultaneously recover meticulous geometry and preserve smoothness across\nregions with differing characteristics. To address this issue, previous methods\ntypically employ geometric priors, which are often constrained by the\nperformance of the prior models. In this paper, we propose ND-SDF, which learns\na Normal Deflection field to represent the angular deviation between the scene\nnormal and the prior normal. Unlike previous methods that uniformly apply\ngeometric priors on all samples, introducing significant bias in accuracy, our\nproposed normal deflection field dynamically learns and adapts the utilization\nof samples based on their specific characteristics, thereby improving both the\naccuracy and effectiveness of the model. Our method not only obtains smooth\nweakly textured regions such as walls and floors but also preserves the\ngeometric details of complex structures. In addition, we introduce a novel ray\nsampling strategy based on the deflection angle to facilitate the unbiased\nrendering process, which significantly improves the quality and accuracy of\nintricate surfaces, especially on thin structures. Consistent improvements on\nvarious challenging datasets demonstrate the superiority of our method.\n","authors":["Ziyu Tang","Weicai Ye","Yifan Wang","Di Huang","Hujun Bao","Tong He","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17566v1","updated":"2024-09-26T06:28:05Z","published":"2024-09-26T06:28:05Z","title":"Flexiffusion: Segment-wise Neural Architecture Search for Flexible\n Denoising Schedule","summary":" Diffusion models are cutting-edge generative models adept at producing\ndiverse, high-quality images. Despite their effectiveness, these models often\nrequire significant computational resources owing to their numerous sequential\ndenoising steps and the significant inference cost of each step. Recently,\nNeural Architecture Search (NAS) techniques have been employed to automatically\nsearch for faster generation processes. However, NAS for diffusion is\ninherently time-consuming as it requires estimating thousands of diffusion\nmodels to search for the optimal one. In this paper, we introduce Flexiffusion,\na novel training-free NAS paradigm designed to accelerate diffusion models by\nconcurrently optimizing generation steps and network structures. Specifically,\nwe partition the generation process into isometric step segments, each\nsequentially composed of a full step, multiple partial steps, and several null\nsteps. The full step computes all network blocks, while the partial step\ninvolves part of the blocks, and the null step entails no computation.\nFlexiffusion autonomously explores flexible step combinations for each segment,\nsubstantially reducing search costs and enabling greater acceleration compared\nto the state-of-the-art (SOTA) method for diffusion models. Our searched models\nreported speedup factors of $2.6\\times$ and $1.5\\times$ for the original\nLDM-4-G and the SOTA, respectively. The factors for Stable Diffusion V1.5 and\nthe SOTA are $5.1\\times$ and $2.0\\times$. We also verified the performance of\nFlexiffusion on multiple datasets, and positive experiment results indicate\nthat Flexiffusion can effectively reduce redundancy in diffusion models.\n","authors":["Hongtao Huang","Xiaojun Chang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.17566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17564v1","updated":"2024-09-26T06:27:15Z","published":"2024-09-26T06:27:15Z","title":"General Compression Framework for Efficient Transformer Object Tracking","summary":" Transformer-based trackers have established a dominant role in the field of\nvisual object tracking. While these trackers exhibit promising performance,\ntheir deployment on resource-constrained devices remains challenging due to\ninefficiencies. To improve the inference efficiency and reduce the computation\ncost, prior approaches have aimed to either design lightweight trackers or\ndistill knowledge from larger teacher models into more compact student\ntrackers. However, these solutions often sacrifice accuracy for speed. Thus, we\npropose a general model compression framework for efficient transformer object\ntracking, named CompressTracker, to reduce the size of a pre-trained tracking\nmodel into a lightweight tracker with minimal performance degradation. Our\napproach features a novel stage division strategy that segments the transformer\nlayers of the teacher model into distinct stages, enabling the student model to\nemulate each corresponding teacher stage more effectively. Additionally, we\nalso design a unique replacement training technique that involves randomly\nsubstituting specific stages in the student model with those from the teacher\nmodel, as opposed to training the student model in isolation. Replacement\ntraining enhances the student model's ability to replicate the teacher model's\nbehavior. To further forcing student model to emulate teacher model, we\nincorporate prediction guidance and stage-wise feature mimicking to provide\nadditional supervision during the teacher model's compression process. Our\nframework CompressTracker is structurally agnostic, making it compatible with\nany transformer architecture. We conduct a series of experiment to verify the\neffectiveness and generalizability of CompressTracker. Our CompressTracker-4\nwith 4 transformer layers, which is compressed from OSTrack, retains about 96%\nperformance on LaSOT (66.1% AUC) while achieves 2.17x speed up.\n","authors":["Lingyi Hong","Jinglun Li","Xinyu Zhou","Shilin Yan","Pinxue Guo","Kaixun Jiang","Zhaoyu Chen","Shuyong Gao","Wei Zhang","Hong Lu","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17560v1","updated":"2024-09-26T06:12:08Z","published":"2024-09-26T06:12:08Z","title":"Dynamic Subframe Splitting and Spatio-Temporal Motion Entangled Sparse\n Attention for RGB-E Tracking","summary":" Event-based bionic camera asynchronously captures dynamic scenes with high\ntemporal resolution and high dynamic range, offering potential for the\nintegration of events and RGB under conditions of illumination degradation and\nfast motion. Existing RGB-E tracking methods model event characteristics\nutilising attention mechanism of Transformer before integrating both\nmodalities. Nevertheless, these methods involve aggregating the event stream\ninto a single event frame, lacking the utilisation of the temporal information\ninherent in the event stream.Moreover, the traditional attention mechanism is\nwell-suited for dense semantic features, while the attention mechanism for\nsparse event features require revolution. In this paper, we propose a dynamic\nevent subframe splitting strategy to split the event stream into more\nfine-grained event clusters, aiming to capture spatio-temporal features that\ncontain motion cues. Based on this, we design an event-based sparse attention\nmechanism to enhance the interaction of event features in temporal and spatial\ndimensions. The experimental results indicate that our method outperforms\nexisting state-of-the-art methods on the FE240 and COESOT datasets, providing\nan effective processing manner for the event data.\n","authors":["Pengcheng Shao","Tianyang Xu","Xuefeng Zhu","Xiaojun Wu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2409.17560v1.pdf","comment":"15 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2409.16225v2","updated":"2024-09-26T06:04:21Z","published":"2024-09-24T16:38:41Z","title":"VideoPatchCore: An Effective Method to Memorize Normality for Video\n Anomaly Detection","summary":" Video anomaly detection (VAD) is a crucial task in video analysis and\nsurveillance within computer vision. Currently, VAD is gaining attention with\nmemory techniques that store the features of normal frames. The stored features\nare utilized for frame reconstruction, identifying an abnormality when a\nsignificant difference exists between the reconstructed and input frames.\nHowever, this approach faces several challenges due to the simultaneous\noptimization required for both the memory and encoder-decoder model. These\nchallenges include increased optimization difficulty, complexity of\nimplementation, and performance variability depending on the memory size. To\naddress these challenges,we propose an effective memory method for VAD, called\nVideoPatchCore. Inspired by PatchCore, our approach introduces a structure that\nprioritizes memory optimization and configures three types of memory tailored\nto the characteristics of video data. This method effectively addresses the\nlimitations of existing memory-based methods, achieving good performance\ncomparable to state-of-the-art methods. Furthermore, our method requires no\ntraining and is straightforward to implement, making VAD tasks more accessible.\nOur code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore.\n","authors":["Sunghyun Ahn","Youngwan Jo","Kijung Lee","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2409.16225v2.pdf","comment":"Accepted to ACCV 2024"},{"id":"http://arxiv.org/abs/2409.17555v1","updated":"2024-09-26T05:57:35Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code will be\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v1.pdf","comment":"Accepted to NeurIPS 2024. The source code will be available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2406.06911v3","updated":"2024-09-26T05:47:36Z","published":"2024-06-11T03:09:37Z","title":"AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising","summary":" Diffusion models have garnered significant interest from the community for\ntheir great generative ability across various applications. However, their\ntypical multi-step sequential-denoising nature gives rise to high cumulative\nlatency, thereby precluding the possibilities of parallel computation. To\naddress this, we introduce AsyncDiff, a universal and plug-and-play\nacceleration scheme that enables model parallelism across multiple devices. Our\napproach divides the cumbersome noise prediction model into multiple\ncomponents, assigning each to a different device. To break the dependency chain\nbetween these components, it transforms the conventional sequential denoising\ninto an asynchronous process by exploiting the high similarity between hidden\nstates in consecutive diffusion steps. Consequently, each component is\nfacilitated to compute in parallel on separate devices. The proposed strategy\nsignificantly reduces inference latency while minimally impacting the\ngenerative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff\nachieves a 2.7x speedup with negligible degradation and a 4.0x speedup with\nonly a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our\nexperiments also demonstrate that AsyncDiff can be readily applied to video\ndiffusion models with encouraging performances. The code is available at\nhttps://github.com/czg1225/AsyncDiff.\n","authors":["Zigeng Chen","Xinyin Ma","Gongfan Fang","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06911v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2312.05284v4","updated":"2024-09-26T05:41:56Z","published":"2023-12-08T12:48:53Z","title":"SlimSAM: 0.1% Data Makes Segment Anything Slim","summary":" Current approaches for compressing the Segment Anything Model (SAM) yield\ncommendable results, yet necessitate extensive data to train a new network from\nscratch. Employing conventional pruning techniques can remarkably reduce data\nrequirements but would suffer from a degradation in performance. To address\nthis challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM\ncompression method that achieves superior performance with extremely less\ntraining data. The essence of SlimSAM is encapsulated in the alternate slimming\nframework which effectively enhances knowledge inheritance under severely\nlimited training data availability and exceptional pruning ratio. Diverging\nfrom prior techniques, our framework progressively compresses the model by\nalternately pruning and distilling distinct, decoupled sub-structures.\nDisturbed Taylor pruning is also proposed to address the misalignment between\nthe pruning objective and training target, thereby boosting the\npost-distillation after pruning. SlimSAM yields significant performance\nimprovements while demanding over 10 times less training data than any other\nexisting compression methods. Even when compared to the original SAM, SlimSAM\nachieves approaching performance while reducing parameter counts to merely 1.4%\n(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training\ndata. The code is available at http://github.com/czg1225/SlimSAM.\n","authors":["Zigeng Chen","Gongfan Fang","Xinyin Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.05284v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17547v1","updated":"2024-09-26T05:33:30Z","published":"2024-09-26T05:33:30Z","title":"Triple Point Masking","summary":" Existing 3D mask learning methods encounter performance bottlenecks under\nlimited data, and our objective is to overcome this limitation. In this paper,\nwe introduce a triple point masking scheme, named TPM, which serves as a\nscalable framework for pre-training of masked autoencoders to achieve\nmulti-mask learning for 3D point clouds. Specifically, we augment the baselines\nwith two additional mask choices (i.e., medium mask and low mask) as our core\ninsight is that the recovery process of an object can manifest in diverse ways.\nPrevious high-masking schemes focus on capturing the global representation but\nlack the fine-grained recovery capability, so that the generated pre-trained\nweights tend to play a limited role in the fine-tuning process. With the\nsupport of the proposed TPM, available methods can exhibit more flexible and\naccurate completion capabilities, enabling the potential autoencoder in the\npre-training stage to consider multiple representations of a single 3D object.\nIn addition, an SVM-guided weight selection module is proposed to fill the\nencoder parameters for downstream networks with the optimal weight during the\nfine-tuning stage, maximizing linear accuracy and facilitating the acquisition\nof intricate representations for new objects. Extensive experiments show that\nthe four baselines equipped with the proposed TPM achieve comprehensive\nperformance improvements on various downstream tasks.\n","authors":["Jiaming Liu","Linghe Kong","Yue Wu","Maoguo Gong","Hao Li","Qiguang Miao","Wenping Ma","Can Qin"],"pdf_url":"https://arxiv.org/pdf/2409.17547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05769v2","updated":"2024-09-26T05:10:23Z","published":"2024-05-09T13:45:04Z","title":"Exploring Text-Guided Single Image Editing for Remote Sensing Images","summary":" Artificial intelligence generative content (AIGC) has significantly impacted\nimage generation in the field of remote sensing. However, the equally important\narea of remote sensing image (RSI) editing has not received sufficient\nattention. Deep learning based editing methods generally involve two sequential\nstages: generation and editing. During the generation stage, consistency in\ncontent and details between the original and edited images must be maintained,\nwhile in the editing stage, controllability and accuracy of the edits should be\nensured. For natural images, these challenges can be tackled by training\ngenerative backbones on large-scale benchmark datasets and using text guidance\nbased on vision-language models (VLMs). However, these previously effective\napproaches become less viable for RSIs due to two reasons: First, existing\ngenerative RSI benchmark datasets do not fully capture the diversity of remote\nsensing scenarios, particularly in terms of variations in sensors, object\ntypes, and resolutions. Consequently, the generalization capacity of the\ntrained backbone model is often inadequate for universal editing tasks on RSIs.\nSecond, the large spatial resolution of RSIs exacerbates the problem in VLMs\nwhere a single text semantic corresponds to multiple image semantics, leading\nto the introduction of incorrect semantics when using text to guide RSI\nediting. To solve above problems, this paper proposes a text-guided RSI editing\nmethod that is controllable but stable, and can be trained using only a single\nimage. It adopts a multi-scale training approach to preserve consistency\nwithout the need for training on extensive benchmark datasets, while leveraging\nRSI pre-trained VLMs and prompt ensembling (PE) to ensure accuracy and\ncontrollability in the text-guided editing process.\n","authors":["Fangzhou Han","Lingyu Si","Hongwei Dong","Lamei Zhang","Hao Chen","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2405.05769v2.pdf","comment":"14 pages, 14 figures, submitted to IEEE Transactions on Geoscience\n and Remote Sensing"},{"id":"http://arxiv.org/abs/2409.17533v1","updated":"2024-09-26T04:40:38Z","published":"2024-09-26T04:40:38Z","title":"CAMOT: Camera Angle-aware Multi-Object Tracking","summary":" This paper proposes CAMOT, a simple camera angle estimator for multi-object\ntracking to tackle two problems: 1) occlusion and 2) inaccurate distance\nestimation in the depth direction. Under the assumption that multiple objects\nare located on a flat plane in each video frame, CAMOT estimates the camera\nangle using object detection. In addition, it gives the depth of each object,\nenabling pseudo-3D MOT. We evaluated its performance by adding it to various 2D\nMOT methods on the MOT17 and MOT20 datasets and confirmed its effectiveness.\nApplying CAMOT to ByteTrack, we obtained 63.8% HOTA, 80.6% MOTA, and 78.5% IDF1\nin MOT17, which are state-of-the-art results. Its computational cost is\nsignificantly lower than the existing deep-learning-based depth estimators for\ntracking.\n","authors":["Felix Limanta","Kuniaki Uto","Koichi Shinoda"],"pdf_url":"https://arxiv.org/pdf/2409.17533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17531v1","updated":"2024-09-26T04:36:19Z","published":"2024-09-26T04:36:19Z","title":"SimVG: A Simple Framework for Visual Grounding with Decoupled\n Multi-modal Fusion","summary":" Visual grounding is a common vision task that involves grounding descriptive\nsentences to the corresponding regions of an image. Most existing methods use\nindependent image-text encoding and apply complex hand-crafted modules or\nencoder-decoder architectures for modal interaction and query reasoning.\nHowever, their performance significantly drops when dealing with complex\ntextual expressions. This is because the former paradigm only utilizes limited\ndownstream data to fit the multi-modal feature fusion. Therefore, it is only\neffective when the textual expressions are relatively simple. In contrast,\ngiven the wide diversity of textual expressions and the uniqueness of\ndownstream training data, the existing fusion module, which extracts multimodal\ncontent from a visual-linguistic context, has not been fully investigated. In\nthis paper, we present a simple yet robust transformer-based framework, SimVG,\nfor visual grounding. Specifically, we decouple visual-linguistic feature\nfusion from downstream tasks by leveraging existing multimodal pre-trained\nmodels and incorporating additional object tokens to facilitate deep\nintegration of downstream and pre-training tasks. Furthermore, we design a\ndynamic weight-balance distillation method in the multi-branch synchronous\nlearning process to enhance the representation capability of the simpler\nbranch. This branch only consists of a lightweight MLP, which simplifies the\nstructure and improves reasoning speed. Experiments on six widely used VG\ndatasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the\nsuperiority of SimVG. Finally, the proposed method not only achieves\nimprovements in efficiency and convergence speed but also attains new\nstate-of-the-art performance on these benchmarks. Codes and models will be\navailable at \\url{https://github.com/Dmmm1997/SimVG}.\n","authors":["Ming Dai","Lingfeng Yang","Yihao Xu","Zhenhua Feng","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17531v1.pdf","comment":"21pages, 11figures, NeurIPS2024"},{"id":"http://arxiv.org/abs/2409.17526v1","updated":"2024-09-26T04:27:44Z","published":"2024-09-26T04:27:44Z","title":"Drone Stereo Vision for Radiata Pine Branch Detection and Distance\n Measurement: Integrating SGBM and Segmentation Models","summary":" Manual pruning of radiata pine trees presents significant safety risks due to\ntheir substantial height and the challenging terrains in which they thrive. To\naddress these risks, this research proposes the development of a drone-based\npruning system equipped with specialized pruning tools and a stereo vision\ncamera, enabling precise detection and trimming of branches. Deep learning\nalgorithms, including YOLO and Mask R-CNN, are employed to ensure accurate\nbranch detection, while the Semi-Global Matching algorithm is integrated to\nprovide reliable distance estimation. The synergy between these techniques\nfacilitates the precise identification of branch locations and enables\nefficient, targeted pruning. Experimental results demonstrate that the combined\nimplementation of YOLO and SGBM enables the drone to accurately detect branches\nand measure their distances from the drone. This research not only improves the\nsafety and efficiency of pruning operations but also makes a significant\ncontribution to the advancement of drone technology in the automation of\nagricultural and forestry practices, laying a foundational framework for\nfurther innovations in environmental management.\n","authors":["Yida Lin","Bing Xue","Mengjie Zhang","Sam Schofield","Richard Green"],"pdf_url":"https://arxiv.org/pdf/2409.17526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17524v1","updated":"2024-09-26T04:23:17Z","published":"2024-09-26T04:23:17Z","title":"JoyType: A Robust Design for Multilingual Visual Text Creation","summary":" Generating images with accurately represented text, especially in non-Latin\nlanguages, poses a significant challenge for diffusion models. Existing\napproaches, such as the integration of hint condition diagrams via auxiliary\nnetworks (e.g., ControlNet), have made strides towards addressing this issue.\nHowever, diffusion models often fall short in tasks requiring controlled text\ngeneration, such as specifying particular fonts or producing text in small\nfonts. In this paper, we introduce a novel approach for multilingual visual\ntext creation, named JoyType, designed to maintain the font style of text\nduring the image generation process. Our methodology begins with assembling a\ntraining dataset, JoyType-1M, comprising 1 million pairs of data. Each pair\nincludes an image, its description, and glyph instructions corresponding to the\nfont style within the image. We then developed a text control network, Font\nControlNet, tasked with extracting font style information to steer the image\ngeneration. To further enhance our model's ability to maintain font style,\nnotably in generating small-font text, we incorporated a multi-layer OCR-aware\nloss into the diffusion process. This enhancement allows JoyType to direct text\nrendering using low-level descriptors. Our evaluations, based on both visual\nand accuracy metrics, demonstrate that JoyType significantly outperforms\nexisting state-of-the-art methods. Additionally, JoyType can function as a\nplugin, facilitating the creation of varied image styles in conjunction with\nother stable diffusion models on HuggingFace and CivitAI. Our project is\nopen-sourced on https://jdh-algo.github.io/JoyType/.\n","authors":["Chao Li","Chen Jiang","Xiaolong Liu","Jun Zhao","Guoxin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17524v1.pdf","comment":"Under Review at AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17523v1","updated":"2024-09-26T04:17:27Z","published":"2024-09-26T04:17:27Z","title":"EAGLE: Egocentric AGgregated Language-video Engine","summary":" The rapid evolution of egocentric video analysis brings new insights into\nunderstanding human activities and intentions from a first-person perspective.\nDespite this progress, the fragmentation in tasks like action recognition,\nprocedure learning, and moment retrieval, \\etc, coupled with inconsistent\nannotations and isolated model development, hinders a holistic interpretation\nof video content. In response, we introduce the EAGLE (Egocentric AGgregated\nLanguage-video Engine) model and the EAGLE-400K dataset to provide a unified\nframework that integrates various egocentric video understanding tasks.\nEAGLE-400K, the \\textit{first} large-scale instruction-tuning dataset tailored\nfor egocentric video, features 400K diverse samples to enhance a broad spectrum\nof tasks from activity recognition to procedure knowledge learning. Moreover,\nEAGLE, a strong video multimodal large language model (MLLM), is designed to\neffectively capture both spatial and temporal information. In addition, we\npropose a set of evaluation metrics designed to facilitate a thorough\nassessment of MLLM for egocentric video understanding. Our extensive\nexperiments demonstrate EAGLE's superior performance over existing models,\nhighlighting its ability to balance task-specific understanding with holistic\nvideo interpretation. With EAGLE, we aim to pave the way for research\nopportunities and practical applications in real-world scenarios.\n","authors":["Jing Bi","Yunlong Tang","Luchuan Song","Ali Vosoughi","Nguyen Nguyen","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17523v1.pdf","comment":"Accepted by ACMMM 24"},{"id":"http://arxiv.org/abs/2409.17519v1","updated":"2024-09-26T04:02:20Z","published":"2024-09-26T04:02:20Z","title":"Robotic Environmental State Recognition with Pre-Trained Vision-Language\n Models and Black-Box Optimization","summary":" In order for robots to autonomously navigate and operate in diverse\nenvironments, it is essential for them to recognize the state of their\nenvironment. On the other hand, the environmental state recognition has\ntraditionally involved distinct methods tailored to each state to be\nrecognized. In this study, we perform a unified environmental state recognition\nfor robots through the spoken language with pre-trained large-scale\nvision-language models. We apply Visual Question Answering and Image-to-Text\nRetrieval, which are tasks of Vision-Language Models. We show that with our\nmethod, it is possible to recognize not only whether a room door is\nopen/closed, but also whether a transparent door is open/closed and whether\nwater is running in a sink, without training neural networks or manual\nprogramming. In addition, the recognition accuracy can be improved by selecting\nappropriate texts from the set of prepared texts based on black-box\noptimization. For each state recognition, only the text set and its weighting\nneed to be changed, eliminating the need to prepare multiple different models\nand programs, and facilitating the management of source code and computer\nresource. We experimentally demonstrate the effectiveness of our method and\napply it to the recognition behavior on a mobile robot, Fetch.\n","authors":["Kento Kawaharazuka","Yoshiki Obinata","Naoaki Kanazawa","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2409.17519v1.pdf","comment":"Accepted at Advanced Robotics, website -\n https://haraduka.github.io/vlm-bbo/"},{"id":"http://arxiv.org/abs/2404.05705v2","updated":"2024-09-26T03:58:11Z","published":"2024-04-08T17:42:08Z","title":"Learning 3D-Aware GANs from Unposed Images with Template Feature Field","summary":" Collecting accurate camera poses of training images has been shown to well\nserve the learning of 3D-aware generative adversarial networks (GANs) yet can\nbe quite expensive in practice. This work targets learning 3D-aware GANs from\nunposed images, for which we propose to perform on-the-fly pose estimation of\ntraining images with a learned template feature field (TeFF). Concretely, in\naddition to a generative radiance field as in previous approaches, we ask the\ngenerator to also learn a field from 2D semantic features while sharing the\ndensity from the radiance field. Such a framework allows us to acquire a\ncanonical 3D feature template leveraging the dataset mean discovered by the\ngenerative model, and further efficiently estimate the pose parameters on real\ndata. Experimental results on various challenging datasets demonstrate the\nsuperiority of our approach over state-of-the-art alternatives from both the\nqualitative and the quantitative perspectives.\n","authors":["Xinya Chen","Hanlei Guo","Yanrui Bin","Shangzhan Zhang","Yuanbo Yang","Yue Wang","Yujun Shen","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2404.05705v2.pdf","comment":"https://XDimlab.github.io/TeFF"},{"id":"http://arxiv.org/abs/2409.17512v1","updated":"2024-09-26T03:47:34Z","published":"2024-09-26T03:47:34Z","title":"SCOMatch: Alleviating Overtrusting in Open-set Semi-supervised Learning","summary":" Open-set semi-supervised learning (OSSL) leverages practical open-set\nunlabeled data, comprising both in-distribution (ID) samples from seen classes\nand out-of-distribution (OOD) samples from unseen classes, for semi-supervised\nlearning (SSL). Prior OSSL methods initially learned the decision boundary\nbetween ID and OOD with labeled ID data, subsequently employing self-training\nto refine this boundary. These methods, however, suffer from the tendency to\novertrust the labeled ID data: the scarcity of labeled data caused the\ndistribution bias between the labeled samples and the entire ID data, which\nmisleads the decision boundary to overfit. The subsequent self-training\nprocess, based on the overfitted result, fails to rectify this problem. In this\npaper, we address the overtrusting issue by treating OOD samples as an\nadditional class, forming a new SSL process.\n Specifically, we propose SCOMatch, a novel OSSL method that 1) selects\nreliable OOD samples as new labeled data with an OOD memory queue and a\ncorresponding update strategy and 2) integrates the new SSL process into the\noriginal task through our Simultaneous Close-set and Open-set self-training.\nSCOMatch refines the decision boundary of ID and OOD classes across the entire\ndataset, thereby leading to improved results. Extensive experimental results\nshow that SCOMatch significantly outperforms the state-of-the-art methods on\nvarious benchmarks. The effectiveness is further verified through ablation\nstudies and visualization.\n","authors":["Zerun Wang","Liuyu Xiang","Lang Huang","Jiafeng Mao","Ling Xiao","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2409.17512v1.pdf","comment":"ECCV 2024 accepted"},{"id":"http://arxiv.org/abs/2409.17510v1","updated":"2024-09-26T03:40:12Z","published":"2024-09-26T03:40:12Z","title":"NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human\n Connectomes","summary":" Although modern imaging technologies allow us to study connectivity between\ntwo distinct brain regions in-vivo, an in-depth understanding of how anatomical\nstructure supports brain function and how spontaneous functional fluctuations\nemerge remarkable cognition is still elusive. Meanwhile, tremendous efforts\nhave been made in the realm of machine learning to establish the nonlinear\nmapping between neuroimaging data and phenotypic traits. However, the absence\nof neuroscience insight in the current approaches poses significant challenges\nin understanding cognitive behavior from transient neural activities. To\naddress this challenge, we put the spotlight on the coupling mechanism of\nstructural connectivity (SC) and functional connectivity (FC) by formulating\nsuch network neuroscience question into an expressive graph representation\nlearning problem for high-order topology. Specifically, we introduce the\nconcept of topological detour to characterize how a ubiquitous instance of FC\n(direct link) is supported by neural pathways (detour) physically wired by SC,\nwhich forms a cyclic loop interacted by brain structure and function. In the\nclich\\'e of machine learning, the multi-hop detour pathway underlying SC-FC\ncoupling allows us to devise a novel multi-head self-attention mechanism within\nTransformer to capture multi-modal feature representation from paired graphs of\nSC and FC. Taken together, we propose a biological-inspired deep model, coined\nas NeuroPath, to find putative connectomic feature representations from the\nunprecedented amount of neuroimages, which can be plugged into various\ndownstream applications such as task recognition and disease diagnosis. We have\nevaluated NeuroPath on large-scale public datasets including HCP and UK Biobank\nunder supervised and zero-shot learning, where the state-of-the-art performance\nby our NeuroPath indicates great potential in network neuroscience.\n","authors":["Ziquan Wei","Tingting Dan","Jiaqi Ding","Paul J Laurienti","Guorong Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17510v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.00591v4","updated":"2024-09-26T03:34:23Z","published":"2024-08-01T14:20:47Z","title":"Regional quality estimation for echocardiography using deep learning","summary":" Automatic estimation of cardiac ultrasound image quality can be beneficial\nfor guiding operators and ensuring the accuracy of clinical measurements.\nPrevious work often fails to distinguish the view correctness of the\nechocardiogram from the image quality. Additionally, previous studies only\nprovide a global image quality value, which limits their practical utility. In\nthis work, we developed and compared three methods to estimate image quality:\n1) classic pixel-based metrics like the generalized contrast-to-noise ratio\n(gCNR) on myocardial segments as region of interest and left ventricle lumen as\nbackground, obtained using a U-Net segmentation 2) local image coherence\nderived from a U-Net model that predicts coherence from B-Mode images 3) a deep\nconvolutional network that predicts the quality of each region directly in an\nend-to-end fashion. We evaluate each method against manual regional image\nquality annotations by three experienced cardiologists. The results indicate\npoor performance of the gCNR metric, with Spearman correlation to the\nannotations of rho = 0.24. The end-to-end learning model obtains the best\nresult, rho = 0.69, comparable to the inter-observer correlation, rho = 0.63.\nFinally, the coherence-based method, with rho = 0.58, outperformed the\nclassical metrics and is more generic than the end-to-end approach. The image\nquality prediction tool is available as an open source Python library at\nhttps://github.com/GillesVanDeVyver/arqee.\n","authors":["Gilles Van De Vyver","Svein-Erik Måsøy","Håvard Dalen","Bjørnar Leangen Grenne","Espen Holte","Sindre Hellum Olaisen","John Nyberg","Andreas Østvik","Lasse Løvstakken","Erik Smistad"],"pdf_url":"https://arxiv.org/pdf/2408.00591v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17508v1","updated":"2024-09-26T03:33:26Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization, recent advances primarily focus on improving the LLM\ncomponents, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector. Extensive ablation experiments\nvalidate the effectiveness of introducing CMoE under any configuration, with up\nto an average 8% performance gains. We further provide interpretation analysis\nof the tug-of-war problem from the perspective of gradient optimization and\nparameter statistics. Compared to previous state-of-the-art medical MLLMs,\nUni-Med achieves competitive or superior evaluation metrics on diverse tasks.\nCode, data and model will be soon available at GitHub.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21341v2","updated":"2024-09-26T03:31:28Z","published":"2024-07-31T05:15:24Z","title":"High-throughput 3D shape completion of potato tubers on a harvester","summary":" Potato yield is an important metric for farmers to further optimize their\ncultivation practices. Potato yield can be estimated on a harvester using an\nRGB-D camera that can estimate the three-dimensional (3D) volume of individual\npotato tubers. A challenge, however, is that the 3D shape derived from RGB-D\nimages is only partially completed, underestimating the actual volume. To\naddress this issue, we developed a 3D shape completion network, called CoRe++,\nwhich can complete the 3D shape from RGB-D images. CoRe++ is a deep learning\nnetwork that consists of a convolutional encoder and a decoder. The encoder\ncompresses RGB-D images into latent vectors that are used by the decoder to\ncomplete the 3D shape using the deep signed distance field network (DeepSDF).\nTo evaluate our CoRe++ network, we collected partial and complete 3D point\nclouds of 339 potato tubers on an operational harvester in Japan. On the 1425\nRGB-D images in the test set (representing 51 unique potato tubers), our\nnetwork achieved a completion accuracy of 2.8 mm on average. For volumetric\nestimation, the root mean squared error (RMSE) was 22.6 ml, and this was better\nthan the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml).\nWe found that the RMSE can be further reduced to 18.2 ml when performing the 3D\nshape completion in the center of the RGB-D image. With an average 3D shape\ncompletion time of 10 milliseconds per tuber, we can conclude that CoRe++ is\nboth fast and accurate enough to be implemented on an operational harvester for\nhigh-throughput potato yield estimation. Our method can also be applied to\nother tuber, fruit and vegetable crops, thereby enabling versatile, accurate\nand real-time yield monitoring in precision agriculture. Our code, network\nweights and dataset are publicly available at\nhttps://github.com/UTokyo-FieldPhenomics-Lab/corepp.git.\n","authors":["Pieter M. Blok","Federico Magistri","Cyrill Stachniss","Haozhou Wang","James Burridge","Wei Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21341v2.pdf","comment":"20 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.17503v1","updated":"2024-09-26T03:21:21Z","published":"2024-09-26T03:21:21Z","title":"Shape-intensity knowledge distillation for robust medical image\n segmentation","summary":" Many medical image segmentation methods have achieved impressive results.\nYet, most existing methods do not take into account the shape-intensity prior\ninformation. This may lead to implausible segmentation results, in particular\nfor images of unseen datasets. In this paper, we propose a novel approach to\nincorporate joint shape-intensity prior information into the segmentation\nnetwork. Specifically, we first train a segmentation network (regarded as the\nteacher network) on class-wise averaged training images to extract valuable\nshape-intensity information, which is then transferred to a student\nsegmentation network with the same network architecture as the teacher via\nknowledge distillation. In this way, the student network regarded as the final\nsegmentation model can effectively integrate the shape-intensity prior\ninformation, yielding more accurate segmentation results. Despite its\nsimplicity, experiments on five medical image segmentation tasks of different\nmodalities demonstrate that the proposed Shape-Intensity Knowledge Distillation\n(SIKD) consistently improves several baseline models (including recent MaxStyle\nand SAMed) under intra-dataset evaluation, and significantly improves the\ncross-dataset generalization ability. The code is available at\nhttps://github.com/whdong-whu/SIKD.\n","authors":["Wenhui Dong","Bo Du","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2401.04585v2","updated":"2024-09-26T02:53:15Z","published":"2024-01-09T14:42:49Z","title":"EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization\n of Diffusion Models","summary":" Diffusion models have achieved great success in image generation tasks\nthrough iterative noise estimation. However, the heavy denoising process and\ncomplex neural networks hinder their low-latency applications in real-world\nscenarios. Quantization can effectively reduce model complexity, and\npost-training quantization (PTQ), which does not require fine-tuning, is highly\npromising for compressing and accelerating diffusion models. Unfortunately, we\nfind that due to the highly dynamic distribution of activations in different\ndenoising steps, existing PTQ methods for diffusion models suffer from\ndistribution mismatch issues at both calibration sample level and\nreconstruction output level, which makes the performance far from satisfactory,\nespecially in low-bit cases. In this paper, we propose Enhanced Distribution\nAlignment for Post-Training Quantization of Diffusion Models (EDA-DM) to\naddress the above issues. Specifically, at the calibration sample level, we\nselect calibration samples based on the density and variety in the latent\nspace, thus facilitating the alignment of their distribution with the overall\nsamples; and at the reconstruction output level, we modify the loss of block\nreconstruction with the losses of layers, aligning the outputs of quantized\nmodel and full-precision model at different network granularity. Extensive\nexperiments demonstrate that EDA-DM significantly outperforms the existing PTQ\nmethods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and\ndifferent datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO).\n","authors":["Xuewen Liu","Zhikai Li","Junrui Xiao","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2401.04585v2.pdf","comment":"Code: http://github.com/BienLuky/EDA-DM"},{"id":"http://arxiv.org/abs/2409.17487v1","updated":"2024-09-26T02:49:51Z","published":"2024-09-26T02:49:51Z","title":"Learning Quantized Adaptive Conditions for Diffusion Models","summary":" The curvature of ODE trajectories in diffusion models hinders their ability\nto generate high-quality images in a few number of function evaluations (NFE).\nIn this paper, we propose a novel and effective approach to reduce trajectory\ncurvature by utilizing adaptive conditions. By employing a extremely\nlight-weight quantized encoder, our method incurs only an additional 1% of\ntraining parameters, eliminates the need for extra regularization terms, yet\nachieves significantly better sample quality. Our approach accelerates ODE\nsampling while preserving the downstream task image editing capabilities of SDE\ntechniques. Extensive experiments verify that our method can generate high\nquality results under extremely limited sampling costs. With only 6 NFE, we\nachieve 5.14 FID on CIFAR-10, 6.91 FID on FFHQ 64x64 and 3.10 FID on AFHQv2.\n","authors":["Yuchen Liang","Yuchuan Tian","Lei Yu","Huao Tang","Jie Hu","Xiangzhong Fang","Hanting Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17486v1","updated":"2024-09-26T02:48:15Z","published":"2024-09-26T02:48:15Z","title":"Global-Local Medical SAM Adaptor Based on Full Adaption","summary":" Emerging of visual language models, such as the segment anything model (SAM),\nhave made great breakthroughs in the field of universal semantic segmentation\nand significantly aid the improvements of medical image segmentation, in\nparticular with the help of Medical SAM adaptor (Med-SA). However, Med-SA still\ncan be improved, as it fine-tunes SAM in a partial adaption manner. To resolve\nthis problem, we present a novel global medical SAM adaptor (GMed-SA) with full\nadaption, which can adapt SAM globally. We further combine GMed-SA and Med-SA\nto propose a global-local medical SAM adaptor (GLMed-SA) to adapt SAM both\nglobally and locally. Extensive experiments have been performed on the\nchallenging public 2D melanoma segmentation dataset. The results show that\nGLMed-SA outperforms several state-of-the-art semantic segmentation methods on\nvarious evaluation metrics, demonstrating the superiority of our methods.\n","authors":["Meng Wang","Yarong Feng","Yongwei Tang","Tian Zhang","Yuxin Liang","Chao Lv"],"pdf_url":"https://arxiv.org/pdf/2409.17486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17485v1","updated":"2024-09-26T02:47:41Z","published":"2024-09-26T02:47:41Z","title":"Revisiting Deep Ensemble Uncertainty for Enhanced Medical Anomaly\n Detection","summary":" Medical anomaly detection (AD) is crucial in pathological identification and\nlocalization. Current methods typically rely on uncertainty estimation in deep\nensembles to detect anomalies, assuming that ensemble learners should agree on\nnormal samples while exhibiting disagreement on unseen anomalies in the output\nspace. However, these methods may suffer from inadequate disagreement on\nanomalies or diminished agreement on normal samples. To tackle these issues, we\npropose D2UE, a Diversified Dual-space Uncertainty Estimation framework for\nmedical anomaly detection. To effectively balance agreement and disagreement\nfor anomaly detection, we propose Redundancy-Aware Repulsion (RAR), which uses\na similarity kernel that remains invariant to both isotropic scaling and\northogonal transformations, explicitly promoting diversity in learners' feature\nspace. Moreover, to accentuate anomalous regions, we develop Dual-Space\nUncertainty (DSU), which utilizes the ensemble's uncertainty in input and\noutput spaces. In input space, we first calculate gradients of reconstruction\nerror with respect to input images. The gradients are then integrated with\nreconstruction outputs to estimate uncertainty for inputs, enabling effective\nanomaly discrimination even when output space disagreement is minimal. We\nconduct a comprehensive evaluation of five medical benchmarks with different\nbackbones. Experimental results demonstrate the superiority of our method to\nstate-of-the-art methods and the effectiveness of each component in our\nframework. Our code is available at https://github.com/Rubiscol/D2UE.\n","authors":["Yi Gu","Yi Lin","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17485v1.pdf","comment":"Early accepted by MICCAI2024"},{"id":"http://arxiv.org/abs/2409.18346v1","updated":"2024-09-26T23:48:08Z","published":"2024-09-26T23:48:08Z","title":"MultiClimate: Multimodal Stance Detection on Climate Change Videos","summary":" Climate change (CC) has attracted increasing attention in NLP in recent\nyears. However, detecting the stance on CC in multimodal data is understudied\nand remains challenging due to a lack of reliable datasets. To improve the\nunderstanding of public opinions and communication strategies, this paper\npresents MultiClimate, the first open-source manually-annotated stance\ndetection dataset with $100$ CC-related YouTube videos and $4,209$\nframe-transcript pairs. We deploy state-of-the-art vision and language models,\nas well as multimodal models for MultiClimate stance detection. Results show\nthat text-only BERT significantly outperforms image-only ResNet50 and ViT.\nCombining both modalities achieves state-of-the-art, $0.747$/$0.749$ in\naccuracy/F1. Our 100M-sized fusion models also beat CLIP and BLIP, as well as\nthe much larger 9B-sized multimodal IDEFICS and text-only Llama3 and Gemma2,\nindicating that multimodal stance detection remains challenging for large\nlanguage models. Our code, dataset, as well as supplementary materials, are\navailable at https://github.com/werywjw/MultiClimate.\n","authors":["Jiawen Wang","Longfei Zuo","Siyao Peng","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2409.18346v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.18341v1","updated":"2024-09-26T23:30:48Z","published":"2024-09-26T23:30:48Z","title":"Does End-to-End Autonomous Driving Really Need Perception Tasks?","summary":" End-to-End Autonomous Driving (E2EAD) methods typically rely on supervised\nperception tasks to extract explicit scene information (e.g., objects, maps).\nThis reliance necessitates expensive annotations and constrains deployment and\ndata scalability in real-time applications. In this paper, we introduce SSR, a\nnovel framework that utilizes only 16 navigation-guided tokens as Sparse Scene\nRepresentation, efficiently extracting crucial scene information for E2EAD. Our\nmethod eliminates the need for supervised sub-tasks, allowing computational\nresources to concentrate on essential elements directly related to navigation\nintent. We further introduce a temporal enhancement module that employs a\nBird's-Eye View (BEV) world model, aligning predicted future scenes with actual\nfuture scenes through self-supervision. SSR achieves state-of-the-art planning\nperformance on the nuScenes dataset, demonstrating a 27.2\\% relative reduction\nin L2 error and a 51.6\\% decrease in collision rate to the leading E2EAD\nmethod, UniAD. Moreover, SSR offers a 10.9$\\times$ faster inference speed and\n13$\\times$ faster training time. This framework represents a significant leap\nin real-time autonomous driving systems and paves the way for future scalable\ndeployment. Code will be released at \\url{https://github.com/PeidongLi/SSR}.\n","authors":["Peidong Li","Dixiao Cui"],"pdf_url":"https://arxiv.org/pdf/2409.18341v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2409.18340v1","updated":"2024-09-26T23:30:40Z","published":"2024-09-26T23:30:40Z","title":"DRL-STNet: Unsupervised Domain Adaptation for Cross-modality Medical\n Image Segmentation via Disentangled Representation Learning","summary":" Unsupervised domain adaptation (UDA) is essential for medical image\nsegmentation, especially in cross-modality data scenarios. UDA aims to transfer\nknowledge from a labeled source domain to an unlabeled target domain, thereby\nreducing the dependency on extensive manual annotations. This paper presents\nDRL-STNet, a novel framework for cross-modality medical image segmentation that\nleverages generative adversarial networks (GANs), disentangled representation\nlearning (DRL), and self-training (ST). Our method leverages DRL within a GAN\nto translate images from the source to the target modality. Then, the\nsegmentation model is initially trained with these translated images and\ncorresponding source labels and then fine-tuned iteratively using a combination\nof synthetic and real images with pseudo-labels and real labels. The proposed\nframework exhibits superior performance in abdominal organ segmentation on the\nFLARE challenge dataset, surpassing state-of-the-art methods by 11.4% in the\nDice similarity coefficient and by 13.1% in the Normalized Surface Dice metric,\nachieving scores of 74.21% and 80.69%, respectively. The average running time\nis 41 seconds, and the area under the GPU memory-time curve is 11,292 MB. These\nresults indicate the potential of DRL-STNet for enhancing cross-modality\nmedical image segmentation tasks.\n","authors":["Hui Lin","Florian Schiffers","Santiago López-Tapia","Neda Tavakoli","Daniel Kim","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2409.18340v1.pdf","comment":"MICCAI 2024 Challenge, FLARE Challenge, Unsupervised domain\n adaptation, Organ segmentation, Feature disentanglement, Self-training"},{"id":"http://arxiv.org/abs/2409.18337v1","updated":"2024-09-26T23:19:44Z","published":"2024-09-26T23:19:44Z","title":"Photon Inhibition for Energy-Efficient Single-Photon Imaging","summary":" Single-photon cameras (SPCs) are emerging as sensors of choice for various\nchallenging imaging applications. One class of SPCs based on the single-photon\navalanche diode (SPAD) detects individual photons using an avalanche process;\nthe raw photon data can then be processed to extract scene information under\nextremely low light, high dynamic range, and rapid motion. Yet, single-photon\nsensitivity in SPADs comes at a cost -- each photon detection consumes more\nenergy than that of a CMOS camera. This avalanche power significantly limits\nsensor resolution and could restrict widespread adoption of SPAD-based SPCs. We\npropose a computational-imaging approach called \\emph{photon inhibition} to\naddress this challenge. Photon inhibition strategically allocates detections in\nspace and time based on downstream inference task goals and resource\nconstraints. We develop lightweight, on-sensor computational inhibition\npolicies that use past photon data to disable SPAD pixels in real-time, to\nselect the most informative future photons. As case studies, we design policies\ntailored for image reconstruction and edge detection, and demonstrate, both via\nsimulations and real SPC captured data, considerable reduction in photon\ndetections (over 90\\% of photons) while maintaining task performance metrics.\nOur work raises the question of ``which photons should be detected?'', and\npaves the way for future energy-efficient single-photon imaging.\n","authors":["Lucas J. Koerner","Shantanu Gupta","Atul Ingle","Mohit Gupta"],"pdf_url":"https://arxiv.org/pdf/2409.18337v1.pdf","comment":"Accepted for ECCV 2024. Supplementary material and code available at\n https://wisionlab.com/project/inhibition"},{"id":"http://arxiv.org/abs/2409.18336v1","updated":"2024-09-26T23:18:25Z","published":"2024-09-26T23:18:25Z","title":"DeBaRA: Denoising-Based 3D Room Arrangement Generation","summary":" Generating realistic and diverse layouts of furnished indoor 3D scenes\nunlocks multiple interactive applications impacting a wide range of industries.\nThe inherent complexity of object interactions, the limited amount of available\ndata and the requirement to fulfill spatial constraints all make generative\nmodeling for 3D scene synthesis and arrangement challenging. Current methods\naddress these challenges autoregressively or by using off-the-shelf diffusion\nobjectives by simultaneously predicting all attributes without 3D reasoning\nconsiderations. In this paper, we introduce DeBaRA, a score-based model\nspecifically tailored for precise, controllable and flexible arrangement\ngeneration in a bounded environment. We argue that the most critical component\nof a scene synthesis system is to accurately establish the size and position of\nvarious objects within a restricted area. Based on this insight, we propose a\nlightweight conditional score-based model designed with 3D spatial awareness at\nits core. We demonstrate that by focusing on spatial attributes of objects, a\nsingle trained DeBaRA model can be leveraged at test time to perform several\ndownstream applications such as scene synthesis, completion and re-arrangement.\nFurther, we introduce a novel Self Score Evaluation procedure so it can be\noptimally employed alongside external LLM models. We evaluate our approach\nthrough extensive experiments and demonstrate significant improvement upon\nstate-of-the-art approaches in a range of scenarios.\n","authors":["Léopold Maillard","Nicolas Sereyjol-Garros","Tom Durand","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2409.18336v1.pdf","comment":"Accepted at NeurIPS 2024. Preprint version"},{"id":"http://arxiv.org/abs/2312.00094v3","updated":"2024-09-26T23:14:27Z","published":"2023-11-30T13:07:19Z","title":"Fast ODE-based Sampling for Diffusion Models in Around 5 Steps","summary":" Sampling from diffusion models can be treated as solving the corresponding\nordinary differential equations (ODEs), with the aim of obtaining an accurate\nsolution with as few number of function evaluations (NFE) as possible.\nRecently, various fast samplers utilizing higher-order ODE solvers have emerged\nand achieved better performance than the initial first-order one. However,\nthese numerical methods inherently result in certain approximation errors,\nwhich significantly degrades sample quality with extremely small NFE (e.g.,\naround 5). In contrast, based on the geometric observation that each sampling\ntrajectory almost lies in a two-dimensional subspace embedded in the ambient\nspace, we propose Approximate MEan-Direction Solver (AMED-Solver) that\neliminates truncation errors by directly learning the mean direction for fast\ndiffusion sampling. Besides, our method can be easily used as a plugin to\nfurther improve existing ODE-based samplers. Extensive experiments on image\nsynthesis with the resolution ranging from 32 to 512 demonstrate the\neffectiveness of our method. With only 5 NFE, we achieve 6.61 FID on CIFAR-10,\n10.74 FID on ImageNet 64$\\times$64, and 13.20 FID on LSUN Bedroom. Our code is\navailable at https://github.com/zju-pi/diff-sampler.\n","authors":["Zhenyu Zhou","Defang Chen","Can Wang","Chun Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00094v3.pdf","comment":"Accepted by CVPR 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2409.18326v1","updated":"2024-09-26T22:44:00Z","published":"2024-09-26T22:44:00Z","title":"Automated Segmentation and Analysis of Microscopy Images of Laser Powder\n Bed Fusion Melt Tracks","summary":" With the increasing adoption of metal additive manufacturing (AM),\nresearchers and practitioners are turning to data-driven approaches to optimise\nprinting conditions. Cross-sectional images of melt tracks provide valuable\ninformation for tuning process parameters, developing parameter scaling data,\nand identifying defects. Here we present an image segmentation neural network\nthat automatically identifies and measures melt track dimensions from a\ncross-section image. We use a U-Net architecture to train on a data set of 62\npre-labelled images obtained from different labs, machines, and materials\ncoupled with image augmentation. When neural network hyperparameters such as\nbatch size and learning rate are properly tuned, the learned model shows an\naccuracy for classification of over 99% and an F1 score over 90%. The neural\nnetwork exhibits robustness when tested on images captured by various users,\nprinted on different machines, and acquired using different microscopes. A\npost-processing module extracts the height and width of the melt pool, and the\nwetting angles. We discuss opportunities to improve model performance and\navenues for transfer learning, such as extension to other AM processes such as\ndirected energy deposition.\n","authors":["Aagam Shah","Reimar Weissbach","David A. Griggs","A. John Hart","Elif Ertekin","Sameh Tawfick"],"pdf_url":"https://arxiv.org/pdf/2409.18326v1.pdf","comment":"21 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.02012v2","updated":"2024-09-26T22:28:17Z","published":"2024-08-04T12:48:20Z","title":"Decision Support System to triage of liver trauma","summary":" Trauma significantly impacts global health, accounting for over 5 million\ndeaths annually, which is comparable to mortality rates from diseases such as\ntuberculosis, AIDS, and malaria. In Iran, the financial repercussions of road\ntraffic accidents represent approximately 2% of the nation's Gross National\nProduct each year. Bleeding is the leading cause of mortality in trauma\npatients within the first 24 hours following an injury, making rapid diagnosis\nand assessment of severity crucial. Trauma patients require comprehensive scans\nof all organs, generating a large volume of data. Evaluating CT images for the\nentire body is time-consuming and requires significant expertise, underscoring\nthe need for efficient time management in diagnosis. Efficient diagnostic\nprocesses can significantly reduce treatment costs and decrease the likelihood\nof secondary complications. In this context, the development of a reliable\nDecision Support System (DSS) for trauma triage, particularly focused on the\nabdominal area, is vital. This paper presents a novel method for detecting\nliver bleeding and lacerations using CT scans, utilising the GAN Pix2Pix\ntranslation model. The effectiveness of the method is quantified by Dice score\nmetrics, with the model achieving an accuracy of 97% for liver bleeding and 93%\nfor liver laceration detection. These results represent a notable improvement\nover current state-of-the-art technologies. The system's design integrates\nseamlessly with existing medical imaging technologies, making it a practical\naddition to emergency medical services. This research underscores the potential\nof advanced image translation models like GAN Pix2Pix in improving the\nprecision and speed of medical diagnostics in critical care scenarios.\n","authors":["Ali Jamali","Azadeh Nazemi","Ashkan Sami","Rosemina Bahrololoom","Shahram Paydar","Alireza Shakibafar"],"pdf_url":"https://arxiv.org/pdf/2408.02012v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15997v2","updated":"2024-09-26T21:56:01Z","published":"2024-09-24T11:57:12Z","title":"Improvements to SDXL in NovelAI Diffusion V3","summary":" In this technical report, we document the changes we made to SDXL in the\nprocess of training NovelAI Diffusion V3, our state of the art anime image\ngeneration model.\n","authors":["Juan Ossa","Eren Doğan","Alex Birch","F. Johnson"],"pdf_url":"https://arxiv.org/pdf/2409.15997v2.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.18314v1","updated":"2024-09-26T21:44:20Z","published":"2024-09-26T21:44:20Z","title":"Realistic Evaluation of Model Merging for Compositional Generalization","summary":" Merging has become a widespread way to cheaply combine individual models into\na single model that inherits their capabilities and attains better performance.\nThis popularity has spurred rapid development of many new merging methods,\nwhich are typically validated in disparate experimental settings and frequently\ndiffer in the assumptions made about model architecture, data availability, and\ncomputational budget. In this work, we characterize the relative merits of\ndifferent merging methods by evaluating them in a shared experimental setting\nand precisely identifying the practical requirements of each method.\nSpecifically, our setting focuses on using merging for compositional\ngeneralization of capabilities in image classification, image generation, and\nnatural language processing. Additionally, we measure the computational costs\nof different merging methods as well as how they perform when scaling the\nnumber of models being merged. Taken together, our results clarify the state of\nthe field of model merging and provide a comprehensive and rigorous\nexperimental setup to test new methods.\n","authors":["Derek Tam","Yash Kant","Brian Lester","Igor Gilitschenski","Colin Raffel"],"pdf_url":"https://arxiv.org/pdf/2409.18314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19863v2","updated":"2024-09-26T21:32:36Z","published":"2024-03-28T22:17:19Z","title":"DeNetDM: Debiasing by Network Depth Modulation","summary":" When neural networks are trained on biased datasets, they tend to\ninadvertently learn spurious correlations, leading to challenges in achieving\nstrong generalization and robustness. Current approaches to address such biases\ntypically involve utilizing bias annotations, reweighting based on pseudo-bias\nlabels, or enhancing diversity within bias-conflicting data points through\naugmentation techniques. We introduce DeNetDM, a novel debiasing method based\non the observation that shallow neural networks prioritize learning core\nattributes, while deeper ones emphasize biases when tasked with acquiring\ndistinct information. Using a training paradigm derived from Product of\nExperts, we create both biased and debiased branches with deep and shallow\narchitectures and then distill knowledge to produce the target debiased model.\nExtensive experiments and analyses demonstrate that our approach outperforms\ncurrent debiasing techniques, achieving a notable improvement of around 5% in\nthree datasets, encompassing both synthetic and real-world data. Remarkably,\nDeNetDM accomplishes this without requiring annotations pertaining to bias\nlabels or bias types, while still delivering performance on par with supervised\ncounterparts. Furthermore, our approach effectively harnesses the diversity of\nbias-conflicting points within the data, surpassing previous methods and\nobviating the need for explicit augmentation-based methods to enhance the\ndiversity of such bias-conflicting points. The source code will be available\nupon acceptance.\n","authors":["Silpa Vadakkeeveetil Sreelatha","Adarsh Kappiyath","Abhra Chaudhuri","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.19863v2.pdf","comment":"Accepted to NeurIPS 2024, * indicates these authors contributed\n equally"},{"id":"http://arxiv.org/abs/2312.10237v5","updated":"2024-09-26T21:24:00Z","published":"2023-12-15T22:09:04Z","title":"A Distributed Privacy Preserving Model for the Detection of Alzheimer's\n Disease","summary":" In the era of rapidly advancing medical technologies, the segmentation of\nmedical data has become inevitable, necessitating the development of privacy\npreserving machine learning algorithms that can train on distributed data.\nConsolidating sensitive medical data is not always an option particularly due\nto the stringent privacy regulations imposed by the Health Insurance\nPortability and Accountability Act (HIPAA). In this paper, I introduce a HIPAA\ncompliant framework that can train from distributed data. I then propose a\nmultimodal vertical federated model for Alzheimer's Disease (AD) detection, a\nserious neurodegenerative condition that can cause dementia, severely impairing\nbrain function and hindering simple tasks, especially without preventative\ncare. This vertical federated learning (VFL) model offers a distributed\narchitecture that enables collaborative learning across diverse sources of\nmedical data while respecting privacy constraints imposed by HIPAA. The VFL\narchitecture proposed herein offers a novel distributed architecture, enabling\ncollaborative learning across diverse sources of medical data while respecting\nstatutory privacy constraints. By leveraging multiple modalities of data, the\nrobustness and accuracy of AD detection can be enhanced. This model not only\ncontributes to the advancement of federated learning techniques but also holds\npromise for overcoming the hurdles posed by data segmentation in medical\nresearch.\n","authors":["Paul K. Mandal"],"pdf_url":"https://arxiv.org/pdf/2312.10237v5.pdf","comment":"15 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.18301v1","updated":"2024-09-26T21:16:51Z","published":"2024-09-26T21:16:51Z","title":"Harnessing Wavelet Transformations for Generalizable Deepfake Forgery\n Detection","summary":" The evolution of digital image manipulation, particularly with the\nadvancement of deep generative models, significantly challenges existing\ndeepfake detection methods, especially when the origin of the deepfake is\nobscure. To tackle the increasing complexity of these forgeries, we propose\n\\textbf{Wavelet-CLIP}, a deepfake detection framework that integrates wavelet\ntransforms with features derived from the ViT-L/14 architecture, pre-trained in\nthe CLIP fashion. Wavelet-CLIP utilizes Wavelet Transforms to deeply analyze\nboth spatial and frequency features from images, thus enhancing the model's\ncapability to detect sophisticated deepfakes. To verify the effectiveness of\nour approach, we conducted extensive evaluations against existing\nstate-of-the-art methods for cross-dataset generalization and detection of\nunseen images generated by standard diffusion models. Our method showcases\noutstanding performance, achieving an average AUC of 0.749 for cross-data\ngeneralization and 0.893 for robustness against unseen deepfakes, outperforming\nall compared methods. The code can be reproduced from the repo:\n\\url{https://github.com/lalithbharadwajbaru/Wavelet-CLIP}\n","authors":["Lalith Bharadwaj Baru","Shilhora Akshay Patel","Rohit Boddeda"],"pdf_url":"https://arxiv.org/pdf/2409.18301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03022v2","updated":"2024-09-26T21:15:26Z","published":"2024-09-04T18:28:10Z","title":"Boundless: Generating Photorealistic Synthetic Data for Object Detection\n in Urban Streetscapes","summary":" We introduce Boundless, a photo-realistic synthetic data generation system\nfor enabling highly accurate object detection in dense urban streetscapes.\nBoundless can replace massive real-world data collection and manual\nground-truth object annotation (labeling) with an automated and configurable\nprocess. Boundless is based on the Unreal Engine 5 (UE5) City Sample project\nwith improvements enabling accurate collection of 3D bounding boxes across\ndifferent lighting and scene variability conditions.\n We evaluate the performance of object detection models trained on the dataset\ngenerated by Boundless when used for inference on a real-world dataset acquired\nfrom medium-altitude cameras. We compare the performance of the\nBoundless-trained model against the CARLA-trained model and observe an\nimprovement of 7.8 mAP. The results we achieved support the premise that\nsynthetic data generation is a credible methodology for training/fine-tuning\nscalable object detection models for urban scenes.\n","authors":["Mehmet Kerem Turkcan","Yuyang Li","Chengbo Zang","Javad Ghaderi","Gil Zussman","Zoran Kostic"],"pdf_url":"https://arxiv.org/pdf/2409.03022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18300v1","updated":"2024-09-26T21:15:22Z","published":"2024-09-26T21:15:22Z","title":"SOAR: Self-supervision Optimized UAV Action Recognition with Efficient\n Object-Aware Pretraining","summary":" We introduce SOAR, a novel Self-supervised pretraining algorithm for aerial\nfootage captured by Unmanned Aerial Vehicles (UAVs). We incorporate human\nobject knowledge throughout the pretraining process to enhance UAV video\npretraining efficiency and downstream action recognition performance. This is\nin contrast to prior works that primarily incorporate object information during\nthe fine-tuning stage. Specifically, we first propose a novel object-aware\nmasking strategy designed to retain the visibility of certain patches related\nto objects throughout the pretraining phase. Second, we introduce an\nobject-aware loss function that utilizes object information to adjust the\nreconstruction loss, preventing bias towards less informative background\npatches. In practice, SOAR with a vanilla ViT backbone, outperforms best UAV\naction recognition models, recording a 9.7% and 21.4% boost in top-1 accuracy\non the NEC-Drone and UAV-Human datasets, while delivering an inference speed of\n18.7ms per video, making it 2x to 5x faster. Additionally, SOAR obtains\ncomparable accuracy to prior self-supervised learning (SSL) methods while\nrequiring 87.5% less pretraining time and 25% less memory usage\n","authors":["Ruiqi Xian","Xiyang Wu","Tianrui Guan","Xijun Wang","Boqing Gong","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2409.18300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18297v1","updated":"2024-09-26T21:10:17Z","published":"2024-09-26T21:10:17Z","title":"Flat'n'Fold: A Diverse Multi-Modal Dataset for Garment Perception and\n Manipulation","summary":" We present Flat'n'Fold, a novel large-scale dataset for garment manipulation\nthat addresses critical gaps in existing datasets. Comprising 1,212 human and\n887 robot demonstrations of flattening and folding 44 unique garments across 8\ncategories, Flat'n'Fold surpasses prior datasets in size, scope, and diversity.\nOur dataset uniquely captures the entire manipulation process from crumpled to\nfolded states, providing synchronized multi-view RGB-D images, point clouds,\nand action data, including hand or gripper positions and rotations. We quantify\nthe dataset's diversity and complexity compared to existing benchmarks and show\nthat our dataset features natural and diverse manipulations of real-world\ndemonstrations of human and robot demonstrations in terms of visual and action\ninformation. To showcase Flat'n'Fold's utility, we establish new benchmarks for\ngrasping point prediction and subtask decomposition. Our evaluation of\nstate-of-the-art models on these tasks reveals significant room for\nimprovement. This underscores Flat'n'Fold's potential to drive advances in\nrobotic perception and manipulation of deformable objects. Our dataset can be\ndownloaded at https://cvas-ug.github.io/flat-n-fold\n","authors":["Lipeng Zhuang","Shiyu Fan","Yingdong Ru","Florent Audonnet","Paul Henderson","Gerardo Aragon-Camarasa"],"pdf_url":"https://arxiv.org/pdf/2409.18297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18291v1","updated":"2024-09-26T21:01:45Z","published":"2024-09-26T21:01:45Z","title":"Efficient Microscopic Image Instance Segmentation for Food Crystal\n Quality Control","summary":" This paper is directed towards the food crystal quality control area for\nmanufacturing, focusing on efficiently predicting food crystal counts and size\ndistributions. Previously, manufacturers used the manual counting method on\nmicroscopic images of food liquid products, which requires substantial human\neffort and suffers from inconsistency issues. Food crystal segmentation is a\nchallenging problem due to the diverse shapes of crystals and their surrounding\nhard mimics. To address this challenge, we propose an efficient instance\nsegmentation method based on object detection. Experimental results show that\nthe predicted crystal counting accuracy of our method is comparable with\nexisting segmentation methods, while being five times faster. Based on our\nexperiments, we also define objective criteria for separating hard mimics and\nfood crystals, which could benefit manual annotation tasks on similar dataset.\n","authors":["Xiaoyu Ji","Jan P Allebach","Ali Shakouri","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.18291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14979v2","updated":"2024-09-26T21:00:29Z","published":"2024-07-20T21:06:33Z","title":"RGB2Point: 3D Point Cloud Generation from Single RGB Images","summary":" We introduce RGB2Point, an unposed single-view RGB image to a 3D point cloud\ngeneration based on Transformer. RGB2Point takes an input image of an object\nand generates a dense 3D point cloud. Contrary to prior works based on CNN\nlayers and diffusion denoising approaches, we use pre-trained Transformer\nlayers that are fast and generate high-quality point clouds with consistent\nquality over available categories. Our generated point clouds demonstrate high\nquality on a real-world dataset, as evidenced by improved Chamfer distance\n(51.15%) and Earth Mover's distance (45.96%) metrics compared to the current\nstate-of-the-art. Additionally, our approach shows a better quality on a\nsynthetic dataset, achieving better Chamfer distance (39.26%), Earth Mover's\ndistance (26.95%), and F-score (47.16%). Moreover, our method produces 63.1%\nmore consistent high-quality results across various object categories compared\nto prior works. Furthermore, RGB2Point is computationally efficient, requiring\nonly 2.3GB of VRAM to reconstruct a 3D point cloud from a single RGB image, and\nour implementation generates the results 15,133x faster than a SOTA\ndiffusion-based model.\n","authors":["Jae Joong Lee","Bedrich Benes"],"pdf_url":"https://arxiv.org/pdf/2407.14979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18286v1","updated":"2024-09-26T20:58:11Z","published":"2024-09-26T20:58:11Z","title":"Advancing Object Detection in Transportation with Multimodal Large\n Language Models (MLLMs): A Comprehensive Review and Empirical Testing","summary":" This study aims to comprehensively review and empirically evaluate the\napplication of multimodal large language models (MLLMs) and Large Vision Models\n(VLMs) in object detection for transportation systems. In the first fold, we\nprovide a background about the potential benefits of MLLMs in transportation\napplications and conduct a comprehensive review of current MLLM technologies in\nprevious studies. We highlight their effectiveness and limitations in object\ndetection within various transportation scenarios. The second fold involves\nproviding an overview of the taxonomy of end-to-end object detection in\ntransportation applications and future directions. Building on this, we\nproposed empirical analysis for testing MLLMs on three real-world\ntransportation problems that include object detection tasks namely, road safety\nattributes extraction, safety-critical event detection, and visual reasoning of\nthermal images. Our findings provide a detailed assessment of MLLM performance,\nuncovering both strengths and areas for improvement. Finally, we discuss\npractical limitations and challenges of MLLMs in enhancing object detection in\ntransportation, thereby offering a roadmap for future research and development\nin this critical area.\n","authors":["Huthaifa I. Ashqar","Ahmed Jaber","Taqwa I. Alhadidi","Mohammed Elhenawy"],"pdf_url":"https://arxiv.org/pdf/2409.18286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18282v1","updated":"2024-09-26T20:51:59Z","published":"2024-09-26T20:51:59Z","title":"Synthesizing beta-amyloid PET images from T1-weighted Structural MRI: A\n Preliminary Study","summary":" Beta-amyloid positron emission tomography (A$\\beta$-PET) imaging has become a\ncritical tool in Alzheimer's disease (AD) research and diagnosis, providing\ninsights into the pathological accumulation of amyloid plaques, one of the\nhallmarks of AD. However, the high cost, limited availability, and exposure to\nradioactivity restrict the widespread use of A$\\beta$-PET imaging, leading to a\nscarcity of comprehensive datasets. Previous studies have suggested that\nstructural magnetic resonance imaging (MRI), which is more readily available,\nmay serve as a viable alternative for synthesizing A$\\beta$-PET images. In this\nstudy, we propose an approach to utilize 3D diffusion models to synthesize\nA$\\beta$-PET images from T1-weighted MRI scans, aiming to overcome the\nlimitations associated with direct PET imaging. Our method generates\nhigh-quality A$\\beta$-PET images for cognitive normal cases, although it is\nless effective for mild cognitive impairment (MCI) patients due to the\nvariability in A$\\beta$ deposition patterns among subjects. Our preliminary\nresults suggest that incorporating additional data, such as a larger sample of\nMCI cases and multi-modality information including clinical and demographic\ndetails, cognitive and functional assessments, and longitudinal data, may be\nnecessary to improve A$\\beta$-PET image synthesis for MCI patients.\n","authors":["Qing Lyu","Jin Young Kim","Jeongchul Kim","Christopher T Whitlow"],"pdf_url":"https://arxiv.org/pdf/2409.18282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16005v2","updated":"2024-09-26T20:37:52Z","published":"2024-05-25T02:02:08Z","title":"PTQ4DiT: Post-training Quantization for Diffusion Transformers","summary":" The recent introduction of Diffusion Transformers (DiTs) has demonstrated\nexceptional capabilities in image generation by using a different backbone\narchitecture, departing from traditional U-Nets and embracing the scalable\nnature of transformers. Despite their advanced capabilities, the wide\ndeployment of DiTs, particularly for real-time applications, is currently\nhampered by considerable computational demands at the inference stage.\nPost-training Quantization (PTQ) has emerged as a fast and data-efficient\nsolution that can significantly reduce computation and memory footprint by\nusing low-bit weights and activations. However, its applicability to DiTs has\nnot yet been explored and faces non-trivial difficulties due to the unique\ndesign of DiTs. In this paper, we propose PTQ4DiT, a specifically designed PTQ\nmethod for DiTs. We discover two primary quantization challenges inherent in\nDiTs, notably the presence of salient channels with extreme magnitudes and the\ntemporal variability in distributions of salient activation over multiple\ntimesteps. To tackle these challenges, we propose Channel-wise Salience\nBalancing (CSB) and Spearmen's $\\rho$-guided Salience Calibration (SSC). CSB\nleverages the complementarity property of channel magnitudes to redistribute\nthe extremes, alleviating quantization errors for both activations and weights.\nSSC extends this approach by dynamically adjusting the balanced salience to\ncapture the temporal variations in activation. Additionally, to eliminate extra\ncomputational costs caused by PTQ4DiT during inference, we design an offline\nre-parameterization strategy for DiTs. Experiments demonstrate that our PTQ4DiT\nsuccessfully quantizes DiTs to 8-bit precision (W8A8) while preserving\ncomparable generation ability and further enables effective quantization to\n4-bit weight precision (W4A8) for the first time.\n","authors":["Junyi Wu","Haoxuan Wang","Yuzhang Shang","Mubarak Shah","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2405.16005v2.pdf","comment":"NeurIPS 2024. Code is available at\n https://github.com/adreamwu/PTQ4DiT"},{"id":"http://arxiv.org/abs/2405.04378v4","updated":"2024-09-26T20:32:55Z","published":"2024-05-07T15:00:19Z","title":"Splat-MOVER: Multi-Stage, Open-Vocabulary Robotic Manipulation via\n Editable Gaussian Splatting","summary":" We present Splat-MOVER, a modular robotics stack for open-vocabulary robotic\nmanipulation, which leverages the editability of Gaussian Splatting (GSplat)\nscene representations to enable multi-stage manipulation tasks. Splat-MOVER\nconsists of: (i) ASK-Splat, a GSplat representation that distills semantic and\ngrasp affordance features into the 3D scene. ASK-Splat enables geometric,\nsemantic, and affordance understanding of 3D scenes, which is critical in many\nrobotics tasks; (ii) SEE-Splat, a real-time scene-editing module using 3D\nsemantic masking and infilling to visualize the motions of objects that result\nfrom robot interactions in the real-world. SEE-Splat creates a \"digital twin\"\nof the evolving environment throughout the manipulation task; and (iii)\nGrasp-Splat, a grasp generation module that uses ASK-Splat and SEE-Splat to\npropose affordance-aligned candidate grasps for open-world objects. ASK-Splat\nis trained in real-time from RGB images in a brief scanning phase prior to\noperation, while SEE-Splat and Grasp-Splat run in real-time during operation.\nWe demonstrate the superior performance of Splat-MOVER in hardware experiments\non a Kinova robot compared to two recent baselines in four single-stage,\nopen-vocabulary manipulation tasks and in four multi-stage manipulation tasks,\nusing the edited scene to reflect changes due to prior manipulation stages,\nwhich is not possible with existing baselines. Video demonstrations and the\ncode for the project are available at https://splatmover.github.io.\n","authors":["Ola Shorinwa","Johnathan Tucker","Aliyah Smith","Aiden Swann","Timothy Chen","Roya Firoozi","Monroe Kennedy III","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2405.04378v4.pdf","comment":"https://splatmover.github.io"},{"id":"http://arxiv.org/abs/2409.18265v1","updated":"2024-09-26T20:18:14Z","published":"2024-09-26T20:18:14Z","title":"Task-recency bias strikes back: Adapting covariances in Exemplar-Free\n Class Incremental Learning","summary":" Exemplar-Free Class Incremental Learning (EFCIL) tackles the problem of\ntraining a model on a sequence of tasks without access to past data. Existing\nstate-of-the-art methods represent classes as Gaussian distributions in the\nfeature extractor's latent space, enabling Bayes classification or training the\nclassifier by replaying pseudo features. However, we identify two critical\nissues that compromise their efficacy when the feature extractor is updated on\nincremental tasks. First, they do not consider that classes' covariance\nmatrices change and must be adapted after each task. Second, they are\nsusceptible to a task-recency bias caused by dimensionality collapse occurring\nduring training. In this work, we propose AdaGauss -- a novel method that\nadapts covariance matrices from task to task and mitigates the task-recency\nbias owing to the additional anti-collapse loss function. AdaGauss yields\nstate-of-the-art results on popular EFCIL benchmarks and datasets when training\nfrom scratch or starting from a pre-trained backbone. The code is available at:\nhttps://github.com/grypesc/AdaGauss.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2409.18265v1.pdf","comment":"Accepted for NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18261v1","updated":"2024-09-26T20:13:33Z","published":"2024-09-26T20:13:33Z","title":"Omni6D: Large-Vocabulary 3D Object Dataset for Category-Level 6D Object\n Pose Estimation","summary":" 6D object pose estimation aims at determining an object's translation,\nrotation, and scale, typically from a single RGBD image. Recent advancements\nhave expanded this estimation from instance-level to category-level, allowing\nmodels to generalize across unseen instances within the same category. However,\nthis generalization is limited by the narrow range of categories covered by\nexisting datasets, such as NOCS, which also tend to overlook common real-world\nchallenges like occlusion. To tackle these challenges, we introduce Omni6D, a\ncomprehensive RGBD dataset featuring a wide range of categories and varied\nbackgrounds, elevating the task to a more realistic context. 1) The dataset\ncomprises an extensive spectrum of 166 categories, 4688 instances adjusted to\nthe canonical pose, and over 0.8 million captures, significantly broadening the\nscope for evaluation. 2) We introduce a symmetry-aware metric and conduct\nsystematic benchmarks of existing algorithms on Omni6D, offering a thorough\nexploration of new challenges and insights. 3) Additionally, we propose an\neffective fine-tuning approach that adapts models from previous datasets to our\nextensive vocabulary setting. We believe this initiative will pave the way for\nnew insights and substantial progress in both the industrial and academic\nfields, pushing forward the boundaries of general 6D pose estimation.\n","authors":["Mengchen Zhang","Tong Wu","Tai Wang","Tengfei Wang","Ziwei Liu","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2409.18261v1.pdf","comment":"ECCV 2024 (poster). Github page: https://github.com/3DTopia/Omni6D"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.18110v1","updated":"2024-09-26T17:52:57Z","published":"2024-09-26T17:52:57Z","title":"Open-World Evaluation for Retrieving Diverse Perspectives","summary":" We study retrieving a set of documents that covers various perspectives on a\ncomplex and contentious question (e.g., will ChatGPT do more harm than good?).\nWe curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS),\nwhere each example consists of a question and diverse perspectives associated\nwith the question, sourced from survey questions and debate websites. On this\ndata, retrievers paired with a corpus are evaluated to surface a document set\nthat contains diverse perspectives. Our framing diverges from most retrieval\ntasks in that document relevancy cannot be decided by simple string matches to\nreferences. Instead, we build a language model based automatic evaluator that\ndecides whether each retrieved document contains a perspective. This allows us\nto evaluate the performance of three different types of corpus (Wikipedia, web\nsnapshot, and corpus constructed on the fly with retrieved pages from the\nsearch engine) paired with retrievers. Retrieving diverse documents remains\nchallenging, with the outputs from existing retrievers covering all\nperspectives on only 33.74% of the examples. We further study the impact of\nquery expansion and diversity-focused reranking approaches and analyze\nretriever sycophancy. Together, we lay the foundation for future studies in\nretrieval diversity handling complex queries.\n","authors":["Hung-Ting Chen","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2409.18110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18024v1","updated":"2024-09-26T16:32:10Z","published":"2024-09-26T16:32:10Z","title":"Report on the Workshop on Simulations for Information Access (Sim4IA\n 2024) at SIGIR 2024","summary":" This paper is a report of the Workshop on Simulations for Information Access\n(Sim4IA) workshop at SIGIR 2024. The workshop had two keynotes, a panel\ndiscussion, nine lightning talks, and two breakout sessions. Key takeaways were\nuser simulation's importance in academia and industry, the possible bridging of\nonline and offline evaluation, and the issues of organizing a companion shared\ntask around user simulations for information access. We report on how we\norganized the workshop, provide a brief overview of what happened at the\nworkshop, and summarize the main topics and findings of the workshop and future\nwork.\n","authors":["Timo Breuer","Christin Katharina Kreutz","Norbert Fuhr","Krisztian Balog","Philipp Schaer","Nolwenn Bernard","Ingo Frommholz","Marcel Gohsen","Kaixin Ji","Gareth J. F. Jones","Jüri Keller","Jiqun Liu","Martin Mladenov","Gabriella Pasi","Johanne Trippas","Xi Wang","Saber Zerhoudi","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.18024v1.pdf","comment":"Preprint of a SIGIR Forum submission for Vol. 58 No. 2 - December\n 2024"},{"id":"http://arxiv.org/abs/2409.18003v1","updated":"2024-09-26T16:12:33Z","published":"2024-09-26T16:12:33Z","title":"Enhancing Tourism Recommender Systems for Sustainable City Trips Using\n Retrieval-Augmented Generation","summary":" Tourism Recommender Systems (TRS) have traditionally focused on providing\npersonalized travel suggestions, often prioritizing user preferences without\nconsidering broader sustainability goals. Integrating sustainability into TRS\nhas become essential with the increasing need to balance environmental impact,\nlocal community interests, and visitor satisfaction. This paper proposes a\nnovel approach to enhancing TRS for sustainable city trips using Large Language\nModels (LLMs) and a modified Retrieval-Augmented Generation (RAG) pipeline. We\nenhance the traditional RAG system by incorporating a sustainability metric\nbased on a city's popularity and seasonal demand during the prompt augmentation\nphase. This modification, called Sustainability Augmented Reranking (SAR),\nensures the system's recommendations align with sustainability goals.\nEvaluations using popular open-source LLMs, such as Llama-3.1-Instruct-8B and\nMistral-Instruct-7B, demonstrate that the SAR-enhanced approach consistently\nmatches or outperforms the baseline (without SAR) across most metrics,\nhighlighting the benefits of incorporating sustainability into TRS.\n","authors":["Ashmi Banerjee","Adithi Satish","Wolfgang Wörndl"],"pdf_url":"https://arxiv.org/pdf/2409.18003v1.pdf","comment":"Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM\n Conference on Recommender Systems (RecSys 2024)"},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17769v1","updated":"2024-09-26T12:07:46Z","published":"2024-09-26T12:07:46Z","title":"Value Identification in Multistakeholder Recommender Systems for\n Humanities and Historical Research: The Case of the Digital Archive\n Monasterium.net","summary":" Recommender systems remain underutilized in humanities and historical\nresearch, despite their potential to enhance the discovery of cultural records.\nThis paper offers an initial value identification of the multiple stakeholders\nthat might be impacted by recommendations in Monasterium.net, a digital archive\nfor historical legal documents. Specifically, we discuss the diverse values and\nobjectives of its stakeholders, such as editors, aggregators, platform owners,\nresearchers, publishers, and funding agencies. These in-depth insights into the\npotentially conflicting values of stakeholder groups allow designing and\nadapting recommender systems to enhance their usefulness for humanities and\nhistorical research. Additionally, our findings will support deeper engagement\nwith additional stakeholders to refine value models and evaluation metrics for\nrecommender systems in the given domains. Our conclusions are embedded in and\napplicable to other digital archives and a broader cultural heritage context.\n","authors":["Florian Atzenhofer-Baumgartner","Bernhard C. Geiger","Georg Vogeler","Dominik Kowald"],"pdf_url":"https://arxiv.org/pdf/2409.17769v1.pdf","comment":"To be presented at: NORMalize 2024: The Second Workshop on the\n Normative Design and Evaluation of Recommender Systems, October 18, 2024,\n co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024),\n Bari, Italy"},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17730v1","updated":"2024-09-26T11:00:19Z","published":"2024-09-26T11:00:19Z","title":"Autoregressive Generation Strategies for Top-K Sequential\n Recommendations","summary":" The goal of modern sequential recommender systems is often formulated in\nterms of next-item prediction. In this paper, we explore the applicability of\ngenerative transformer-based models for the Top-K sequential recommendation\ntask, where the goal is to predict items a user is likely to interact with in\nthe \"near future\".\n We explore commonly used autoregressive generation strategies, including\ngreedy decoding, beam search, and temperature sampling, to evaluate their\nperformance for the Top-K sequential recommendation task. In addition, we\npropose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA)\ngeneration strategies based on multi-sequence generation with temperature\nsampling and subsequent aggregation.\n Experiments on diverse datasets give valuable insights regarding commonly\nused strategies' applicability and show that suggested approaches improve\nperformance on longer time horizons compared to widely-used Top-K prediction\napproach and single-sequence autoregressive generation strategies.\n","authors":["Anna Volodkevich","Danil Gusak","Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2409.17730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17711v1","updated":"2024-09-26T10:27:19Z","published":"2024-09-26T10:27:19Z","title":"Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation","summary":" News recommendation is a challenging task that involves personalization based\non the interaction history and preferences of each user. Recent works have\nleveraged the power of pretrained language models (PLMs) to directly rank news\nitems by using inference approaches that predominately fall into three\ncategories: pointwise, pairwise, and listwise learning-to-rank. While pointwise\nmethods offer linear inference complexity, they fail to capture crucial\ncomparative information between items that is more effective for ranking tasks.\nConversely, pairwise and listwise approaches excel at incorporating these\ncomparisons but suffer from practical limitations: pairwise approaches are\neither computationally expensive or lack theoretical guarantees, and listwise\nmethods often perform poorly in practice. In this paper, we propose a novel\nframework for PLM-based news recommendation that integrates both pointwise\nrelevance prediction and pairwise comparisons in a scalable manner. We present\na rigorous theoretical analysis of our framework, establishing conditions under\nwhich our approach guarantees improved performance. Extensive experiments show\nthat our approach outperforms the state-of-the-art methods on the MIND and\nAdressa news recommendation datasets.\n","authors":["Nithish Kannen","Yao Ma","Gerrit J. J. van den Burg","Jean Baptiste Faddoul"],"pdf_url":"https://arxiv.org/pdf/2409.17711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15953v2","updated":"2024-09-26T10:22:34Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing sequences of interactions between users and items, sequential\nrecommendation models can learn user intent and make predictions about the next\nitem. Next to item interactions, most systems also have interactions with what\nwe call non-item pages: these pages are not related to specific items but still\ncan provide insights of the user's interests, as, for example, navigation\npages.\n We therefore propose a general way to include these non-item pages in\nsequential recommendation models to enhance next-item prediction. First, we\ndemonstrate the influence of non-item pages on following interactions with the\nhypotheses testing framework HypTrails and propose methods for representing\nnon-item pages in sequential recommendation models. Subsequently, we adapt\npopular sequential recommender models to integrate non-item pages and\ninvestigate their performance with different item representation strategies as\nwell as their ability to handle noisy data. To show the general capabilities of\nthe models to integrate non-item pages, we create a synthetic dataset for a\ncontrolled setting and then evaluate the improvements from including non-item\npages on two real-world datasets.\n Our results show that non-item pages are a valuable source of information,\nand incorporating them in sequential recommendation models increases the\nperformance of next-item prediction across all analyzed model architectures.\n","authors":["Elisabeth Fischer","Albin Zehe","Andreas Hotho","Daniel Schlör"],"pdf_url":"https://arxiv.org/pdf/2408.15953v2.pdf","comment":"37 pages, 19 figures; Submitted to ACM TORS"},{"id":"http://arxiv.org/abs/2409.17580v1","updated":"2024-09-26T06:53:29Z","published":"2024-09-26T06:53:29Z","title":"Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case\n Study","summary":" Extracting meaningful insights from large and complex datasets poses\nsignificant challenges, particularly in ensuring the accuracy and relevance of\nretrieved information. Traditional data retrieval methods such as sequential\nsearch and index-based retrieval often fail when handling intricate and\ninterconnected data structures, resulting in incomplete or misleading outputs.\nTo overcome these limitations, we introduce Structured-GraphRAG, a versatile\nframework designed to enhance information retrieval across structured datasets\nin natural language queries. Structured-GraphRAG utilizes multiple knowledge\ngraphs, which represent data in a structured format and capture complex\nrelationships between entities, enabling a more nuanced and comprehensive\nretrieval of information. This graph-based approach reduces the risk of errors\nin language model outputs by grounding responses in a structured format,\nthereby enhancing the reliability of results. We demonstrate the effectiveness\nof Structured-GraphRAG by comparing its performance with that of a recently\npublished method using traditional retrieval-augmented generation. Our findings\nshow that Structured-GraphRAG significantly improves query processing\nefficiency and reduces response times. While our case study focuses on soccer\ndata, the framework's design is broadly applicable, offering a powerful tool\nfor data analysis and enhancing language model applications across various\nstructured domains.\n","authors":["Zahra Sepasdar","Sushant Gautam","Cise Midoglu","Michael A. Riegler","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2409.17580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05013v2","updated":"2024-09-26T06:19:34Z","published":"2024-06-07T15:23:53Z","title":"CHIQ: Contextual History Enhancement for Improving Query Rewriting in\n Conversational Search","summary":" In this paper, we study how open-source large language models (LLMs) can be\neffectively deployed for improving query rewriting in conversational search,\nespecially for ambiguous queries. We introduce CHIQ, a two-step method that\nleverages the capabilities of LLMs to resolve ambiguities in the conversation\nhistory before query rewriting. This approach contrasts with prior studies that\npredominantly use closed-source LLMs to directly generate search queries from\nconversation history. We demonstrate on five well-established benchmarks that\nCHIQ leads to state-of-the-art results across most settings, showing highly\ncompetitive performances with systems leveraging closed-source LLMs. Our study\nprovides a first step towards leveraging open-source LLMs in conversational\nsearch, as a competitive alternative to the prevailing reliance on commercial\nLLMs. Data, models, and source code will be publicly available upon acceptance\nat https://github.com/fengranMark/CHIQ.\n","authors":["Fengran Mo","Abbas Ghaddar","Kelong Mao","Mehdi Rezagholizadeh","Boxing Chen","Qun Liu","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2406.05013v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.15763v2","updated":"2024-09-26T05:43:08Z","published":"2024-09-24T05:39:53Z","title":"IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through\n Semantic Comprehension in Retrieval-Augmented Generation Scenarios","summary":" In Retrieval-Augmented Generation (RAG) tasks using Large Language Models\n(LLMs), the quality of retrieved information is critical to the final output.\nThis paper introduces the IRSC benchmark for evaluating the performance of\nembedding models in multilingual RAG tasks. The benchmark encompasses five\nretrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval,\nkeyword retrieval, and summary retrieval. Our research addresses the current\nlack of comprehensive testing and effective comparison methods for embedding\nmodels in RAG scenarios. We introduced new metrics: the Similarity of Semantic\nComprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI),\nand evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our\ncontributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and\n3) insights into the cross-lingual limitations of embedding models. The IRSC\nbenchmark aims to enhance the understanding and development of accurate\nretrieval systems in RAG tasks. All code and datasets are available at:\nhttps://github.com/Jasaxion/IRSC_Benchmark\n","authors":["Hai Lin","Shaoxiong Zhan","Junyou Su","Haitao Zheng","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10743v4","updated":"2024-09-26T03:38:59Z","published":"2023-12-17T15:28:06Z","title":"A Unified Framework for Multi-Domain CTR Prediction via Large Language\n Models","summary":" Click-Through Rate (CTR) prediction is a crucial task in online\nrecommendation platforms as it involves estimating the probability of user\nengagement with advertisements or items by clicking on them. Given the\navailability of various services like online shopping, ride-sharing, food\ndelivery, and professional services on commercial platforms, recommendation\nsystems in these platforms are required to make CTR predictions across multiple\ndomains rather than just a single domain. However, multi-domain click-through\nrate (MDCTR) prediction remains a challenging task in online recommendation due\nto the complex mutual influence between domains. Traditional MDCTR models\ntypically encode domains as discrete identifiers, ignoring rich semantic\ninformation underlying. Consequently, they can hardly generalize to new\ndomains. Besides, existing models can be easily dominated by some specific\ndomains, which results in significant performance drops in the other domains\n(i.e. the \"seesaw phenomenon\"). In this paper, we propose a novel solution\nUni-CTR to address the above challenges. Uni-CTR leverages a backbone Large\nLanguage Model (LLM) to learn layer-wise semantic representations that capture\ncommonalities between domains. Uni-CTR also uses several domain-specific\nnetworks to capture the characteristics of each domain. Note that we design a\nmasked loss strategy so that these domain-specific networks are decoupled from\nbackbone LLM. This allows domain-specific networks to remain unchanged when\nincorporating new or removing domains, thereby enhancing the flexibility and\nscalability of the system significantly. Experimental results on three public\ndatasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models\nsignificantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in\nzero-shot prediction. We have applied Uni-CTR in industrial scenarios,\nconfirming its efficiency.\n","authors":["Zichuan Fu","Xiangyang Li","Chuhan Wu","Yichao Wang","Kuicai Dong","Xiangyu Zhao","Mengchen Zhao","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2312.10743v4.pdf","comment":"Accept By ACM TRANSACTIONS ON INFORMATION SYSTEMS(TOIS)"},{"id":"http://arxiv.org/abs/2409.17476v1","updated":"2024-09-26T02:24:03Z","published":"2024-09-26T02:24:03Z","title":"Improving the Shortest Plank: Vulnerability-Aware Adversarial Training\n for Robust Recommender System","summary":" Recommender systems play a pivotal role in mitigating information overload in\nvarious fields. Nonetheless, the inherent openness of these systems introduces\nvulnerabilities, allowing attackers to insert fake users into the system's\ntraining data to skew the exposure of certain items, known as poisoning\nattacks. Adversarial training has emerged as a notable defense mechanism\nagainst such poisoning attacks within recommender systems. Existing adversarial\ntraining methods apply perturbations of the same magnitude across all users to\nenhance system robustness against attacks. Yet, in reality, we find that\nattacks often affect only a subset of users who are vulnerable. These\nperturbations of indiscriminate magnitude make it difficult to balance\neffective protection for vulnerable users without degrading recommendation\nquality for those who are not affected. To address this issue, our research\ndelves into understanding user vulnerability. Considering that poisoning\nattacks pollute the training data, we note that the higher degree to which a\nrecommender system fits users' training data correlates with an increased\nlikelihood of users incorporating attack information, indicating their\nvulnerability. Leveraging these insights, we introduce the Vulnerability-aware\nAdversarial Training (VAT), designed to defend against poisoning attacks in\nrecommender systems. VAT employs a novel vulnerability-aware function to\nestimate users' vulnerability based on the degree to which the system fits\nthem. Guided by this estimation, VAT applies perturbations of adaptive\nmagnitude to each user, not only reducing the success ratio of attacks but also\npreserving, and potentially enhancing, the quality of recommendations.\nComprehensive experiments confirm VAT's superior defensive capabilities across\ndifferent recommendation models and against various types of attacks.\n","authors":["Kaike Zhang","Qi Cao","Yunfan Wu","Fei Sun","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2409.17476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17460v1","updated":"2024-09-26T01:38:05Z","published":"2024-09-26T01:38:05Z","title":"Towards More Relevant Product Search Ranking Via Large Language Models:\n An Empirical Study","summary":" Training Learning-to-Rank models for e-commerce product search ranking can be\nchallenging due to the lack of a gold standard of ranking relevance. In this\npaper, we decompose ranking relevance into content-based and engagement-based\naspects, and we propose to leverage Large Language Models (LLMs) for both label\nand feature generation in model training, primarily aiming to improve the\nmodel's predictive capability for content-based relevance. Additionally, we\nintroduce different sigmoid transformations on the LLM outputs to polarize\nrelevance scores in labeling, enhancing the model's ability to balance\ncontent-based and engagement-based relevances and thus prioritize highly\nrelevant items overall. Comprehensive online tests and offline evaluations are\nalso conducted for the proposed design. Our work sheds light on advanced\nstrategies for integrating LLMs into e-commerce product search ranking model\ntraining, offering a pathway to more effective and balanced models with\nimproved ranking relevance.\n","authors":["Qi Liu","Atul Singh","Jingbo Liu","Cun Mu","Zheng Yan"],"pdf_url":"https://arxiv.org/pdf/2409.17460v1.pdf","comment":"To be published in CIKM 2024 GenAIECommerce Workshop"},{"id":"http://arxiv.org/abs/2409.17456v1","updated":"2024-09-26T01:18:29Z","published":"2024-09-26T01:18:29Z","title":"Long or Short or Both? An Exploration on Lookback Time Windows of\n Behavioral Features in Product Search Ranking","summary":" Customer shopping behavioral features are core to product search ranking\nmodels in eCommerce. In this paper, we investigate the effect of lookback time\nwindows when aggregating these features at the (query, product) level over\nhistory. By studying the pros and cons of using long and short time windows, we\npropose a novel approach to integrating these historical behavioral features of\ndifferent time windows. In particular, we address the criticality of using\nquery-level vertical signals in ranking models to effectively aggregate all\ninformation from different behavioral features. Anecdotal evidence for the\nproposed approach is also provided using live product search traffic on\nWalmart.com.\n","authors":["Qi Liu","Atul Singh","Jingbo Liu","Cun Mu","Zheng Yan","Jan Pedersen"],"pdf_url":"https://arxiv.org/pdf/2409.17456v1.pdf","comment":"Published in ACM SIGIR Workshop on eCommerce 2024"},{"id":"http://arxiv.org/abs/2409.17436v1","updated":"2024-09-26T00:08:46Z","published":"2024-09-26T00:08:46Z","title":"Minimizing Live Experiments in Recommender Systems: User Simulation to\n Evaluate Preference Elicitation Policies","summary":" Evaluation of policies in recommender systems typically involves A/B testing\nusing live experiments on real users to assess a new policy's impact on\nrelevant metrics. This ``gold standard'' comes at a high cost, however, in\nterms of cycle time, user cost, and potential user retention. In developing\npolicies for ``onboarding'' new users, these costs can be especially\nproblematic, since on-boarding occurs only once. In this work, we describe a\nsimulation methodology used to augment (and reduce) the use of live\nexperiments. We illustrate its deployment for the evaluation of ``preference\nelicitation'' algorithms used to onboard new users of the YouTube Music\nplatform. By developing counterfactually robust user behavior models, and a\nsimulation service that couples such models with production infrastructure, we\nare able to test new algorithms in a way that reliably predicts their\nperformance on key metrics when deployed live. We describe our domain, our\nsimulation models and platform, results of experiments and deployment, and\nsuggest future steps needed to further realistic simulation as a powerful\ncomplement to live experiments.\n","authors":["Chih-Wei Hsu","Martin Mladenov","Ofer Meshi","James Pine","Hubert Pham","Shane Li","Xujian Liang","Anton Polishko","Li Yang","Ben Scheetz","Craig Boutilier"],"pdf_url":"https://arxiv.org/pdf/2409.17436v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2310.13387v2","updated":"2024-09-26T17:55:48Z","published":"2023-10-20T09:56:07Z","title":"Assumption violations in causal discovery and the robustness of score\n matching","summary":" When domain knowledge is limited and experimentation is restricted by\nethical, financial, or time constraints, practitioners turn to observational\ncausal discovery methods to recover the causal structure, exploiting the\nstatistical properties of their data. Because causal discovery without further\nassumptions is an ill-posed problem, each algorithm comes with its own set of\nusually untestable assumptions, some of which are hard to meet in real\ndatasets. Motivated by these considerations, this paper extensively benchmarks\nthe empirical performance of recent causal discovery methods on observational\ni.i.d. data generated under different background conditions, allowing for\nviolations of the critical assumptions required by each selected approach. Our\nexperimental findings show that score matching-based methods demonstrate\nsurprising performance in the false positive and false negative rate of the\ninferred graph in these challenging scenarios, and we provide theoretical\ninsights into their performance. This work is also the first effort to\nbenchmark the stability of causal discovery algorithms with respect to the\nvalues of their hyperparameters. Finally, we hope this paper will set a new\nstandard for the evaluation of causal discovery methods and can serve as an\naccessible entry point for practitioners interested in the field, highlighting\nthe empirical implications of different algorithm choices.\n","authors":["Francesco Montagna","Atalanti A. Mastakouri","Elias Eulig","Nicoletta Noceti","Lorenzo Rosasco","Dominik Janzing","Bryon Aragam","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2310.13387v2.pdf","comment":"37th Conference on Neural Information Processing Systems (NeurIPS\n 2023)"},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18102v1","updated":"2024-09-26T17:45:10Z","published":"2024-09-26T17:45:10Z","title":"MALPOLON: A Framework for Deep Species Distribution Modeling","summary":" This paper describes a deep-SDM framework, MALPOLON. Written in Python and\nbuilt upon the PyTorch library, this framework aims to facilitate training and\ninferences of deep species distribution models (deep-SDM) and sharing for users\nwith only general Python language skills (e.g., modeling ecologists) who are\ninterested in testing deep learning approaches to build new SDMs. More advanced\nusers can also benefit from the framework's modularity to run more specific\nexperiments by overriding existing classes while taking advantage of\npress-button examples to train neural networks on multiple classification tasks\nusing custom or provided raw and pre-processed datasets. The framework is\nopen-sourced on GitHub and PyPi along with extensive documentation and examples\nof use in various scenarios. MALPOLON offers straightforward installation,\nYAML-based configuration, parallel computing, multi-GPU utilization, baseline\nand foundational models for benchmarking, and extensive\ntutorials/documentation, aiming to enhance accessibility and performance\nscalability for ecologists and researchers.\n","authors":["Theo Larcher","Lukas Picek","Benjamin Deneu","Titouan Lorieul","Maximilien Servajean","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2409.18102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18100v1","updated":"2024-09-26T17:44:29Z","published":"2024-09-26T17:44:29Z","title":"Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine\n Segmentation","summary":" Self-supervised pretraining (SSP) has shown promising results in learning\nfrom large unlabeled datasets and, thus, could be useful for automated\ncardiovascular magnetic resonance (CMR) short-axis cine segmentation. However,\ninconsistent reports of the benefits of SSP for segmentation have made it\ndifficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP\nmethods for CMR cine segmentation.\n To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were\nused for unlabeled pretraining with four SSP methods; SimCLR, positional\ncontrastive learning, DINO, and masked image modeling (MIM). Subsets of varying\nnumbers of subjects were used for supervised fine-tuning of 2D models for each\nSSP method, as well as to train a 2D baseline model from scratch. The\nfine-tuned models were compared to the baseline using the 3D Dice similarity\ncoefficient (DSC) in a test dataset of 140 subjects.\n The SSP methods showed no performance gains with the largest supervised\nfine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects\n(231 2D slices) are available for supervised training, SSP using MIM (DSC =\n0.86) improves over training from scratch (DSC = 0.82).\n This study found that SSP is valuable for CMR cine segmentation when labeled\ntraining data is scarce, but does not aid state-of-the-art deep learning\nmethods when ample labeled data is available. Moreover, the choice of SSP\nmethod is important. The code is publicly available at:\nhttps://github.com/q-cardIA/ssp-cmr-cine-segmentation\n","authors":["Rob A. J. de Mooij","Josien P. W. Pluim","Cian M. Scannell"],"pdf_url":"https://arxiv.org/pdf/2409.18100v1.pdf","comment":"Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at\n MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.04406v2","updated":"2024-09-26T17:38:26Z","published":"2024-09-06T16:56:06Z","title":"Quantum Kernel Methods under Scrutiny: A Benchmarking Study","summary":" Since the entry of kernel theory in the field of quantum machine learning,\nquantum kernel methods (QKMs) have gained increasing attention with regard to\nboth probing promising applications and delivering intriguing research\ninsights. Two common approaches for computing the underlying Gram matrix have\nemerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs).\nBenchmarking these methods is crucial to gain robust insights and to understand\ntheir practical utility. In this work, we present a comprehensive large-scale\nstudy examining QKMs based on FQKs and PQKs across a manifold of design\nchoices. Our investigation encompasses both classification and regression tasks\nfor five dataset families and 64 datasets, systematically comparing the use of\nFQKs and PQKs quantum support vector machines and kernel ridge regression. This\nresulted in over 20,000 models that were trained and optimized using a\nstate-of-the-art hyperparameter search to ensure robust and comprehensive\ninsights. We delve into the importance of hyperparameters on model performance\nscores and support our findings through rigorous correlation analyses. In this,\nwe also closely inspect two data encoding strategies. Moreover, we provide an\nin-depth analysis addressing the design freedom of PQKs and explore the\nunderlying principles responsible for learning. Our goal is not to identify the\nbest-performing model for a specific task but to uncover the mechanisms that\nlead to effective QKMs and reveal universal patterns.\n","authors":["Jan Schnabel","Marco Roth"],"pdf_url":"https://arxiv.org/pdf/2409.04406v2.pdf","comment":"18 pages main text including 12 figures and 1 table, appendix 14\n pages with 19 figures and 1 table; restructure result section and prune\n appendix"},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18061v1","updated":"2024-09-26T17:01:41Z","published":"2024-09-26T17:01:41Z","title":"Optimal Protocols for Continual Learning via Statistical Physics and\n Control Theory","summary":" Artificial neural networks often struggle with catastrophic forgetting when\nlearning multiple tasks sequentially, as training on new tasks degrades the\nperformance on previously learned ones. Recent theoretical work has addressed\nthis issue by analysing learning curves in synthetic frameworks under\npredefined training protocols. However, these protocols relied on heuristics\nand lacked a solid theoretical foundation assessing their optimality. In this\npaper, we fill this gap combining exact equations for training dynamics,\nderived using statistical physics techniques, with optimal control methods. We\napply this approach to teacher-student models for continual learning and\nmulti-task problems, obtaining a theory for task-selection protocols maximising\nperformance while minimising forgetting. Our theoretical analysis offers\nnon-trivial yet interpretable strategies for mitigating catastrophic\nforgetting, shedding light on how optimal learning protocols can modulate\nestablished effects, such as the influence of task similarity on forgetting.\nFinally, we validate our theoretical findings on real-world data.\n","authors":["Francesco Mori","Stefano Sarao Mannelli","Francesca Mignacco"],"pdf_url":"https://arxiv.org/pdf/2409.18061v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.18051v1","updated":"2024-09-26T16:55:31Z","published":"2024-09-26T16:55:31Z","title":"Inverse Reinforcement Learning with Multiple Planning Horizons","summary":" In this work, we study an inverse reinforcement learning (IRL) problem where\nthe experts are planning under a shared reward function but with different,\nunknown planning horizons. Without the knowledge of discount factors, the\nreward function has a larger feasible solution set, which makes it harder for\nexisting IRL approaches to identify a reward function. To overcome this\nchallenge, we develop algorithms that can learn a global multi-agent reward\nfunction with agent-specific discount factors that reconstruct the expert\npolicies. We characterize the feasible solution space of the reward function\nand discount factors for both algorithms and demonstrate the generalizability\nof the learned reward function across multiple domains.\n","authors":["Jiayu Yao","Weiwei Pan","Finale Doshi-Velez","Barbara E Engelhardt"],"pdf_url":"https://arxiv.org/pdf/2409.18051v1.pdf","comment":"Accepted at RLC 2024"},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2408.11974v2","updated":"2024-09-26T16:48:34Z","published":"2024-08-21T20:14:54Z","title":"Two-Timescale Gradient Descent Ascent Algorithms for Nonconvex Minimax\n Optimization","summary":" We provide a unified analysis of two-timescale gradient descent ascent\n(TTGDA) for solving structured nonconvex minimax optimization problems in the\nform of $\\min_\\textbf{x} \\max_{\\textbf{y} \\in Y} f(\\textbf{x}, \\textbf{y})$,\nwhere the objective function $f(\\textbf{x}, \\textbf{y})$ is nonconvex in\n$\\textbf{x}$ and concave in $\\textbf{y}$, and the constraint set $Y \\subseteq\n\\mathbb{R}^n$ is convex and bounded. In the convex-concave setting, the\nsingle-timescale gradient descent ascent (GDA) algorithm is widely used in\napplications and has been shown to have strong convergence guarantees. In more\ngeneral settings, however, it can fail to converge. Our contribution is to\ndesign TTGDA algorithms that are effective beyond the convex-concave setting,\nefficiently finding a stationary point of the function $\\Phi(\\cdot) :=\n\\max_{\\textbf{y} \\in Y} f(\\cdot, \\textbf{y})$. We also establish theoretical\nbounds on the complexity of solving both smooth and nonsmooth nonconvex-concave\nminimax optimization problems. To the best of our knowledge, this is the first\nsystematic analysis of TTGDA for nonconvex minimax optimization, shedding light\non its superior performance in training generative adversarial networks (GANs)\nand in other real-world application problems.\n","authors":["Tianyi Lin","Chi Jin","Michael. I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2408.11974v2.pdf","comment":"A preliminary version [arXiv:1906.00331] of this paper, with a subset\n of the results that are presented here, was presented at ICML 2020; 44 Pages,\n 10 Figures"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.16626v2","updated":"2024-09-26T16:41:27Z","published":"2024-09-25T05:11:58Z","title":"Ascend HiFloat8 Format for Deep Learning","summary":" This preliminary white paper proposes a novel 8-bit floating-point data\nformat HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered\nprecision. For normal value encoding, it provides 7 exponent values with 3-bit\nmantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with\n1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7\nextra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades).\nMeanwhile, HiF8 encodes all the special values except that positive zero and\nnegative zero are represented by only one bit-pattern. Thanks to the better\nbalance between precision and dynamic range, HiF8 can be simultaneously used in\nboth forward and backward passes of AI training. In this paper, we will\ndescribe the definition and rounding methods of HiF8, as well as the tentative\ntraining and inference solutions. To demonstrate the efficacy of HiF8, massive\nsimulation results on various neural networks, including traditional neural\nnetworks and large language models (LLMs), will also be presented.\n","authors":["Yuanyong Luo","Zhongxing Zhang","Richard Wu","Hu Liu","Ying Jin","Kai Zheng","Minmin Wang","Zhanying He","Guipeng Hu","Luyao Chen","Tianchi Hu","Junsong Wang","Minqi Chen","Mikhaylov Dmitry","Korviakov Vladimir","Bobrin Maxim","Yuhao Hu","Guanfu Chen","Zeyi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.16626v2.pdf","comment":"13 Pages, 4 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2409.18032v1","updated":"2024-09-26T16:38:48Z","published":"2024-09-26T16:38:48Z","title":"FlowBench: A Large Scale Benchmark for Flow Simulation over Complex\n Geometries","summary":" Simulating fluid flow around arbitrary shapes is key to solving various\nengineering problems. However, simulating flow physics across complex\ngeometries remains numerically challenging and computationally\nresource-intensive, particularly when using conventional PDE solvers. Machine\nlearning methods offer attractive opportunities to create fast and adaptable\nPDE solvers. However, benchmark datasets to measure the performance of such\nmethods are scarce, especially for flow physics across complex geometries. We\nintroduce FlowBench, a dataset for neural simulators with over 10K samples,\nwhich is currently larger than any publicly available flow physics dataset.\nFlowBench contains flow simulation data across complex geometries\n(\\textit{parametric vs. non-parametric}), spanning a range of flow conditions\n(\\textit{Reynolds number and Grashoff number}), capturing a diverse array of\nflow phenomena (\\textit{steady vs. transient; forced vs. free convection}), and\nfor both 2D and 3D. FlowBench contains over 10K data samples, with each sample\nthe outcome of a fully resolved, direct numerical simulation using a\nwell-validated simulator framework designed for modeling transport phenomena in\ncomplex geometries. For each sample, we include velocity, pressure, and\ntemperature field data at 3 different resolutions and several summary\nstatistics features of engineering relevance (such as coefficients of lift and\ndrag, and Nusselt numbers). %Additionally, we include masks and signed distance\nfields for each shape. We envision that FlowBench will enable evaluating the\ninterplay between complex geometry, coupled flow phenomena, and data\nsufficiency on the performance of current, and future, neural PDE solvers. We\nenumerate several evaluation metrics to help rank order the performance of\nneural PDE solvers. We benchmark the performance of several baseline methods\nincluding FNO, CNO, WNO, and DeepONet.\n","authors":["Ronak Tali","Ali Rabeh","Cheng-Hau Yang","Mehdi Shadkhah","Samundra Karki","Abhisek Upadhyaya","Suriya Dhakshinamoorthy","Marjan Saadati","Soumik Sarkar","Adarsh Krishnamurthy","Chinmay Hegde","Aditya Balu","Baskar Ganapathysubramanian"],"pdf_url":"https://arxiv.org/pdf/2409.18032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15126v5","updated":"2024-09-26T16:38:32Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) is crucial in various fields such as materials\nscience, chemistry, and pharmacology to name a few. Conventional MD software\nstruggles with the balance between time cost and prediction accuracy, which\nrestricts its wider application. Recently, data-driven approaches based on deep\ngenerative models have been devised for time-coarsened dynamics, which aim at\nlearning dynamics of diverse molecular systems over a long timestep, enjoying\nboth universality and efficiency. Nevertheless, most current methods are\ndesigned solely to learn from the data distribution regardless of the\nunderlying Boltzmann distribution, and the physics priors such as energies and\nforces are constantly overlooked. In this work, we propose a conditional\ngenerative model called Force-guided Bridge Matching (FBM), which learns\nfull-atom time-coarsened dynamics and targets the Boltzmann-constrained\ndistribution. With the guidance of our delicately-designed intermediate force\nfield, FBM leverages favourable physics priors into the generation process,\ngiving rise to enhanced simulations. Experiments on two datasets consisting of\npeptides verify our superiority in terms of comprehensive metrics and\ndemonstrate transferability to unseen systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18013v1","updated":"2024-09-26T16:22:08Z","published":"2024-09-26T16:22:08Z","title":"Spatiotemporal Learning on Cell-embedded Graphs","summary":" Data-driven simulation of physical systems has recently kindled significant\nattention, where many neural models have been developed. In particular,\nmesh-based graph neural networks (GNNs) have demonstrated significant potential\nin predicting spatiotemporal dynamics across arbitrary geometric domains.\nHowever, the existing node-edge message passing mechanism in GNNs limits the\nmodel's representation learning ability. In this paper, we proposed a\ncell-embedded GNN model (aka CeGNN) to learn spatiotemporal dynamics with\nlifted performance. Specifically, we introduce a learnable cell attribution to\nthe node-edge message passing process, which better captures the spatial\ndependency of regional features. Such a strategy essentially upgrades the local\naggregation scheme from the first order (e.g., from edge to node) to a higher\norder (e.g., from volume to edge and then to node), which takes advantage of\nvolumetric information in message passing. Meanwhile, a novel feature-enhanced\nblock is designed to further improve the performance of CeGNN and relieve the\nover-smoothness problem, via treating the latent features as basis functions.\nThe extensive experiments on various PDE systems and one real-world dataset\ndemonstrate that CeGNN achieves superior performance compared with other\nbaseline models, particularly reducing the prediction error with up to 1 orders\nof magnitude on several PDE systems.\n","authors":["Yuan Mi","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2409.18013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18000v1","updated":"2024-09-26T16:09:19Z","published":"2024-09-26T16:09:19Z","title":"Safe Time-Varying Optimization based on Gaussian Processes with\n Spatio-Temporal Kernel","summary":" Ensuring safety is a key aspect in sequential decision making problems, such\nas robotics or process control. The complexity of the underlying systems often\nmakes finding the optimal decision challenging, especially when the\nsafety-critical system is time-varying. Overcoming the problem of optimizing an\nunknown time-varying reward subject to unknown time-varying safety constraints,\nwe propose TVSafeOpt, a new algorithm built on Bayesian optimization with a\nspatio-temporal kernel. The algorithm is capable of safely tracking a\ntime-varying safe region without the need for explicit change detection.\nOptimality guarantees are also provided for the algorithm when the optimization\nproblem becomes stationary. We show that TVSafeOpt compares favorably against\nSafeOpt on synthetic data, both regarding safety and optimality. Evaluation on\na realistic case study with gas compressors confirms that TVSafeOpt ensures\nsafety when solving time-varying optimization problems with unknown reward and\nsafety functions.\n","authors":["Jialin Li","Marta Zagorowska","Giulia De Pasquale","Alisa Rupenyan","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2409.18000v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17996v1","updated":"2024-09-26T16:07:24Z","published":"2024-09-26T16:07:24Z","title":"PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless\n Imaging","summary":" Lensless cameras offer significant advantages in size, weight, and cost\ncompared to traditional lens-based systems. Without a focusing lens, lensless\ncameras rely on computational algorithms to recover the scenes from multiplexed\nmeasurements. However, current algorithms struggle with inaccurate forward\nimaging models and insufficient priors to reconstruct high-quality images. To\novercome these limitations, we introduce a novel two-stage approach for\nconsistent and photorealistic lensless image reconstruction. The first stage of\nour approach ensures data consistency by focusing on accurately reconstructing\nthe low-frequency content with a spatially varying deconvolution method that\nadjusts to changes in the Point Spread Function (PSF) across the camera's field\nof view. The second stage enhances photorealism by incorporating a generative\nprior from pre-trained diffusion models. By conditioning on the low-frequency\ncontent retrieved in the first stage, the diffusion model effectively\nreconstructs the high-frequency details that are typically lost in the lensless\nimaging process, while also maintaining image fidelity. Our method achieves a\nsuperior balance between data fidelity and visual quality compared to existing\nmethods, as demonstrated with two popular lensless systems, PhlatCam and\nDiffuserCam. Project website: https://phocolens.github.io/.\n","authors":["Xin Cai","Zhiyuan You","Hailong Zhang","Wentao Liu","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.17996v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17995v1","updated":"2024-09-26T16:07:20Z","published":"2024-09-26T16:07:20Z","title":"Joint Localization and Planning using Diffusion","summary":" Diffusion models have been successfully applied to robotics problems such as\nmanipulation and vehicle path planning. In this work, we explore their\napplication to end-to-end navigation -- including both perception and planning\n-- by considering the problem of jointly performing global localization and\npath planning in known but arbitrary 2D environments. In particular, we\nintroduce a diffusion model which produces collision-free paths in a global\nreference frame given an egocentric LIDAR scan, an arbitrary map, and a desired\ngoal position. To this end, we implement diffusion in the space of paths in\nSE(2), and describe how to condition the denoising process on both obstacles\nand sensor observations. In our evaluation, we show that the proposed\nconditioning techniques enable generalization to realistic maps of considerably\ndifferent appearance than the training environment, demonstrate our model's\nability to accurately describe ambiguous solutions, and run extensive\nsimulation experiments showcasing our model's use as a real-time, end-to-end\nlocalization and planning stack.\n","authors":["L. Lao Beyer","S. Karaman"],"pdf_url":"https://arxiv.org/pdf/2409.17995v1.pdf","comment":"7 pages, 9 figures. Submitted to ICRA 2025, under review"},{"id":"http://arxiv.org/abs/2405.15618v2","updated":"2024-09-26T16:05:30Z","published":"2024-05-24T15:04:36Z","title":"MLPs Learn In-Context on Regression and Classification Tasks","summary":" In-context learning (ICL), the remarkable ability to solve a task from only\ninput exemplars, is often assumed to be a unique hallmark of Transformer\nmodels. By examining commonly employed synthetic ICL tasks, we demonstrate that\nmulti-layer perceptrons (MLPs) can also learn in-context. Moreover, MLPs, and\nthe closely related MLP-Mixer models, learn in-context competitively with\nTransformers given the same compute budget in this setting. We further show\nthat MLPs outperform Transformers on a series of classical tasks from\npsychology designed to test relational reasoning, which are closely related to\nin-context classification. These results underscore a need for studying\nin-context learning beyond attention-based architectures, while also\nchallenging strong prior arguments about MLPs' limited ability to solve\nrelational tasks. Altogether, our results highlight the unexpected competence\nof MLPs, and support the growing interest in all-MLP alternatives to\ntask-specific architectures.\n","authors":["William L. Tong","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2405.15618v2.pdf","comment":"30 pages, 10 figures, code available at\n https://github.com/wtong98/mlp-icl"},{"id":"http://arxiv.org/abs/2409.17992v1","updated":"2024-09-26T16:02:25Z","published":"2024-09-26T16:02:25Z","title":"LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged\n Robots","summary":" Reinforcement Learning (RL) has shown its remarkable and generalizable\ncapability in legged locomotion through sim-to-real transfer. However, while\nadaptive methods like domain randomization are expected to make policy more\nrobust to diverse environments, such comprehensiveness potentially detracts\nfrom the policy's performance in any specific environment according to the No\nFree Lunch theorem, leading to a suboptimal solution once deployed in the real\nworld. To address this issue, we propose a lifelong policy adaptation framework\nnamed LoopSR, which utilizes a transformer-based encoder to project real-world\ntrajectories into a latent space, and accordingly reconstruct the real-world\nenvironments back in simulation for further improvement. Autoencoder\narchitecture and contrastive learning methods are adopted to better extract the\ncharacteristics of real-world dynamics. The simulation parameters for continual\ntraining are derived by combining predicted parameters from the decoder with\nretrieved parameters from the simulation trajectory dataset. By leveraging the\ncontinual training, LoopSR achieves superior data efficiency compared with\nstrong baselines, with only a limited amount of data to yield eminent\nperformance in both sim-to-sim and sim-to-real experiments.\n","authors":["Peilin Wu","Weiji Xie","Jiahang Cao","Hang Lai","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17992v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.17991v1","updated":"2024-09-26T16:02:13Z","published":"2024-09-26T16:02:13Z","title":"Dimension-independent learning rates for high-dimensional classification\n problems","summary":" We study the problem of approximating and estimating classification functions\nthat have their decision boundary in the $RBV^2$ space. Functions of $RBV^2$\ntype arise naturally as solutions of regularized neural network learning\nproblems and neural networks can approximate these functions without the curse\nof dimensionality. We modify existing results to show that every $RBV^2$\nfunction can be approximated by a neural network with bounded weights.\nThereafter, we prove the existence of a neural network with bounded weights\napproximating a classification function. And we leverage these bounds to\nquantify the estimation rates. Finally, we present a numerical study that\nanalyzes the effect of different regularity conditions on the decision\nboundaries.\n","authors":["Andres Felipe Lerma-Pineda","Philipp Petersen","Simon Frieder","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2409.17991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17986v1","updated":"2024-09-26T15:56:40Z","published":"2024-09-26T15:56:40Z","title":"Supra-Laplacian Encoding for Transformer on Dynamic Graphs","summary":" Fully connected Graph Transformers (GT) have rapidly become prominent in the\nstatic graph community as an alternative to Message-Passing models, which\nsuffer from a lack of expressivity, oversquashing, and under-reaching. However,\nin a dynamic context, by interconnecting all nodes at multiple snapshots with\nself-attention, GT loose both structural and temporal information. In this\nwork, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs\n(SLATE), a new spatio-temporal encoding to leverage the GT architecture while\nkeeping spatio-temporal information. Specifically, we transform Discrete Time\nDynamic Graphs into multi-layer graphs and take advantage of the spectral\nproperties of their associated supra-Laplacian matrix. Our second contribution\nexplicitly model nodes' pairwise relationships with a cross-attention\nmechanism, providing an accurate edge representation for dynamic link\nprediction. SLATE outperforms numerous state-of-the-art methods based on\nMessage-Passing Graph Neural Networks combined with recurrent models (e.g\nLSTM), and Dynamic Graph Transformers, on 9 datasets. Code and instructions to\nreproduce our results will be open-sourced.\n","authors":["Yannis Karmim","Marc Lafon","Raphaël Fournier S'niehotta","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2409.17986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15244v2","updated":"2024-09-26T15:56:30Z","published":"2024-03-22T14:40:29Z","title":"A Stochastic Quasi-Newton Method for Non-convex Optimization with\n Non-uniform Smoothness","summary":" Classical convergence analyses for optimization algorithms rely on the\nwidely-adopted uniform smoothness assumption. However, recent experimental\nstudies have demonstrated that many machine learning problems exhibit\nnon-uniform smoothness, meaning the smoothness factor is a function of the\nmodel parameter instead of a universal constant. In particular, it has been\nobserved that the smoothness grows with respect to the gradient norm along the\ntraining trajectory. Motivated by this phenomenon, the recently introduced\n$(L_0, L_1)$-smoothness is a more general notion, compared to traditional\n$L$-smoothness, that captures such positive relationship between smoothness and\ngradient norm. Under this type of non-uniform smoothness, existing literature\nhas designed stochastic first-order algorithms by utilizing gradient clipping\ntechniques to obtain the optimal $\\mathcal{O}(\\epsilon^{-3})$ sample complexity\nfor finding an $\\epsilon$-approximate first-order stationary solution.\nNevertheless, the studies of quasi-Newton methods are still lacking.\nConsidering higher accuracy and more robustness for quasi-Newton methods, in\nthis paper we propose a fast stochastic quasi-Newton method when there exists\nnon-uniformity in smoothness. Leveraging gradient clipping and variance\nreduction, our algorithm can achieve the best-known\n$\\mathcal{O}(\\epsilon^{-3})$ sample complexity and enjoys convergence speedup\nwith simple hyperparameter tuning. Our numerical experiments show that our\nproposed algorithm outperforms the state-of-the-art approaches.\n","authors":["Zhenyu Sun","Ermin Wei"],"pdf_url":"https://arxiv.org/pdf/2403.15244v2.pdf","comment":"Paper accepted by CDC 2024"},{"id":"http://arxiv.org/abs/2409.17985v1","updated":"2024-09-26T15:55:59Z","published":"2024-09-26T15:55:59Z","title":"Hypergame Theory for Decentralized Resource Allocation in Multi-user\n Semantic Communications","summary":" Semantic communications (SC) is an emerging communication paradigm in which\nwireless devices can send only relevant information from a source of data while\nrelying on computing resources to regenerate missing data points. However, the\ndesign of a multi-user SC system becomes more challenging because of the\ncomputing and communication overhead required for coordination. Existing\nsolutions for learning the semantic language and performing resource allocation\noften fail to capture the computing and communication tradeoffs involved in\nmultiuser SC. To address this gap, a novel framework for decentralized\ncomputing and communication resource allocation in multiuser SC systems is\nproposed. The challenge of efficiently allocating communication and computing\nresources (for reasoning) in a decentralized manner to maximize the quality of\ntask experience for the end users is addressed through the application of\nStackelberg hyper game theory. Leveraging the concept of second-level hyper\ngames, novel analytical formulations are developed to model misperceptions of\nthe users about each other's communication and control strategies. Further,\nequilibrium analysis of the learned resource allocation protocols examines the\nconvergence of the computing and communication strategies to a local\nStackelberg equilibria, considering misperceptions. Simulation results show\nthat the proposed Stackelberg hyper game results in efficient usage of\ncommunication and computing resources while maintaining a high quality of\nexperience for the users compared to state-of-the-art that does not account for\nthe misperceptions.\n","authors":["Christo Kurisummoottil Thomas","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2409.17985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15059v2","updated":"2024-09-26T15:53:10Z","published":"2024-05-23T21:17:20Z","title":"Message-Passing Monte Carlo: Generating low-discrepancy point sets via\n Graph Neural Networks","summary":" Discrepancy is a well-known measure for the irregularity of the distribution\nof a point set. Point sets with small discrepancy are called low-discrepancy\nand are known to efficiently fill the space in a uniform manner.\nLow-discrepancy points play a central role in many problems in science and\nengineering, including numerical integration, computer vision, machine\nperception, computer graphics, machine learning, and simulation. In this work,\nwe present the first machine learning approach to generate a new class of\nlow-discrepancy point sets named Message-Passing Monte Carlo (MPMC) points.\nMotivated by the geometric nature of generating low-discrepancy point sets, we\nleverage tools from Geometric Deep Learning and base our model on Graph Neural\nNetworks. We further provide an extension of our framework to higher\ndimensions, which flexibly allows the generation of custom-made points that\nemphasize the uniformity in specific dimensions that are primarily important\nfor the particular problem at hand. Finally, we demonstrate that our proposed\nmodel achieves state-of-the-art performance superior to previous methods by a\nsignificant margin. In fact, MPMC points are empirically shown to be either\noptimal or near-optimal with respect to the discrepancy for low dimension and\nsmall number of points, i.e., for which the optimal discrepancy can be\ndetermined. Code for generating MPMC points can be found at\nhttps://github.com/tk-rusch/MPMC.\n","authors":["T. Konstantin Rusch","Nathan Kirk","Michael M. Bronstein","Christiane Lemieux","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2405.15059v2.pdf","comment":"Published in Proceedings of the National Academy of Sciences (PNAS):\n https://www.pnas.org/doi/10.1073/pnas.2409913121"},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17972v1","updated":"2024-09-26T15:47:42Z","published":"2024-09-26T15:47:42Z","title":"BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and\n Adaptive Disambiguate based Efficient Tree Search","summary":" Large Language Models (LLMs) have exhibited exceptional performance across a\nbroad range of tasks and domains. However, they still encounter difficulties in\nsolving mathematical problems due to the rigorous and logical nature of\nmathematics. Previous studies have employed techniques such as supervised\nfine-tuning (SFT), prompt engineering, and search-based methods to improve the\nmathematical problem-solving abilities of LLMs. Despite these efforts, their\nperformance remains suboptimal and demands substantial computational resources.\nTo address this issue, we propose a novel approach, BEATS, to enhance\nmathematical problem-solving abilities. Our method leverages newly designed\nprompts that guide the model to iteratively rewrite, advance by one step, and\ngenerate answers based on previous steps. Additionally, we introduce a new\nback-verification technique that uses LLMs to validate the correctness of the\ngenerated answers. Furthermore, we employ a pruning tree search to optimize\nsearch time while achieving strong performance. Notably, our method improves\nQwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the\nMATH benchmark.\n","authors":["Linzhuang Sun","Hao Liang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14500v2","updated":"2024-09-26T15:26:43Z","published":"2024-09-22T15:53:19Z","title":"TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with\n Tabular Node Features","summary":" Tabular machine learning is an important field for industry and science. In\nthis field, table rows are usually treated as independent data samples, but\nadditional information about relations between them is sometimes available and\ncan be used to improve predictive performance. Such information can be\nnaturally modeled with a graph, thus tabular machine learning may benefit from\ngraph machine learning methods. However, graph machine learning models are\ntypically evaluated on datasets with homogeneous node features, which have\nlittle in common with heterogeneous mixtures of numerical and categorical\nfeatures present in tabular datasets. Thus, there is a critical difference\nbetween the data used in tabular and graph machine learning studies, which does\nnot allow one to understand how successfully graph models can be transferred to\ntabular data. To bridge this gap, we propose a new benchmark of diverse graphs\nwith heterogeneous tabular node features and realistic prediction tasks. We use\nthis benchmark to evaluate a vast set of models, including simple methods\npreviously overlooked in the literature. Our experiments show that graph neural\nnetworks (GNNs) can indeed often bring gains in predictive performance for\ntabular data, but standard tabular models also can be adapted to work with\ngraph data by using simple feature preprocessing, which sometimes enables them\nto compete with and even outperform GNNs. Based on our empirical study, we\nprovide insights for researchers and practitioners in both tabular and graph\nmachine learning fields.\n","authors":["Gleb Bazhenov","Oleg Platonov","Liudmila Prokhorenkova"],"pdf_url":"https://arxiv.org/pdf/2409.14500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17937v1","updated":"2024-09-26T15:12:41Z","published":"2024-09-26T15:12:41Z","title":"Adaptive Stream Processing on Edge Devices through Active Inference","summary":" The current scenario of IoT is witnessing a constant increase on the volume\nof data, which is generated in constant stream, calling for novel architectural\nand logical solutions for processing it. Moving the data handling towards the\nedge of the computing spectrum guarantees better distribution of load and, in\nprinciple, lower latency and better privacy. However, managing such a structure\nis complex, especially when requirements, also referred to Service Level\nObjectives (SLOs), specified by applications' owners and infrastructure\nmanagers need to be ensured. Despite the rich number of proposals of Machine\nLearning (ML) based management solutions, researchers and practitioners yet\nstruggle to guarantee long-term prediction and control, and accurate\ntroubleshooting. Therefore, we present a novel ML paradigm based on Active\nInference (AIF) -- a concept from neuroscience that describes how the brain\nconstantly predicts and evaluates sensory information to decrease long-term\nsurprise. We implement it and evaluate it in a heterogeneous real stream\nprocessing use case, where an AIF-based agent continuously optimizes the\nfulfillment of three SLOs for three autonomous driving services running on\nmultiple devices. The agent used causal knowledge to gradually develop an\nunderstanding of how its actions are related to requirements fulfillment, and\nwhich configurations to favor. Through this approach, our agent requires up to\nthirty iterations to converge to the optimal solution, showing the capability\nof offering accurate results in a short amount of time. Furthermore, thanks to\nAIF and its causal structures, our method guarantees full transparency on the\ndecision making, making the interpretation of the results and the\ntroubleshooting effortless.\n","authors":["Boris Sedlak","Victor Casamayor Pujol","Andrea Morichetta","Praveen Kumar Donta","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2409.17937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17932v1","updated":"2024-09-26T15:08:52Z","published":"2024-09-26T15:08:52Z","title":"Sample compression unleashed : New generalization bounds for real valued\n losses","summary":" The sample compression theory provides generalization guarantees for\npredictors that can be fully defined using a subset of the training dataset and\na (short) message string, generally defined as a binary sequence. Previous\nworks provided generalization bounds for the zero-one loss, which is\nrestrictive, notably when applied to deep learning approaches. In this paper,\nwe present a general framework for deriving new sample compression bounds that\nhold for real-valued losses. We empirically demonstrate the tightness of the\nbounds and their versatility by evaluating them on different types of models,\ne.g., neural networks and decision forests, trained with the Pick-To-Learn\n(P2L) meta-algorithm, which transforms the training method of any\nmachine-learning predictor to yield sample-compressed predictors. In contrast\nto existing P2L bounds, ours are valid in the non-consistent case.\n","authors":["Mathieu Bazinet","Valentina Zantedeschi","Pascal Germain"],"pdf_url":"https://arxiv.org/pdf/2409.17932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17931v1","updated":"2024-09-26T15:08:38Z","published":"2024-09-26T15:08:38Z","title":"Intelligent Energy Management: Remaining Useful Life Prediction and\n Charging Automation System Comprised of Deep Learning and the Internet of\n Things","summary":" Remaining Useful Life (RUL) of battery is an important parameter to know the\nbattery's remaining life and need for recharge. The goal of this research\nproject is to develop machine learning-based models for the battery RUL\ndataset. Different ML models are developed to classify the RUL of the vehicle,\nand the IoT (Internet of Things) concept is simulated for automating the\ncharging system and managing any faults aligning. The graphs plotted depict the\nrelationship between various vehicle parameters using the Blynk IoT platform.\nResults show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent\nUnit (GRU), and hybrid model developed could classify RUL into three classes\nwith 99% more accuracy. The data is fed using the tkinter GUI for simulating\nartificial intelligence (AI)-based charging, and with a pyserial backend, data\ncan be entered into the Esp-32 microcontroller for making charge discharge\npossible with the model's predictions. Also, with an IoT system, the charging\ncan be disconnected, monitored, and analyzed for automation. The results show\nthat an accuracy of 99% can be obtained on models MLP, catboost model and\nsimilar accuracy on GRU model can be obtained, and finally relay-based\ntriggering can be made by prediction through the model used for automating the\ncharging and energy-saving mechanism. By showcasing an exemplary Blynk\nplatform-based monitoring and automation phenomenon, we further present\ninnovative ways of monitoring parameters and automating the system.\n","authors":["Biplov Paneru","Bishwash Paneru","DP Sharma Mainali"],"pdf_url":"https://arxiv.org/pdf/2409.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15228v3","updated":"2024-09-26T14:57:52Z","published":"2024-09-23T17:22:09Z","title":"A Comprehensive Framework for Evaluating API-oriented Code Generation in\n Large Language Models","summary":" Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as\npowerful tools for code generation, significantly enhancing productivity and\naccelerating software development. However, existing benchmarks primarily focus\non general code generation without considering API-oriented code generation,\ni.e., generating code that invokes APIs from specific libraries. Given the\ngrowing demand for API-oriented code generation, there is a pressing need for a\nsystematic and automated approach to evaluate LLM on API-oriented code\ngeneration. To address this gap, we propose AutoAPIEval, a lightweight and\nautomated framework designed to evaluate the capabilities of LLMs in\nAPI-oriented code generation. Our framework works with any library that\nprovides API documentation and focuses on two unit tasks: API recommendation\nand code example generation, along with four metrics to evaluate the generated\nAPIs and code examples, such as the proportion of incorrect API recommendations\nfor Task 1, and the proportion of code examples where no specific API is\ninvoked and uncompilable/unexecutable code examples for Task 2. In addition, we\nconducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder)\nand Java Runtime Environment 8 to demonstrate the framework's effectiveness.\nOur findings reveal substantial variability in LLM performance across tasks,\nwith ChatGPT adhering better to instructions, while sharing similar\neffectiveness in code example generation with its counterparts (i.e., MagiCoder\nand DeekSeek Coder). We also identify key factors associated with code quality,\nsuch as API popularity and model confidence, and build classifiers that achieve\nhigh accuracy in detecting incorrect API recommendations and erroneous code\nexamples. Retrieval-augmented generation enhances the quality of code generated\nby LLMs, though its effectiveness varies across different LLMs.\n","authors":["Yixi Wu","Pengfei He","Zehao Wang","Shaowei Wang","Yuan Tian","Tse-Hsun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.15228v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08201v2","updated":"2024-09-26T14:56:57Z","published":"2024-09-12T16:38:20Z","title":"Machine Learning for Two-Sample Testing under Right-Censored Data: A\n Simulation Study","summary":" The focus of this study is to evaluate the effectiveness of Machine Learning\n(ML) methods for two-sample testing with right-censored observations. To\nachieve this, we develop several ML-based methods with varying architectures\nand implement them as two-sample tests. Each method is an ensemble (stacking)\nthat combines predictions from classical two-sample tests. This paper presents\nthe results of training the proposed ML methods, examines their statistical\npower compared to classical two-sample tests, analyzes the null distribution of\nthe proposed methods when the null hypothesis is true, and evaluates the\nsignificance of the features incorporated into the proposed methods. In total,\nthis work covers 18 methods for two-sample testing under right-censored\nobservations, including the proposed methods and classical well-studied\ntwo-sample tests. All results from numerical experiments were obtained from a\nsynthetic dataset generated using the inverse transform sampling method and\nreplicated multiple times through Monte Carlo simulation. To test the\ntwo-sample problem with right-censored observations, one can use the proposed\ntwo-sample methods (scripts, dataset, and models are available on GitHub and\nHugging Face).\n","authors":["Petr Philonenko","Sergey Postovalov"],"pdf_url":"https://arxiv.org/pdf/2409.08201v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17906v1","updated":"2024-09-26T14:52:40Z","published":"2024-09-26T14:52:40Z","title":"Graph Reasoning with Large Language Models via Pseudo-code Prompting","summary":" Large language models (LLMs) have recently achieved remarkable success in\nvarious reasoning tasks in the field of natural language processing. This\nsuccess of LLMs has also motivated their use in graph-related tasks. Among\nothers, recent work has explored whether LLMs can solve graph problems such as\ncounting the number of connected components of a graph or computing the\nshortest path distance between two nodes. Although LLMs possess preliminary\ngraph reasoning abilities, they might still struggle to solve some seemingly\nsimple problems. In this paper, we investigate whether prompting via\npseudo-code instructions can improve the performance of LLMs in solving graph\nproblems. Our experiments demonstrate that using pseudo-code instructions\ngenerally improves the performance of all considered LLMs. The graphs,\npseudo-code prompts, and evaluation code are publicly available.\n","authors":["Konstantinos Skianis","Giannis Nikolentzos","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2409.17906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17902v1","updated":"2024-09-26T14:50:20Z","published":"2024-09-26T14:50:20Z","title":"Designing Short-Stage CDC-XPUFs: Balancing Reliability, Cost, and\n Security in IoT Devices","summary":" The rapid expansion of Internet of Things (IoT) devices demands robust and\nresource-efficient security solutions. Physically Unclonable Functions (PUFs),\nwhich generate unique cryptographic keys from inherent hardware variations,\noffer a promising approach. However, traditional PUFs like Arbiter PUFs (APUFs)\nand XOR Arbiter PUFs (XOR-PUFs) are susceptible to machine learning (ML) and\nreliability-based attacks. In this study, we investigate\nComponent-Differentially Challenged XOR-PUFs (CDC-XPUFs), a less explored\nvariant, to address these vulnerabilities. We propose an optimized CDC-XPUF\ndesign that incorporates a pre-selection strategy to enhance reliability and\nintroduces a novel lightweight architecture to reduce hardware overhead.\nRigorous testing demonstrates that our design significantly lowers resource\nconsumption, maintains strong resistance to ML attacks, and improves\nreliability, effectively mitigating reliability-based attacks. These results\nhighlight the potential of CDC-XPUFs as a secure and efficient candidate for\nwidespread deployment in resource-constrained IoT systems.\n","authors":["Gaoxiang Li","Yu Zhuang"],"pdf_url":"https://arxiv.org/pdf/2409.17902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17896v1","updated":"2024-09-26T14:47:14Z","published":"2024-09-26T14:47:14Z","title":"Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV\n Attitude Control Under Varying Wind Conditions","summary":" This paper evaluates and compares the performance of model-free and\nmodel-based reinforcement learning for the attitude control of fixed-wing\nunmanned aerial vehicles using PID as a reference point. The comparison focuses\non their ability to handle varying flight dynamics and wind disturbances in a\nsimulated environment. Our results show that the Temporal Difference Model\nPredictive Control agent outperforms both the PID controller and other\nmodel-free reinforcement learning methods in terms of tracking accuracy and\nrobustness over different reference difficulties, particularly in nonlinear\nflight regimes. Furthermore, we introduce actuation fluctuation as a key metric\nto assess energy efficiency and actuator wear, and we test two different\napproaches from the literature: action variation penalty and conditioning for\naction policy smoothness. We also evaluate all control methods when subject to\nstochastic turbulence and gusts separately, so as to measure their effects on\ntracking performance, observe their limitations and outline their implications\non the Markov decision process formalism.\n","authors":["David Olivares","Pierre Fournier","Pavan Vasishta","Julien Marzat"],"pdf_url":"https://arxiv.org/pdf/2409.17896v1.pdf","comment":"Published at ICINCO 2024"},{"id":"http://arxiv.org/abs/2409.17889v1","updated":"2024-09-26T14:38:54Z","published":"2024-09-26T14:38:54Z","title":"A multi-source data power load forecasting method using attention\n mechanism-based parallel cnn-gru","summary":" Accurate power load forecasting is crucial for improving energy efficiency\nand ensuring power supply quality. Considering the power load forecasting\nproblem involves not only dynamic factors like historical load variations but\nalso static factors such as climate conditions that remain constant over\nspecific periods. From the model-agnostic perspective, this paper proposes a\nparallel structure network to extract important information from both dynamic\nand static data. Firstly, based on complexity learning theory, it is\ndemonstrated that models integrated through parallel structures exhibit\nsuperior generalization abilities compared to individual base learners.\nAdditionally, the higher the independence between base learners, the stronger\nthe generalization ability of the parallel structure model. This suggests that\nthe structure of machine learning models inherently contains significant\ninformation. Building on this theoretical foundation, a parallel convolutional\nneural network (CNN)-gate recurrent unit (GRU) attention model (PCGA) is\nemployed to address the power load forecasting issue, aiming to effectively\nintegrate the influences of dynamic and static features. The CNN module is\nresponsible for capturing spatial characteristics from static data, while the\nGRU module captures long-term dependencies in dynamic time series data. The\nattention layer is designed to focus on key information from the\nspatial-temporal features extracted by the parallel CNN-GRU. To substantiate\nthe advantages of the parallel structure model in extracting and integrating\nmulti-source information, a series of experiments are conducted.\n","authors":["Chao Min","Yijia Wang","Bo Zhang","Xin Ma","Junyi Cui"],"pdf_url":"https://arxiv.org/pdf/2409.17889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.12060v2","updated":"2024-09-26T14:32:36Z","published":"2021-06-22T21:15:00Z","title":"Faster Randomized Methods for Orthogonality Constrained Problems","summary":" Recent literature has advocated the use of randomized methods for\naccelerating the solution of various matrix problems arising throughout data\nscience and computational science. One popular strategy for leveraging\nrandomization is to use it as a way to reduce problem size. However, methods\nbased on this strategy lack sufficient accuracy for some applications.\nRandomized preconditioning is another approach for leveraging randomization,\nwhich provides higher accuracy. The main challenge in using randomized\npreconditioning is the need for an underlying iterative method, thus randomized\npreconditioning so far have been applied almost exclusively to solving\nregression problems and linear systems. In this article, we show how to expand\nthe application of randomized preconditioning to another important set of\nproblems prevalent across data science: optimization problems with\n(generalized) orthogonality constraints. We demonstrate our approach, which is\nbased on the framework of Riemannian optimization and Riemannian\npreconditioning, on the problem of computing the dominant canonical\ncorrelations and on the Fisher linear discriminant analysis problem. For both\nproblems, we evaluate the effect of preconditioning on the computational costs\nand asymptotic convergence, and demonstrate empirically the utility of our\napproach.\n","authors":["Boris Shustin","Haim Avron"],"pdf_url":"https://arxiv.org/pdf/2106.12060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01807v2","updated":"2024-09-26T14:21:10Z","published":"2023-10-03T05:40:56Z","title":"Discrete, compositional, and symbolic representations through attractor\n dynamics","summary":" Symbolic systems are powerful frameworks for modeling cognitive processes as\nthey encapsulate the rules and relationships fundamental to many aspects of\nhuman reasoning and behavior. Central to these models are systematicity,\ncompositionality, and productivity, making them invaluable in both cognitive\nscience and artificial intelligence. However, certain limitations remain. For\ninstance, the integration of structured symbolic processes and latent\nsub-symbolic processes has been implemented at the computational level through\nfiat methods such as quantization or softmax sampling, which assume, rather\nthan derive, the operations underpinning discretization and symbolicization. In\nthis work, we introduce a novel neural stochastic dynamical systems model that\nintegrates attractor dynamics with symbolic representations to model cognitive\nprocesses akin to the probabilistic language of thought (PLoT). Our model\nsegments the continuous representational space into discrete basins, with\nattractor states corresponding to symbolic sequences, that reflect the\nsemanticity and compositionality characteristic of symbolic systems through\nunsupervised learning, rather than relying on pre-defined primitives. Moreover,\nlike PLoT, our model learns to sample a diverse distribution of attractor\nstates that reflect the mutual information between the input data and the\nsymbolic encodings. This approach establishes a unified framework that\nintegrates both symbolic and sub-symbolic processing through neural dynamics, a\nneuro-plausible substrate with proven expressivity in AI, offering a more\ncomprehensive model that mirrors the complex duality of cognitive operations.\n","authors":["Andrew Nam","Eric Elmoznino","Nikolay Malkin","James McClelland","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2310.01807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17872v1","updated":"2024-09-26T14:19:07Z","published":"2024-09-26T14:19:07Z","title":"A method for identifying causality in the response of nonlinear\n dynamical systems","summary":" Predicting the response of nonlinear dynamical systems subject to random,\nbroadband excitation is important across a range of scientific disciplines,\nsuch as structural dynamics and neuroscience. Building data-driven models\nrequires experimental measurements of the system input and output, but it can\nbe difficult to determine whether inaccuracies in the model stem from modelling\nerrors or noise. This paper presents a novel method to identify the causal\ncomponent of the input-output data from measurements of a system in the\npresence of output noise, as a function of frequency, without needing a high\nfidelity model. An output prediction, calculated using an available model, is\noptimally combined with noisy measurements of the output to predict the input\nto the system. The parameters of the algorithm balance the two output signals\nand are utilised to calculate a nonlinear coherence metric as a measure of\ncausality. This method is applicable to a broad class of nonlinear dynamical\nsystems. There are currently no solutions to this problem in the absence of a\ncomplete benchmark model.\n","authors":["Joseph Massingham","Ole Nielsen","Tore Butlin"],"pdf_url":"https://arxiv.org/pdf/2409.17872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17870v1","updated":"2024-09-26T14:17:58Z","published":"2024-09-26T14:17:58Z","title":"Efficient Arbitrary Precision Acceleration for Large Language Models on\n GPU Tensor Cores","summary":" Large language models (LLMs) have been widely applied but face challenges in\nefficient inference. While quantization methods reduce computational demands,\nultra-low bit quantization with arbitrary precision is hindered by limited GPU\nTensor Core support and inefficient memory management, leading to suboptimal\nacceleration. To address these challenges, we propose a comprehensive\nacceleration scheme for arbitrary precision LLMs. At its core, we introduce a\nnovel bipolar-INT data format that facilitates parallel computing and supports\nsymmetric quantization, effectively reducing data redundancy. Building on this,\nwe implement an arbitrary precision matrix multiplication scheme that\ndecomposes and recovers matrices at the bit level, enabling flexible precision\nwhile maximizing GPU Tensor Core utilization. Furthermore, we develop an\nefficient matrix preprocessing method that optimizes data layout for subsequent\ncomputations. Finally, we design a data recovery-oriented memory management\nsystem that strategically utilizes fast shared memory, significantly enhancing\nkernel execution speed and minimizing memory access latency. Experimental\nresults demonstrate our approach's effectiveness, with up to 13\\times speedup\nin matrix multiplication compared to NVIDIA's CUTLASS. When integrated into\nLLMs, we achieve up to 6.7\\times inference acceleration. These improvements\nsignificantly enhance LLM inference efficiency, enabling broader and more\nresponsive applications of LLMs.\n","authors":["Shaobo Ma","Chao Fang","Haikuo Shao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05208v3","updated":"2024-09-26T14:16:01Z","published":"2023-10-08T15:49:36Z","title":"ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot\n Coordination","summary":" Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement\nlearning (MARL) challenge that aims to train an ego agent to work with diverse,\nunseen partners during deployment. The significant difference between the\ndeployment-time partners' distribution and the training partners' distribution\ndetermined by the training algorithm makes ZSC a unique out-of-distribution\n(OOD) generalization challenge. The potential distribution gap between\nevaluation and deployment-time partners leads to inadequate evaluation, which\nis exacerbated by the lack of appropriate evaluation metrics. In this paper, we\npresent ZSC-Eval, the first evaluation toolkit and benchmark for ZSC\nalgorithms. ZSC-Eval consists of: 1) Generation of evaluation partner\ncandidates through behavior-preferring rewards to approximate deployment-time\npartners' distribution; 2) Selection of evaluation partners by Best-Response\nDiversity (BR-Div); 3) Measurement of generalization performance with various\nevaluation partners via the Best-Response Proximity (BR-Prox) metric. We use\nZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football\nenvironments and get novel empirical findings. We also conduct a human\nexperiment of current ZSC algorithms to verify the ZSC-Eval's consistency with\nhuman evaluation. ZSC-Eval is now available at\nhttps://github.com/sjtu-marl/ZSC-Eval.\n","authors":["Xihuai Wang","Shao Zhang","Wenhao Zhang","Wentao Dong","Jingxiao Chen","Ying Wen","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.05208v3.pdf","comment":"Accepted in NeurIPS 2024 Dataset and Benchmark Track"},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17858v1","updated":"2024-09-26T14:05:32Z","published":"2024-09-26T14:05:32Z","title":"How Feature Learning Can Improve Neural Scaling Laws","summary":" We develop a solvable model of neural scaling laws beyond the kernel limit.\nTheoretical analysis of this model shows how performance scales with model\nsize, training time, and the total amount of available data. We identify three\nscaling regimes corresponding to varying task difficulties: hard, easy, and\nsuper easy tasks. For easy and super-easy target functions, which lie in the\nreproducing kernel Hilbert space (RKHS) defined by the initial infinite-width\nNeural Tangent Kernel (NTK), the scaling exponents remain unchanged between\nfeature learning and kernel regime models. For hard tasks, defined as those\noutside the RKHS of the initial NTK, we demonstrate both analytically and\nempirically that feature learning can improve scaling with training time and\ncompute, nearly doubling the exponent for hard tasks. This leads to a different\ncompute optimal strategy to scale parameters and training time in the feature\nlearning regime. We support our finding that feature learning improves the\nscaling law for hard tasks but not for easy and super-easy tasks with\nexperiments of nonlinear MLPs fitting functions with power-law Fourier spectra\non the circle and CNNs learning vision tasks.\n","authors":["Blake Bordelon","Alexander Atanasov","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2409.17858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17852v1","updated":"2024-09-26T13:58:06Z","published":"2024-09-26T13:58:06Z","title":"AMARO: All Heavy-Atom Transferable Neural Network Potentials of Protein\n Thermodynamics","summary":" All-atom molecular simulations offer detailed insights into macromolecular\nphenomena, but their substantial computational cost hinders the exploration of\ncomplex biological processes. We introduce Advanced Machine-learning Atomic\nRepresentation Omni-force-field (AMARO), a new neural network potential (NNP)\nthat combines an O(3)-equivariant message-passing neural network architecture,\nTensorNet, with a coarse-graining map that excludes hydrogen atoms. AMARO\ndemonstrates the feasibility of training coarser NNP, without prior energy\nterms, to run stable protein dynamics with scalability and generalization\ncapabilities.\n","authors":["Antonio Mirarchi","Raul P. Pelaez","Guillem Simeon","Gianni De Fabritiis"],"pdf_url":"https://arxiv.org/pdf/2409.17852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17841v1","updated":"2024-09-26T13:45:36Z","published":"2024-09-26T13:45:36Z","title":"Machine Learning-based vs Deep Learning-based Anomaly Detection in\n Multivariate Time Series for Spacecraft Attitude Sensors","summary":" In the framework of Failure Detection, Isolation and Recovery (FDIR) on\nspacecraft, new AI-based approaches are emerging in the state of the art to\novercome the limitations commonly imposed by traditional threshold checking.\n The present research aims at characterizing two different approaches to the\nproblem of stuck values detection in multivariate time series coming from\nspacecraft attitude sensors. The analysis reveals the performance differences\nin the two approaches, while commenting on their interpretability and\ngeneralization to different scenarios.\n","authors":["R. Gallon","F. Schiemenz","A. Krstova","A. Menicucci","E. Gill"],"pdf_url":"https://arxiv.org/pdf/2409.17841v1.pdf","comment":"Accepted for the ESA SPAICE Conference 2024"},{"id":"http://arxiv.org/abs/2409.17836v1","updated":"2024-09-26T13:38:33Z","published":"2024-09-26T13:38:33Z","title":"Language Models as Zero-shot Lossless Gradient Compressors: Towards\n General Neural Parameter Prior Models","summary":" Despite the widespread use of statistical prior models in various fields,\nsuch models for neural network gradients have long been overlooked. The\ninherent challenge stems from their high-dimensional structures and complex\ninterdependencies, which complicate effective modeling. In this work, we\ndemonstrate the potential of large language models (LLMs) to act as gradient\npriors in a zero-shot setting. We examine the property by considering lossless\ngradient compression -- a critical application in distributed learning -- that\ndepends heavily on precise probability modeling. To achieve this, we introduce\nLM-GC, a novel method that integrates LLMs with arithmetic coding. Our\ntechnique converts plain gradients into text-like formats, enhancing token\nefficiency by up to 38 times compared to their plain representations. We ensure\nthat this data conversion maintains a close alignment with the structure of\nplain gradients and the symbols commonly recognized by LLMs. Our experiments\nindicate that LM-GC surpasses existing state-of-the-art lossless compression\nmethods, improving compression rates by 10\\% up to 17.2\\% across various\ndatasets and architectures. Additionally, our approach shows promising\ncompatibility with lossy compression techniques such as quantization and\nsparsification. These findings highlight the significant potential of LLMs as a\nmodel for effectively handling gradients. We will release the source code upon\npublication.\n","authors":["Hui-Po Wang","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2409.17836v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17833v1","updated":"2024-09-26T13:35:42Z","published":"2024-09-26T13:35:42Z","title":"Ordinary Differential Equations for Enhanced 12-Lead ECG Generation","summary":" In the realm of artificial intelligence, the generation of realistic training\ndata for supervised learning tasks presents a significant challenge. This is\nparticularly true in the synthesis of electrocardiograms (ECGs), where the\nobjective is to develop a synthetic 12-lead ECG model. The primary complexity\nof this task stems from accurately modeling the intricate biological and\nphysiological interactions among different ECG leads. Although mathematical\nprocess simulators have shed light on these dynamics, effectively incorporating\nthis understanding into generative models is not straightforward. In this work,\nwe introduce an innovative method that employs ordinary differential equations\n(ODEs) to enhance the fidelity of generating 12-lead ECG data. This approach\nintegrates a system of ODEs that represent cardiac dynamics directly into the\ngenerative model's optimization process, allowing for the production of\nbiologically plausible ECG training data that authentically reflects real-world\nvariability and inter-lead dependencies. We conducted an empirical analysis of\nthousands of ECGs and found that incorporating cardiac simulation insights into\nthe data generation process significantly improves the accuracy of heart\nabnormality classifiers trained on this synthetic 12-lead ECG data.\n","authors":["Yakir Yehuda","Kira Radinsky"],"pdf_url":"https://arxiv.org/pdf/2409.17833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00551v2","updated":"2024-09-26T13:34:41Z","published":"2024-06-01T20:46:40Z","title":"Strategic Linear Contextual Bandits","summary":" Motivated by the phenomenon of strategic agents gaming a recommender system\nto maximize the number of times they are recommended to users, we study a\nstrategic variant of the linear contextual bandit problem, where the arms can\nstrategically misreport privately observed contexts to the learner. We treat\nthe algorithm design problem as one of mechanism design under uncertainty and\npropose the Optimistic Grim Trigger Mechanism (OptGTM) that incentivizes the\nagents (i.e., arms) to report their contexts truthfully while simultaneously\nminimizing regret. We also show that failing to account for the strategic\nnature of the agents results in linear regret. However, a trade-off between\nmechanism design and regret minimization appears to be unavoidable. More\nbroadly, this work aims to provide insight into the intersection of online\nlearning and mechanism design.\n","authors":["Thomas Kleine Buening","Aadirupa Saha","Christos Dimitrakakis","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.00551v2.pdf","comment":"To appear at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.04769v2","updated":"2024-09-26T13:31:40Z","published":"2024-06-07T09:15:29Z","title":"Diffusion-based Generative Image Outpainting for Recovery of\n FOV-Truncated CT Images","summary":" Field-of-view (FOV) recovery of truncated chest CT scans is crucial for\naccurate body composition analysis, which involves quantifying skeletal muscle\nand subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables\ndisease prognostication. Here, we present a method for recovering truncated CT\nslices using generative image outpainting. We train a diffusion model and apply\nit to truncated CT slices generated by simulating a small FOV. Our model\nreliably recovers the truncated anatomy and outperforms the previous\nstate-of-the-art despite being trained on 87% less data.\n","authors":["Michelle Espranita Liman","Daniel Rueckert","Florian J. Fintelmann","Philip Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04769v2.pdf","comment":"Shared last authorship: Florian J. Fintelmann and Philip M\\\"uller"},{"id":"http://arxiv.org/abs/2409.17113v2","updated":"2024-09-26T13:30:51Z","published":"2024-09-25T17:27:02Z","title":"Characterizing stable regions in the residual stream of LLMs","summary":" We identify \"stable regions\" in the residual stream of Transformers, where\nthe model's output remains insensitive to small activation changes, but\nexhibits high sensitivity at region boundaries. These regions emerge during\ntraining and become more defined as training progresses or model size\nincreases. The regions appear to be much larger than previously studied\npolytopes. Our analysis suggests that these stable regions align with semantic\ndistinctions, where similar prompts cluster within regions, and activations\nfrom the same region lead to similar next token predictions. This work provides\na promising research direction for understanding the complexity of neural\nnetworks, shedding light on training dynamics, and advancing interpretability.\n","authors":["Jett Janiak","Jacek Karwowski","Chatrik Singh Mangat","Giorgi Giglemiani","Nora Petrova","Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2409.17113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14372v2","updated":"2024-09-26T13:23:54Z","published":"2024-05-23T09:48:48Z","title":"Learning Constrained Markov Decision Processes With Non-stationary\n Rewards and Constraints","summary":" In constrained Markov decision processes (CMDPs) with adversarial rewards and\nconstraints, a well-known impossibility result prevents any algorithm from\nattaining both sublinear regret and sublinear constraint violation, when\ncompeting against a best-in-hindsight policy that satisfies constraints on\naverage. In this paper, we show that this negative result can be eased in CMDPs\nwith non-stationary rewards and constraints, by providing algorithms whose\nperformances smoothly degrade as non-stationarity increases. Specifically, we\npropose algorithms attaining $\\tilde{\\mathcal{O}} (\\sqrt{T} + C)$ regret and\npositive constraint violation under bandit feedback, where $C$ is a corruption\nvalue measuring the environment non-stationarity. This can be $\\Theta(T)$ in\nthe worst case, coherently with the impossibility result for adversarial CMDPs.\nFirst, we design an algorithm with the desired guarantees when $C$ is known.\nThen, in the case $C$ is unknown, we show how to obtain the same results by\nembedding such an algorithm in a general meta-procedure. This is of independent\ninterest, as it can be applied to any non-stationary constrained online\nlearning setting.\n","authors":["Francesco Emanuele Stradi","Anna Lunghi","Matteo Castiglioni","Alberto Marchesi","Nicola Gatti"],"pdf_url":"https://arxiv.org/pdf/2405.14372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17825v1","updated":"2024-09-26T13:22:22Z","published":"2024-09-26T13:22:22Z","title":"Physics-aligned Schrödinger bridge","summary":" The reconstruction of physical fields from sparse measurements is pivotal in\nboth scientific research and engineering applications. Traditional methods are\nincreasingly supplemented by deep learning models due to their efficacy in\nextracting features from data. However, except for the low accuracy on complex\nphysical systems, these models often fail to comply with essential physical\nconstraints, such as governing equations and boundary conditions. To overcome\nthis limitation, we introduce a novel data-driven field reconstruction\nframework, termed the Physics-aligned Schr\\\"{o}dinger Bridge (PalSB). This\nframework leverages a diffusion Schr\\\"{o}dinger bridge mechanism that is\nspecifically tailored to align with physical constraints. The PalSB approach\nincorporates a dual-stage training process designed to address both local\nreconstruction mapping and global physical principles. Additionally, a\nboundary-aware sampling technique is implemented to ensure adherence to\nphysical boundary conditions. We demonstrate the effectiveness of PalSB through\nits application to three complex nonlinear systems: cylinder flow from Particle\nImage Velocimetry experiments, two-dimensional turbulence, and a\nreaction-diffusion system. The results reveal that PalSB not only achieves\nhigher accuracy but also exhibits enhanced compliance with physical constraints\ncompared to existing methods. This highlights PalSB's capability to generate\nhigh-quality representations of intricate physical interactions, showcasing its\npotential for advancing field reconstruction techniques.\n","authors":["Zeyu Li","Hongkun Dou","Shen Fang","Wang Han","Yue Deng","Lijun Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17808v1","updated":"2024-09-26T13:02:28Z","published":"2024-09-26T13:02:28Z","title":"Generative Modeling of Molecular Dynamics Trajectories","summary":" Molecular dynamics (MD) is a powerful technique for studying microscopic\nphenomena, but its computational cost has driven significant interest in the\ndevelopment of deep learning-based surrogate models. We introduce generative\nmodeling of molecular trajectories as a paradigm for learning flexible\nmulti-task surrogate models of MD from data. By conditioning on appropriately\nchosen frames of the trajectory, we show such generative models can be adapted\nto diverse tasks such as forward simulation, transition path sampling, and\ntrajectory upsampling. By alternatively conditioning on part of the molecular\nsystem and inpainting the rest, we also demonstrate the first steps towards\ndynamics-conditioned molecular design. We validate the full set of these\ncapabilities on tetrapeptide simulations and show that our model can produce\nreasonable ensembles of protein monomers. Altogether, our work illustrates how\ngenerative modeling can unlock value from MD data towards diverse downstream\ntasks that are not straightforward to address with existing methods or even MD\nitself. Code is available at https://github.com/bjing2016/mdgen.\n","authors":["Bowen Jing","Hannes Stärk","Tommi Jaakkola","Bonnie Berger"],"pdf_url":"https://arxiv.org/pdf/2409.17808v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17806v1","updated":"2024-09-26T12:59:09Z","published":"2024-09-26T12:59:09Z","title":"Continual learning with task specialist","summary":" Continual learning (CL) adapt the deep learning scenarios with timely updated\ndatasets. However, existing CL models suffer from the catastrophic forgetting\nissue, where new knowledge replaces past learning. In this paper, we propose\nContinual Learning with Task Specialists (CLTS) to address the issues of\ncatastrophic forgetting and limited labelled data in real-world datasets by\nperforming class incremental learning of the incoming stream of data. The model\nconsists of Task Specialists (T S) and Task Predictor (T P ) with pre-trained\nStable Diffusion (SD) module. Here, we introduce a new specialist to handle a\nnew task sequence and each T S has three blocks; i) a variational autoencoder\n(V AE) to learn the task distribution in a low dimensional latent space, ii) a\nK-Means block to perform data clustering and iii) Bootstrapping Language-Image\nPre-training (BLIP ) model to generate a small batch of captions from the input\ndata. These captions are fed as input to the pre-trained stable diffusion model\n(SD) for the generation of task samples. The proposed model does not store any\ntask samples for replay, instead uses generated samples from SD to train the T\nP module. A comparison study with four SOTA models conducted on three\nreal-world datasets shows that the proposed model outperforms all the selected\nbaselines\n","authors":["Indu Solomon","Aye Phyu Phyu Aung","Uttam Kumar","Senthilnath Jayavelu"],"pdf_url":"https://arxiv.org/pdf/2409.17806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17804v1","updated":"2024-09-26T12:57:47Z","published":"2024-09-26T12:57:47Z","title":"Enriched Functional Tree-Based Classifiers: A Novel Approach Leveraging\n Derivatives and Geometric Features","summary":" The positioning of this research falls within the scalar-on-function\nclassification literature, a field of significant interest across various\ndomains, particularly in statistics, mathematics, and computer science. This\nstudy introduces an advanced methodology for supervised classification by\nintegrating Functional Data Analysis (FDA) with tree-based ensemble techniques\nfor classifying high-dimensional time series. The proposed framework, Enriched\nFunctional Tree-Based Classifiers (EFTCs), leverages derivative and geometric\nfeatures, benefiting from the diversity inherent in ensemble methods to further\nenhance predictive performance and reduce variance. While our approach has been\ntested on the enrichment of Functional Classification Trees (FCTs), Functional\nK-NN (FKNN), Functional Random Forest (FRF), Functional XGBoost (FXGB), and\nFunctional LightGBM (FLGBM), it could be extended to other tree-based and\nnon-tree-based classifiers, with appropriate considerations emerging from this\ninvestigation. Through extensive experimental evaluations on seven real-world\ndatasets and six simulated scenarios, this proposal demonstrates fascinating\nimprovements over traditional approaches, providing new insights into the\napplication of FDA in complex, high-dimensional learning problems.\n","authors":["Fabrizio Maturo","Annamaria Porreca"],"pdf_url":"https://arxiv.org/pdf/2409.17804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17790v1","updated":"2024-09-26T12:37:22Z","published":"2024-09-26T12:37:22Z","title":"CASPFormer: Trajectory Prediction from BEV Images with Deformable\n Attention","summary":" Motion prediction is an important aspect for Autonomous Driving (AD) and\nAdvance Driver Assistance Systems (ADAS). Current state-of-the-art motion\nprediction methods rely on High Definition (HD) maps for capturing the\nsurrounding context of the ego vehicle. Such systems lack scalability in\nreal-world deployment as HD maps are expensive to produce and update in\nreal-time. To overcome this issue, we propose Context Aware Scene Prediction\nTransformer (CASPFormer), which can perform multi-modal motion prediction from\nrasterized Bird-Eye-View (BEV) images. Our system can be integrated with any\nupstream perception module that is capable of generating BEV images. Moreover,\nCASPFormer directly decodes vectorized trajectories without any postprocessing.\nTrajectories are decoded recurrently using deformable attention, as it is\ncomputationally efficient and provides the network with the ability to focus\nits attention on the important spatial locations of the BEV images. In\naddition, we also address the issue of mode collapse for generating multiple\nscene-consistent trajectories by incorporating learnable mode queries. We\nevaluate our model on the nuScenes dataset and show that it reaches\nstate-of-the-art across multiple metrics\n","authors":["Harsh Yadav","Maximilian Schaefer","Kun Zhao","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.17790v1.pdf","comment":"Under Review at ICPR 2024, Kolkata"},{"id":"http://arxiv.org/abs/2409.06364v2","updated":"2024-09-26T12:33:46Z","published":"2024-09-10T09:42:58Z","title":"What happens to diffusion model likelihood when your model is\n conditional?","summary":" Diffusion Models (DMs) iteratively denoise random samples to produce\nhigh-quality data. The iterative sampling process is derived from Stochastic\nDifferential Equations (SDEs), allowing a speed-quality trade-off chosen at\ninference. Another advantage of sampling with differential equations is exact\nlikelihood computation. These likelihoods have been used to rank unconditional\nDMs and for out-of-domain classification. Despite the many existing and\npossible uses of DM likelihoods, the distinct properties captured are unknown,\nespecially in conditional contexts such as Text-To-Image (TTI) or\nText-To-Speech synthesis (TTS). Surprisingly, we find that TTS DM likelihoods\nare agnostic to the text input. TTI likelihood is more expressive but cannot\ndiscern confounding prompts. Our results show that applying DMs to conditional\ntasks reveals inconsistencies and strengthens claims that the properties of DM\nlikelihood are unknown. This impact sheds light on the previously unknown\nnature of DM likelihoods. Although conditional DMs maximise likelihood, the\nlikelihood in question is not as sensitive to the conditioning input as one\nexpects. This investigation provides a new point-of-view on diffusion\nlikelihoods.\n","authors":["Mattias Cross","Anton Ragni"],"pdf_url":"https://arxiv.org/pdf/2409.06364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14590v2","updated":"2024-09-26T12:29:45Z","published":"2024-09-22T20:47:04Z","title":"Explainable AI needs formal notions of explanation correctness","summary":" The use of machine learning (ML) in critical domains such as medicine poses\nrisks and requires regulation. One requirement is that decisions of ML systems\nin high-risk applications should be human-understandable. The field of\n\"explainable artificial intelligence\" (XAI) seemingly addresses this need.\nHowever, in its current form, XAI is unfit to provide quality control for ML;\nit itself needs scrutiny. Popular XAI methods cannot reliably answer important\nquestions about ML models, their training data, or a given test input. We\nrecapitulate results demonstrating that popular XAI methods systematically\nattribute importance to input features that are independent of the prediction\ntarget. This limits their utility for purposes such as model and data\n(in)validation, model improvement, and scientific discovery. We argue that the\nfundamental reason for this limitation is that current XAI methods do not\naddress well-defined problems and are not evaluated against objective criteria\nof explanation correctness. Researchers should formally define the problems\nthey intend to solve first and then design methods accordingly. This will lead\nto notions of explanation correctness that can be theoretically verified and\nobjective metrics of explanation performance that can be assessed using\nground-truth data.\n","authors":["Stefan Haufe","Rick Wilming","Benedict Clark","Rustam Zhumagambetov","Danny Panknin","Ahcène Boubekki"],"pdf_url":"https://arxiv.org/pdf/2409.14590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17786v1","updated":"2024-09-26T12:29:13Z","published":"2024-09-26T12:29:13Z","title":"Predicting the Stay Length of Patients in Hospitals using Convolutional\n Gated Recurrent Deep Learning Model","summary":" Predicting hospital length of stay (LoS) stands as a critical factor in\nshaping public health strategies. This data serves as a cornerstone for\ngovernments to discern trends, patterns, and avenues for enhancing healthcare\ndelivery. In this study, we introduce a robust hybrid deep learning model, a\ncombination of Multi-layer Convolutional (CNNs) deep learning, Gated Recurrent\nUnits (GRU), and Dense neural networks, that outperforms 11 conventional and\nstate-of-the-art Machine Learning (ML) and Deep Learning (DL) methodologies in\naccurately forecasting inpatient hospital stay duration. Our investigation\ndelves into the implementation of this hybrid model, scrutinising variables\nlike geographic indicators tied to caregiving institutions, demographic markers\nencompassing patient ethnicity, race, and age, as well as medical attributes\nsuch as the CCS diagnosis code, APR DRG code, illness severity metrics, and\nhospital stay duration. Statistical evaluations reveal the pinnacle LoS\naccuracy achieved by our proposed model (CNN-GRU-DNN), which averages at 89%\nacross a 10-fold cross-validation test, surpassing LSTM, BiLSTM, GRU, and\nConvolutional Neural Networks (CNNs) by 19%, 18.2%, 18.6%, and 7%,\nrespectively. Accurate LoS predictions not only empower hospitals to optimise\nresource allocation and curb expenses associated with prolonged stays but also\npave the way for novel strategies in hospital stay management. This avenue\nholds promise for catalysing advancements in healthcare research and\ninnovation, inspiring a new era of precision-driven healthcare practices.\n","authors":["Mehdi Neshat","Michael Phipps","Chris A. Browne","Nicole T. Vargas","Seyedali Mirjalili"],"pdf_url":"https://arxiv.org/pdf/2409.17786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08757v4","updated":"2024-09-26T12:17:24Z","published":"2024-03-13T17:55:34Z","title":"Efficient Combinatorial Optimization via Heat Diffusion","summary":" Combinatorial optimization problems are widespread but inherently challenging\ndue to their discrete nature. The primary limitation of existing methods is\nthat they can only access a small fraction of the solution space at each\niteration, resulting in limited efficiency for searching the global optimal. To\novercome this challenge, diverging from conventional efforts of expanding the\nsolver's search scope, we focus on enabling information to actively propagate\nto the solver through heat diffusion. By transforming the target function while\npreserving its optima, heat diffusion facilitates information flow from distant\nregions to the solver, providing more efficient navigation. Utilizing heat\ndiffusion, we propose a framework for solving general combinatorial\noptimization problems. The proposed methodology demonstrates superior\nperformance across a range of the most challenging and widely encountered\ncombinatorial optimizations. Echoing recent advancements in harnessing\nthermodynamics for generative artificial intelligence, our study further\nreveals its significant potential in advancing combinatorial optimization.\n","authors":["Hengyuan Ma","Wenlian Lu","Jianfeng Feng"],"pdf_url":"https://arxiv.org/pdf/2403.08757v4.pdf","comment":"After the rebuttal version for NeurIPS 2024 (poster). Code is\n available in https://github.com/AwakerMhy/HeO"},{"id":"http://arxiv.org/abs/2309.16928v3","updated":"2024-09-26T12:09:22Z","published":"2023-09-29T02:04:24Z","title":"Learning to Receive Help: Intervention-Aware Concept Embedding Models","summary":" Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures\nby constructing and explaining their predictions using a set of high-level\nconcepts. A special property of these models is that they permit concept\ninterventions, wherein users can correct mispredicted concepts and thus improve\nthe model's performance. Recent work, however, has shown that intervention\nefficacy can be highly dependent on the order in which concepts are intervened\non and on the model's architecture and training hyperparameters. We argue that\nthis is rooted in a CBM's lack of train-time incentives for the model to be\nappropriately receptive to concept interventions. To address this, we propose\nIntervention-aware Concept Embedding models (IntCEMs), a novel CBM-based\narchitecture and training paradigm that improves a model's receptiveness to\ntest-time interventions. Our model learns a concept intervention policy in an\nend-to-end fashion from where it can sample meaningful intervention\ntrajectories at train-time. This conditions IntCEMs to effectively select and\nreceive concept interventions when deployed at test-time. Our experiments show\nthat IntCEMs significantly outperform state-of-the-art concept-interpretable\nmodels when provided with test-time concept interventions, demonstrating the\neffectiveness of our approach.\n","authors":["Mateo Espinosa Zarlenga","Katherine M. Collins","Krishnamurthy Dvijotham","Adrian Weller","Zohreh Shams","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2309.16928v3.pdf","comment":"Accepted as a spotlight at the Thirty-seventh Conference on Neural\n Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2408.10672v2","updated":"2024-09-26T11:42:31Z","published":"2024-08-20T09:17:11Z","title":"Neural Exploratory Landscape Analysis","summary":" Recent research in Meta-Black-Box Optimization (MetaBBO) have shown that\nmeta-trained neural networks can effectively guide the design of black-box\noptimizers, significantly reducing the need for expert tuning and delivering\nrobust performance across complex problem distributions. Despite their success,\na paradox remains: MetaBBO still rely on human-crafted Exploratory Landscape\nAnalysis features to inform the meta-level agent about the low-level\noptimization progress. To address the gap, this paper proposes Neural\nExploratory Landscape Analysis (NeurELA), a novel framework that dynamically\nprofiles landscape features through a two-stage, attention-based neural\nnetwork, executed in an entirely end-to-end fashion. NeurELA is pre-trained\nover a variety of MetaBBO algorithms using a multi-task neuroevolution\nstrategy. Extensive experiments show that NeurELA achieves consistently\nsuperior performance when integrated into different and even unseen MetaBBO\ntasks and can be efficiently fine-tuned for further performance boost. This\nadvancement marks a pivotal step in making MetaBBO algorithms more autonomous\nand broadly applicable.The source code of NeurELA can be accessed at\nhttps://anonymous.4open.science/r/Neur-ELA-303C.\n","authors":["Zeyuan Ma","Jiacheng Chen","Hongshu Guo","Yue-Jiao Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2001.07495v5","updated":"2024-09-26T11:42:25Z","published":"2020-01-21T13:05:31Z","title":"Unsupervisedly Learned Representations: Should the Quest be Over?","summary":" After four decades of research there still exists a Classification accuracy\ngap of about 20% between our best Unsupervisedly Learned Representations\nmethods and the accuracy rates achieved by intelligent animals. It thus may\nwell be that we are looking in the wrong direction. A possible solution to this\npuzzle is presented. We demonstrate that Reinforcement Learning can learn\nrepresentations which achieve the same accuracy as that of animals. Our main\nmodest contribution lies in the observations that: a. when applied to a real\nworld environment Reinforcement Learning does not require labels, and thus may\nbe legitimately considered as Unsupervised Learning, and b. in contrast, when\nReinforcement Learning is applied in a simulated environment it does inherently\nrequire labels and should thus be generally be considered as Supervised\nLearning. The corollary of these observations is that further search for\nUnsupervised Learning competitive paradigms which may be trained in simulated\nenvironments may be futile.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2001.07495v5.pdf","comment":"To be published at The 6th International Conference on Machine\n Learning, Optimization and Data Science - LOD 2020"},{"id":"http://arxiv.org/abs/2409.17754v1","updated":"2024-09-26T11:36:08Z","published":"2024-09-26T11:36:08Z","title":"Byzantine-Robust Aggregation for Securing Decentralized Federated\n Learning","summary":" Federated Learning (FL) emerges as a distributed machine learning approach\nthat addresses privacy concerns by training AI models locally on devices.\nDecentralized Federated Learning (DFL) extends the FL paradigm by eliminating\nthe central server, thereby enhancing scalability and robustness through the\navoidance of a single point of failure. However, DFL faces significant\nchallenges in optimizing security, as most Byzantine-robust algorithms proposed\nin the literature are designed for centralized scenarios. In this paper, we\npresent a novel Byzantine-robust aggregation algorithm to enhance the security\nof Decentralized Federated Learning environments, coined WFAgg. This proposal\nhandles the adverse conditions and strength robustness of dynamic decentralized\ntopologies at the same time by employing multiple filters to identify and\nmitigate Byzantine attacks. Experimental results demonstrate the effectiveness\nof the proposed algorithm in maintaining model accuracy and convergence in the\npresence of various Byzantine attack scenarios, outperforming state-of-the-art\ncentralized Byzantine-robust aggregation schemes (such as Multi-Krum or\nClustering). These algorithms are evaluated on an IID image classification\nproblem in both centralized and decentralized scenarios.\n","authors":["Diego Cajaraville-Aboy","Ana Fernández-Vilas","Rebeca P. Díaz-Redondo","Manuel Fernández-Veiga"],"pdf_url":"https://arxiv.org/pdf/2409.17754v1.pdf","comment":"18 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17730v1","updated":"2024-09-26T11:00:19Z","published":"2024-09-26T11:00:19Z","title":"Autoregressive Generation Strategies for Top-K Sequential\n Recommendations","summary":" The goal of modern sequential recommender systems is often formulated in\nterms of next-item prediction. In this paper, we explore the applicability of\ngenerative transformer-based models for the Top-K sequential recommendation\ntask, where the goal is to predict items a user is likely to interact with in\nthe \"near future\".\n We explore commonly used autoregressive generation strategies, including\ngreedy decoding, beam search, and temperature sampling, to evaluate their\nperformance for the Top-K sequential recommendation task. In addition, we\npropose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA)\ngeneration strategies based on multi-sequence generation with temperature\nsampling and subsequent aggregation.\n Experiments on diverse datasets give valuable insights regarding commonly\nused strategies' applicability and show that suggested approaches improve\nperformance on longer time horizons compared to widely-used Top-K prediction\napproach and single-sequence autoregressive generation strategies.\n","authors":["Anna Volodkevich","Danil Gusak","Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2409.17730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17726v1","updated":"2024-09-26T10:56:27Z","published":"2024-09-26T10:56:27Z","title":"Recent advances in interpretable machine learning using structure-based\n protein representations","summary":" Recent advancements in machine learning (ML) are transforming the field of\nstructural biology. For example, AlphaFold, a groundbreaking neural network for\nprotein structure prediction, has been widely adopted by researchers. The\navailability of easy-to-use interfaces and interpretable outcomes from the\nneural network architecture, such as the confidence scores used to color the\npredicted structures, have made AlphaFold accessible even to non-ML experts. In\nthis paper, we present various methods for representing protein 3D structures\nfrom low- to high-resolution, and show how interpretable ML methods can support\ntasks such as predicting protein structures, protein function, and\nprotein-protein interactions. This survey also emphasizes the significance of\ninterpreting and visualizing ML-based inference for structure-based protein\nrepresentations that enhance interpretability and knowledge discovery.\nDeveloping such interpretable approaches promises to further accelerate fields\nincluding drug development and protein design.\n","authors":["Luiz Felipe Vecchietti","Minji Lee","Begench Hangeldiyev","Hyunkyu Jung","Hahnbeom Park","Tae-Kyun Kim","Meeyoung Cha","Ho Min Kim"],"pdf_url":"https://arxiv.org/pdf/2409.17726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17716v1","updated":"2024-09-26T10:38:35Z","published":"2024-09-26T10:38:35Z","title":"QuForge: A Library for Qudits Simulation","summary":" Quantum computing with qudits, an extension of qubits to multiple levels, is\na research field less mature than qubit-based quantum computing. However,\nqudits can offer some advantages over qubits, by representing information with\nfewer separated components. In this article, we present QuForge, a Python-based\nlibrary designed to simulate quantum circuits with qudits. This library\nprovides the necessary quantum gates for implementing quantum algorithms,\ntailored to any chosen qudit dimension. Built on top of differentiable\nframeworks, QuForge supports execution on accelerating devices such as GPUs and\nTPUs, significantly speeding up simulations. It also supports sparse\noperations, leading to a reduction in memory consumption compared to other\nlibraries. Additionally, by constructing quantum circuits as differentiable\ngraphs, QuForge facilitates the implementation of quantum machine learning\nalgorithms, enhancing the capabilities and flexibility of quantum computing\nresearch.\n","authors":["Tiago de Souza Farias","Lucas Friedrich","Jonas Maziero"],"pdf_url":"https://arxiv.org/pdf/2409.17716v1.pdf","comment":"18 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17711v1","updated":"2024-09-26T10:27:19Z","published":"2024-09-26T10:27:19Z","title":"Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation","summary":" News recommendation is a challenging task that involves personalization based\non the interaction history and preferences of each user. Recent works have\nleveraged the power of pretrained language models (PLMs) to directly rank news\nitems by using inference approaches that predominately fall into three\ncategories: pointwise, pairwise, and listwise learning-to-rank. While pointwise\nmethods offer linear inference complexity, they fail to capture crucial\ncomparative information between items that is more effective for ranking tasks.\nConversely, pairwise and listwise approaches excel at incorporating these\ncomparisons but suffer from practical limitations: pairwise approaches are\neither computationally expensive or lack theoretical guarantees, and listwise\nmethods often perform poorly in practice. In this paper, we propose a novel\nframework for PLM-based news recommendation that integrates both pointwise\nrelevance prediction and pairwise comparisons in a scalable manner. We present\na rigorous theoretical analysis of our framework, establishing conditions under\nwhich our approach guarantees improved performance. Extensive experiments show\nthat our approach outperforms the state-of-the-art methods on the MIND and\nAdressa news recommendation datasets.\n","authors":["Nithish Kannen","Yao Ma","Gerrit J. J. van den Burg","Jean Baptiste Faddoul"],"pdf_url":"https://arxiv.org/pdf/2409.17711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15600v2","updated":"2024-09-26T10:26:18Z","published":"2024-08-28T07:48:39Z","title":"Exploring Selective Layer Fine-Tuning in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for fine-tuning\nfoundation models using distributed data in a privacy-preserving manner. Under\nlimited computational resources, clients often find it more practical to\nfine-tune a selected subset of layers, rather than the entire model, based on\ntheir task-specific data. In this study, we provide a thorough theoretical\nexploration of selective layer fine-tuning in FL, emphasizing a flexible\napproach that allows the clients to adjust their selected layers according to\ntheir local data and resources. We theoretically demonstrate that the layer\nselection strategy has a significant impact on model convergence in two\ncritical aspects: the importance of selected layers and the heterogeneous\nchoices across clients. Drawing from these insights, we further propose a\nstrategic layer selection method that utilizes local gradients and regulates\nlayer selections across clients. The extensive experiments on both image and\ntext datasets demonstrate the effectiveness of the proposed strategy compared\nwith several baselines, highlighting its advances in identifying critical\nlayers that adapt to the client heterogeneity and training dynamics in FL.\n","authors":["Yuchang Sun","Yuexiang Xie","Bolin Ding","Yaliang Li","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15953v2","updated":"2024-09-26T10:22:34Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing sequences of interactions between users and items, sequential\nrecommendation models can learn user intent and make predictions about the next\nitem. Next to item interactions, most systems also have interactions with what\nwe call non-item pages: these pages are not related to specific items but still\ncan provide insights of the user's interests, as, for example, navigation\npages.\n We therefore propose a general way to include these non-item pages in\nsequential recommendation models to enhance next-item prediction. First, we\ndemonstrate the influence of non-item pages on following interactions with the\nhypotheses testing framework HypTrails and propose methods for representing\nnon-item pages in sequential recommendation models. Subsequently, we adapt\npopular sequential recommender models to integrate non-item pages and\ninvestigate their performance with different item representation strategies as\nwell as their ability to handle noisy data. To show the general capabilities of\nthe models to integrate non-item pages, we create a synthetic dataset for a\ncontrolled setting and then evaluate the improvements from including non-item\npages on two real-world datasets.\n Our results show that non-item pages are a valuable source of information,\nand incorporating them in sequential recommendation models increases the\nperformance of next-item prediction across all analyzed model architectures.\n","authors":["Elisabeth Fischer","Albin Zehe","Andreas Hotho","Daniel Schlör"],"pdf_url":"https://arxiv.org/pdf/2408.15953v2.pdf","comment":"37 pages, 19 figures; Submitted to ACM TORS"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17704v1","updated":"2024-09-26T10:20:59Z","published":"2024-09-26T10:20:59Z","title":"Transfer Learning in $\\ell_1$ Regularized Regression: Hyperparameter\n Selection Strategy based on Sharp Asymptotic Analysis","summary":" Transfer learning techniques aim to leverage information from multiple\nrelated datasets to enhance prediction quality against a target dataset. Such\nmethods have been adopted in the context of high-dimensional sparse regression,\nand some Lasso-based algorithms have been invented: Trans-Lasso and Pretraining\nLasso are such examples. These algorithms require the statistician to select\nhyperparameters that control the extent and type of information transfer from\nrelated datasets. However, selection strategies for these hyperparameters, as\nwell as the impact of these choices on the algorithm's performance, have been\nlargely unexplored. To address this, we conduct a thorough, precise study of\nthe algorithm in a high-dimensional setting via an asymptotic analysis using\nthe replica method. Our approach reveals a surprisingly simple behavior of the\nalgorithm: Ignoring one of the two types of information transferred to the\nfine-tuning stage has little effect on generalization performance, implying\nthat efforts for hyperparameter selection can be significantly reduced. Our\ntheoretical findings are also empirically supported by real-world applications\non the IMDb dataset.\n","authors":["Koki Okajima","Tomoyuki Obuchi"],"pdf_url":"https://arxiv.org/pdf/2409.17704v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.17703v1","updated":"2024-09-26T10:20:25Z","published":"2024-09-26T10:20:25Z","title":"PGN: The RNN's New Successor is Effective for Long-Range Time Series\n Forecasting","summary":" Due to the recurrent structure of RNN, the long information propagation path\nposes limitations in capturing long-term dependencies, gradient\nexplosion/vanishing issues, and inefficient sequential execution. Based on\nthis, we propose a novel paradigm called Parallel Gated Network (PGN) as the\nnew successor to RNN. PGN directly captures information from previous time\nsteps through the designed Historical Information Extraction (HIE) layer and\nleverages gated mechanisms to select and fuse it with the current time step\ninformation. This reduces the information propagation path to $\\mathcal{O}(1)$,\neffectively addressing the limitations of RNN. To enhance PGN's performance in\nlong-range time series forecasting tasks, we propose a novel temporal modeling\nframework called Temporal PGN (TPGN). TPGN incorporates two branches to\ncomprehensively capture the semantic information of time series. One branch\nutilizes PGN to capture long-term periodic patterns while preserving their\nlocal characteristics. The other branch employs patches to capture short-term\ninformation and aggregate the global representation of the series. TPGN\nachieves a theoretical complexity of $\\mathcal{O}(\\sqrt{L})$, ensuring\nefficiency in its operations. Experimental results on five benchmark datasets\ndemonstrate the state-of-the-art (SOTA) performance and high efficiency of\nTPGN, further confirming the effectiveness of PGN as the new successor to RNN\nin long-range time series forecasting. The code is available in this\nrepository: \\url{https://github.com/Water2sea/TPGN}.\n","authors":["Yuxin Jia","Youfang Lin","Jing Yu","Shuo Wang","Tianhao Liu","Huaiyu Wan"],"pdf_url":"https://arxiv.org/pdf/2409.17703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17699v1","updated":"2024-09-26T10:12:19Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17163v2","updated":"2024-09-26T10:07:06Z","published":"2024-07-24T11:07:20Z","title":"dlordinal: a Python package for deep ordinal classification","summary":" dlordinal is a new Python library that unifies many recent deep ordinal\nclassification methodologies available in the literature. Developed using\nPyTorch as underlying framework, it implements the top performing\nstate-of-the-art deep learning techniques for ordinal classification problems.\nOrdinal approaches are designed to leverage the ordering information present in\nthe target variable. Specifically, it includes loss functions, various output\nlayers, dropout techniques, soft labelling methodologies, and other\nclassification strategies, all of which are appropriately designed to\nincorporate the ordinal information. Furthermore, as the performance metrics to\nassess novel proposals in ordinal classification depend on the distance between\ntarget and predicted classes in the ordinal scale, suitable ordinal evaluation\nmetrics are also included. dlordinal is distributed under the BSD-3-Clause\nlicense and is available at https://github.com/ayrna/dlordinal.\n","authors":["Francisco Bérchez-Moreno","Víctor M. Vargas","Rafael Ayllón-Gavilán","David Guijo-Rubio","César Hervás-Martínez","Juan C. Fernández","Pedro A. Gutiérrez"],"pdf_url":"https://arxiv.org/pdf/2407.17163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2409.17691v1","updated":"2024-09-26T09:56:13Z","published":"2024-09-26T09:56:13Z","title":"Efficient Bias Mitigation Without Privileged Information","summary":" Deep neural networks trained via empirical risk minimisation often exhibit\nsignificant performance disparities across groups, particularly when group and\ntask labels are spuriously correlated (e.g., \"grassy background\" and \"cows\").\nExisting bias mitigation methods that aim to address this issue often either\nrely on group labels for training or validation, or require an extensive\nhyperparameter search. Such data and computational requirements hinder the\npractical deployment of these methods, especially when datasets are too large\nto be group-annotated, computational resources are limited, and models are\ntrained through already complex pipelines. In this paper, we propose Targeted\nAugmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework\nthat leverages the entire training history of a helper model to identify\nspurious samples, and generate a group-balanced training set from which a\nrobust model can be trained. We show that TAB improves worst-group performance\nwithout any group information or model selection, outperforming existing\nmethods while maintaining overall accuracy.\n","authors":["Mateo Espinosa Zarlenga","Swami Sankaranarayanan","Jerone T. A. Andrews","Zohreh Shams","Mateja Jamnik","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.17691v1.pdf","comment":"Accepted at the 18th European Conference on Computer Vision (ECCV\n 2024) as an Oral presentation"},{"id":"http://arxiv.org/abs/2312.05181v3","updated":"2024-09-26T09:52:13Z","published":"2023-12-08T17:08:03Z","title":"Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable\n Tensor Collections","summary":" Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining\ndata, model, and pipeline parallelism, to use large GPU clusters efficiently.\nLong-running jobs may experience changes to their GPU allocation: (i) resource\nelasticity during training adds or removes GPUs; (ii) hardware maintenance may\nrequire redeployment on different GPUs; and (iii) GPU failures force jobs to\nrun with fewer devices. Current DL frameworks tie jobs to a set of GPUs and\nthus lack support for these scenarios. In particular, they cannot change the\nmulti-dimensional parallelism of an already-running job in an efficient and\nmodel-independent way.\n We describe Scalai, a state management library for DL systems that enables\njobs to change their parallelism dynamically after the GPU allocation is\nupdated at runtime. Scalai achieves this through a new abstraction, a\nparallelizable tensor collection (PTC), that externalizes the job state during\ntraining. After a GPU change, Scalai uses the PTC to transform the job state:\nthe PTC repartitions the dataset state under data parallelism and exposes it to\nDL workers through a virtual file system; and the PTC obtains the model state\nas partitioned checkpoints and transforms them to reflect the new\nparallelization configuration. For efficiency, Scalai executes PTC\ntransformations in parallel with minimum data movement between workers. Our\nexperiments show that Scalai enables DL jobs to support dynamic parallelization\nwith low overhead.\n","authors":["Marcel Wagenländer","Guo Li","Bo Zhao","Luo Mai","Peter Pietzuch"],"pdf_url":"https://arxiv.org/pdf/2312.05181v3.pdf","comment":"The 30th Symposium on Operating Systems Principles (SOSP24)"},{"id":"http://arxiv.org/abs/2409.17687v1","updated":"2024-09-26T09:51:29Z","published":"2024-09-26T09:51:29Z","title":"Graph Edit Distance with General Costs Using Neural Set Divergence","summary":" Graph Edit Distance (GED) measures the (dis-)similarity between two given\ngraphs, in terms of the minimum-cost edit sequence that transforms one graph to\nthe other. However, the exact computation of GED is NP-Hard, which has recently\nmotivated the design of neural methods for GED estimation. However, they do not\nexplicitly account for edit operations with different costs. In response, we\npropose GRAPHEDX, a neural GED estimator that can work with general costs\nspecified for the four edit operations, viz., edge deletion, edge addition,\nnode deletion and node addition. We first present GED as a quadratic assignment\nproblem (QAP) that incorporates these four costs. Then, we represent each graph\nas a set of node and edge embeddings and use them to design a family of neural\nset divergence surrogates. We replace the QAP terms corresponding to each\noperation with their surrogates. Computing such neural set divergence require\naligning nodes and edges of the two graphs. We learn these alignments using a\nGumbel-Sinkhorn permutation generator, additionally ensuring that the node and\nedge alignments are consistent with each other. Moreover, these alignments are\ncognizant of both the presence and absence of edges between node-pairs.\nExperiments on several datasets, under a variety of edit cost settings, show\nthat GRAPHEDX consistently outperforms state-of-the-art methods and heuristics\nin terms of prediction error.\n","authors":["Eeshaan Jain","Indradyumna Roy","Saswat Meher","Soumen Chakrabarti","Abir De"],"pdf_url":"https://arxiv.org/pdf/2409.17687v1.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17685v1","updated":"2024-09-26T09:51:08Z","published":"2024-09-26T09:51:08Z","title":"Artificial Data Point Generation in Clustered Latent Space for Small\n Medical Datasets","summary":" One of the growing trends in machine learning is the use of data generation\ntechniques, since the performance of machine learning models is dependent on\nthe quantity of the training dataset. However, in many medical applications,\ncollecting large datasets is challenging due to resource constraints, which\nleads to overfitting and poor generalization. This paper introduces a novel\nmethod, Artificial Data Point Generation in Clustered Latent Space (AGCL),\ndesigned to enhance classification performance on small medical datasets\nthrough synthetic data generation. The AGCL framework involves feature\nextraction, K-means clustering, cluster evaluation based on a class separation\nmetric, and the generation of synthetic data points from clusters with distinct\nclass representations. This method was applied to Parkinson's disease\nscreening, utilizing facial expression data, and evaluated across multiple\nmachine learning classifiers. Experimental results demonstrate that AGCL\nsignificantly improves classification accuracy compared to baseline, GN and\nkNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and\ncross-validation accuracy of 90.90% in majority voting over different emotions,\nconfirming its effectiveness in augmenting small datasets.\n","authors":["Yasaman Haghbin","Hadi Moradi","Reshad Hosseini"],"pdf_url":"https://arxiv.org/pdf/2409.17685v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17684v1","updated":"2024-09-26T09:51:07Z","published":"2024-09-26T09:51:07Z","title":"Preserving logical and functional dependencies in synthetic tabular data","summary":" Dependencies among attributes are a common aspect of tabular data. However,\nwhether existing tabular data generation algorithms preserve these dependencies\nwhile generating synthetic data is yet to be explored. In addition to the\nexisting notion of functional dependencies, we introduce the notion of logical\ndependencies among the attributes in this article. Moreover, we provide a\nmeasure to quantify logical dependencies among attributes in tabular data.\nUtilizing this measure, we compare several state-of-the-art synthetic data\ngeneration algorithms and test their capability to preserve logical and\nfunctional dependencies on several publicly available datasets. We demonstrate\nthat currently available synthetic tabular data generation algorithms do not\nfully preserve functional dependencies when they generate synthetic datasets.\nIn addition, we also showed that some tabular synthetic data generation models\ncan preserve inter-attribute logical dependencies. Our review and comparison of\nthe state-of-the-art reveal research needs and opportunities to develop\ntask-specific synthetic tabular data generation models.\n","authors":["Chaithra Umesh","Kristian Schultz","Manjunath Mahendra","Saparshi Bej","Olaf Wolkenhauer"],"pdf_url":"https://arxiv.org/pdf/2409.17684v1.pdf","comment":"Submitted to Pattern Recognition Journal"},{"id":"http://arxiv.org/abs/2208.13197v2","updated":"2024-09-26T09:40:31Z","published":"2022-08-28T10:47:32Z","title":"IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided\n Feature Extraction","summary":" Disruption prediction has made rapid progress in recent years, especially in\nmachine learning (ML)-based methods. Understanding why a predictor makes a\ncertain prediction can be as crucial as the prediction's accuracy for future\ntokamak disruption predictors. The purpose of most disruption predictors is\naccuracy or cross-machine capability. However, if a disruption prediction model\ncan be interpreted, it can tell why certain samples are classified as\ndisruption precursors. This allows us to tell the types of incoming disruption\nand gives us insight into the mechanism of disruption. This paper designs a\ndisruption predictor called Interpretable Disruption Predictor based On\nPhysics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction\nperformance of the model is effectively improved by extracting physics-guided\nfeatures. A high-performance model is required to ensure the validity of the\ninterpretation results. The interpretability study of IDP-PGFE provides an\nunderstanding of J-TEXT disruption and is generally consistent with existing\ncomprehension of disruption. IDP-PGFE has been applied to the disruption due to\ncontinuously increasing density towards density limit experiments on J-TEXT.\nThe time evolution of the PGFE features contribution demonstrates that the\napplication of ECRH triggers radiation-caused disruption, which lowers the\ndensity at disruption. While the application of RMP indeed raises the density\nlimit in J-TEXT. The interpretability study guides intuition on the physical\nmechanisms of density limit disruption that RMPs affect not only the MHD\ninstabilities but also the radiation profile, which delays density limit\ndisruption.\n","authors":["Chengshuo Shen","Wei Zheng","Yonghua Ding","Xinkun Ai","Fengming Xue","Yu Zhong","Nengchao Wang","Li Gao","Zhipeng Chen","Zhoujun Yang","Zhongyong Chen","Yuan Pan","J-TEXT team"],"pdf_url":"https://arxiv.org/pdf/2208.13197v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2409.17677v1","updated":"2024-09-26T09:36:47Z","published":"2024-09-26T09:36:47Z","title":"Optimal Memorization Capacity of Transformers","summary":" Recent research in the field of machine learning has increasingly focused on\nthe memorization capacity of Transformers, but how efficient they are is not\nyet well understood. We demonstrate that Transformers can memorize labels with\n$\\tilde{O}(\\sqrt{N})$ parameters in a next-token prediction setting for $N$\ninput sequences of length $n$, which is proved to be optimal up to logarithmic\nfactors. This indicates that Transformers can efficiently perform memorization\nwith little influence from the input length $n$ owing to the benefit of\nparameter sharing. We also analyze the memorization capacity in the\nsequence-to-sequence setting, and find that $\\tilde{O}(\\sqrt{nN})$ parameters\nare not only sufficient, but also necessary at least for Transformers with\nhardmax. These results suggest that while self-attention mechanisms can\nefficiently identify input sequences, the feed-forward network becomes a\nbottleneck when associating a label to each token.\n","authors":["Tokio Kajitsuka","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13503v2","updated":"2024-09-26T09:26:05Z","published":"2024-09-20T13:44:00Z","title":"SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous\n Federated Learning Framework","summary":" Traditional federated learning (FL) frameworks rely heavily on terrestrial\nnetworks, where coverage limitations and increasing bandwidth congestion\nsignificantly hinder model convergence. Fortunately, the advancement of\nlow-Earth orbit (LEO) satellite networks offers promising new communication\navenues to augment traditional terrestrial FL. Despite this potential, the\nlimited satellite-ground communication bandwidth and the heterogeneous\noperating environments of ground devices-including variations in data,\nbandwidth, and computing power-pose substantial challenges for effective and\nrobust satellite-assisted FL. To address these challenges, we propose SatFed, a\nresource-efficient satellite-assisted heterogeneous FL framework. SatFed\nimplements freshness-based model prioritization queues to optimize the use of\nhighly constrained satellite-ground bandwidth, ensuring the transmission of the\nmost critical models. Additionally, a multigraph is constructed to capture\nreal-time heterogeneous relationships between devices, including data\ndistribution, terrestrial bandwidth, and computing capability. This multigraph\nenables SatFed to aggregate satellite-transmitted models into peer guidance,\nenhancing local training in heterogeneous environments. Extensive experiments\nwith real-world LEO satellite networks demonstrate that SatFed achieves\nsuperior performance and robustness compared to state-of-the-art benchmarks.\n","authors":["Yuxin Zhang","Zheng Lin","Zhe Chen","Zihan Fang","Wenjun Zhu","Xianhao Chen","Jin Zhao","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2409.13503v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.14816v2","updated":"2024-09-26T09:11:28Z","published":"2024-09-23T08:46:15Z","title":"VARADE: a Variational-based AutoRegressive model for Anomaly Detection\n on the Edge","summary":" Detecting complex anomalies on massive amounts of data is a crucial task in\nIndustry 4.0, best addressed by deep learning. However, available solutions are\ncomputationally demanding, requiring cloud architectures prone to latency and\nbandwidth issues. This work presents VARADE, a novel solution implementing a\nlight autoregressive framework based on variational inference, which is best\nsuited for real-time execution on the edge. The proposed approach was validated\non a robotic arm, part of a pilot production line, and compared with several\nstate-of-the-art algorithms, obtaining the best trade-off between anomaly\ndetection accuracy, power consumption and inference frequency on two different\nedge platforms.\n","authors":["Alessio Mascolini","Sebastiano Gaiardelli","Francesco Ponzio","Nicola Dall'Ora","Enrico Macii","Sara Vinco","Santa Di Cataldo","Franco Fummi"],"pdf_url":"https://arxiv.org/pdf/2409.14816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2409.15246v2","updated":"2024-09-26T08:48:03Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems play a crucial role in achieving Sustainable\nDevelopment Goals by collecting and analyzing vital global data through\nsatellite networks. These systems are essential for tasks like mapping,\ndisaster monitoring, and resource management, but they face challenges in\nprocessing and transmitting large volumes of EO data, especially in specialized\nfields such as agriculture and real-time disaster response. Domain-adapted\nLarge Language Models (LLMs) provide a promising solution by facilitating data\nfusion between extensive EO data and semantic EO data. By improving integration\nand interpretation of diverse datasets, LLMs address the challenges of\nprocessing specialized information in agriculture and disaster response\napplications. This fusion enhances the accuracy and relevance of transmitted\ndata. This paper presents a framework for semantic communication in EO\nsatellite networks, aimed at improving data transmission efficiency and overall\nsystem performance through cognitive processing techniques. The proposed system\nemploys Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic\nData Augmentation (SA) to focus on relevant information while minimizing\ncommunication overhead. By integrating cognitive semantic processing and\ninter-satellite links, the framework enhances the analysis and transmission of\nmultispectral satellite imagery, improving object detection, pattern\nrecognition, and real-time decision-making. The introduction of Cognitive\nSemantic Augmentation (CSA) allows satellites to process and transmit semantic\ninformation, boosting adaptability to changing environments and application\nneeds. This end-to-end architecture is tailored for next-generation satellite\nnetworks, such as those supporting 6G, and demonstrates significant\nimprovements in efficiency and accuracy.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v2.pdf","comment":"18 pages, 10 figures, magazine"},{"id":"http://arxiv.org/abs/2409.17643v1","updated":"2024-09-26T08:46:48Z","published":"2024-09-26T08:46:48Z","title":"Efficient Fairness-Performance Pareto Front Computation","summary":" There is a well known intrinsic trade-off between the fairness of a\nrepresentation and the performance of classifiers derived from the\nrepresentation. Due to the complexity of optimisation algorithms in most modern\nrepresentation learning approaches, for a given method it may be non-trivial to\ndecide whether the obtained fairness-performance curve of the method is\noptimal, i.e., whether it is close to the true Pareto front for these\nquantities for the underlying data distribution.\n In this paper we propose a new method to compute the optimal Pareto front,\nwhich does not require the training of complex representation models. We show\nthat optimal fair representations possess several useful structural properties,\nand that these properties enable a reduction of the computation of the Pareto\nFront to a compact discrete problem. We then also show that these compact\napproximating problems can be efficiently solved via off-the shelf\nconcave-convex programming methods.\n Since our approach is independent of the specific model of representations,\nit may be used as the benchmark to which representation learning algorithms may\nbe compared. We experimentally evaluate the approach on a number of real world\nbenchmark datasets.\n","authors":["Mark Kozdoba","Binyamin Perets","Shie Mannor"],"pdf_url":"https://arxiv.org/pdf/2409.17643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02733v3","updated":"2024-09-26T08:45:22Z","published":"2023-06-05T09:29:46Z","title":"Realising Synthetic Active Inference Agents, Part II: Variational\n Message Updates","summary":" The Free Energy Principle (FEP) describes (biological) agents as minimising a\nvariational Free Energy (FE) with respect to a generative model of their\nenvironment. Active Inference (AIF) is a corollary of the FEP that describes\nhow agents explore and exploit their environment by minimising an expected FE\nobjective. In two related papers, we describe a scalable, epistemic approach to\nsynthetic AIF, by message passing on free-form Forney-style Factor Graphs\n(FFGs). A companion paper (part I) introduces a Constrained FFG (CFFG) notation\nthat visually represents (generalised) FE objectives for AIF. The current paper\n(part II) derives message passing algorithms that minimise (generalised) FE\nobjectives on a CFFG by variational calculus. A comparison between simulated\nBethe and generalised FE agents illustrates how the message passing approach to\nsynthetic AIF induces epistemic behaviour on a T-maze navigation task.\nExtension of the T-maze simulation to 1) learning goal statistics, and 2) a\nmulti-agent bargaining setting, illustrate how this approach encourages reuse\nof nodes and updates in alternative settings. With a full message passing\naccount of synthetic AIF agents, it becomes possible to derive and reuse\nmessage updates across models and move closer to industrial applications of\nsynthetic AIF.\n","authors":["Thijs van de Laar","Magnus Koudahl","Bert de Vries"],"pdf_url":"https://arxiv.org/pdf/2306.02733v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17635v1","updated":"2024-09-26T08:32:31Z","published":"2024-09-26T08:32:31Z","title":"FlowMAC: Conditional Flow Matching for Audio Coding at Low Bit Rates","summary":" This paper introduces FlowMAC, a novel neural audio codec for high-quality\ngeneral audio compression at low bit rates based on conditional flow matching\n(CFM). FlowMAC jointly learns a mel spectrogram encoder, quantizer and decoder.\nAt inference time the decoder integrates a continuous normalizing flow via an\nODE solver to generate a high-quality mel spectrogram. This is the first time\nthat a CFM-based approach is applied to general audio coding, enabling a\nscalable, simple and memory efficient training. Our subjective evaluations show\nthat FlowMAC at 3 kbps achieves similar quality as state-of-the-art GAN-based\nand DDPM-based neural audio codecs at double the bit rate. Moreover, FlowMAC\noffers a tunable inference pipeline, which permits to trade off complexity and\nquality. This enables real-time coding on CPU, while maintaining high\nperceptual quality.\n","authors":["Nicola Pia","Martin Strauss","Markus Multrus","Bernd Edler"],"pdf_url":"https://arxiv.org/pdf/2409.17635v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.17632v1","updated":"2024-09-26T08:28:14Z","published":"2024-09-26T08:28:14Z","title":"Model-Free Stochastic Process Modeling and Optimization using\n Normalizing Flows","summary":" Real-world chemical processes often exhibit stochastic dynamics with\nnon-trivial correlations and state-dependent fluctuations. However, most\nprocess models simply add stationary noise terms to a deterministic prediction,\nwhich can lead to inaccurate predictions. This work proposes using conditional\nnormalizing flows as discrete-time models (DTMs) to learn the stochastic\ndynamics of chemical processes. Normalizing flows learn an explicit expression\nof the system states' probability density function (PDF) given prior states and\ncontrol inputs. The resulting model naturally allows for formulating stochastic\nand probabilistic setpoint-tracking objectives and chance constraints. In\napplications to a continuous reactor and a reactor cascade, the normalizing\nflow yields stable simulations over long time horizons and high-quality results\nin stochastic and probabilistic MPC formulation for open-loop control.\nFurthermore, a chance-constrained optimization finds reliable startup controls\nfor the reactor cascade with stochastic reactions. In conclusion, the\nconditional normalizing flow presents an excellent choice for modeling\nnonlinear stochastic dynamics.\n","authors":["Eike Cramer"],"pdf_url":"https://arxiv.org/pdf/2409.17632v1.pdf","comment":"13 pages, 7 Figures, 5 Tables"},{"id":"http://arxiv.org/abs/2409.17628v1","updated":"2024-09-26T08:22:09Z","published":"2024-09-26T08:22:09Z","title":"Convolutional Signal Propagation: A Simple Scalable Algorithm for\n Hypergraphs","summary":" Last decade has seen the emergence of numerous methods for learning on\ngraphs, particularly Graph Neural Networks (GNNs). These methods, however, are\noften not directly applicable to more complex structures like bipartite graphs\n(equivalent to hypergraphs), which represent interactions among two entity\ntypes (e.g. a user liking a movie). This paper proposes Convolutional Signal\nPropagation (CSP), a non-parametric simple and scalable method that natively\noperates on bipartite graphs (hypergraphs) and can be implemented with just a\nfew lines of code. After defining CSP, we demonstrate its relationship with\nwell-established methods like label propagation, Naive Bayes, and Hypergraph\nConvolutional Networks. We evaluate CSP against several reference methods on\nreal-world datasets from multiple domains, focusing on retrieval and\nclassification tasks. Our results show that CSP offers competitive performance\nwhile maintaining low computational complexity, making it an ideal first choice\nas a baseline for hypergraph node classification and retrieval. Moreover,\ndespite operating on hypergraphs, CSP achieves good results in tasks typically\nnot associated with hypergraphs, such as natural language processing.\n","authors":["Pavel Procházka","Marek Dědič","Lukáš Bajer"],"pdf_url":"https://arxiv.org/pdf/2409.17628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11531v2","updated":"2024-09-26T08:20:59Z","published":"2023-09-20T10:50:28Z","title":"EPTQ: Enhanced Post-Training Quantization via Hessian-guided\n Network-wise Optimization","summary":" Quantization is a key method for deploying deep neural networks on edge\ndevices with limited memory and computation resources. Recent improvements in\nPost-Training Quantization (PTQ) methods were achieved by an additional local\noptimization process for learning the weight quantization rounding policy.\nHowever, a gap exists when employing network-wise optimization with small\nrepresentative datasets. In this paper, we propose a new method for enhanced\nPTQ (EPTQ) that employs a network-wise quantization optimization process, which\nbenefits from considering cross-layer dependencies during optimization. EPTQ\nenables network-wise optimization with a small representative dataset using a\nnovel sample-layer attention score based on a label-free Hessian matrix upper\nbound. The label-free approach makes our method suitable for the PTQ scheme. We\ngive a theoretical analysis for the said bound and use it to construct a\nknowledge distillation loss that guides the optimization to focus on the more\nsensitive layers and samples. In addition, we leverage the Hessian upper bound\nto improve the weight quantization parameters selection by focusing on the more\nsensitive elements in the weight tensors. Empirically, by employing EPTQ we\nachieve state-of-the-art results on various models, tasks, and datasets,\nincluding ImageNet classification, COCO object detection, and Pascal-VOC for\nsemantic segmentation.\n","authors":["Ofir Gordon","Elad Cohen","Hai Victor Habi","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2309.11531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17625v1","updated":"2024-09-26T08:20:05Z","published":"2024-09-26T08:20:05Z","title":"Benign or Not-Benign Overfitting in Token Selection of Attention\n Mechanism","summary":" Modern over-parameterized neural networks can be trained to fit the training\ndata perfectly while still maintaining a high generalization performance. This\n\"benign overfitting\" phenomenon has been studied in a surge of recent\ntheoretical work; however, most of these studies have been limited to linear\nmodels or two-layer neural networks. In this work, we analyze benign\noverfitting in the token selection mechanism of the attention architecture,\nwhich characterizes the success of transformer models. We first show the\nexistence of a benign overfitting solution and explain its mechanism in the\nattention architecture. Next, we discuss whether the model converges to such a\nsolution, raising the difficulties specific to the attention architecture. We\nthen present benign overfitting cases and not-benign overfitting cases by\nconditioning different scenarios based on the behavior of attention\nprobabilities during training. To the best of our knowledge, this is the first\nstudy to characterize benign overfitting for the attention mechanism.\n","authors":["Keitaro Sakamoto","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17622v1","updated":"2024-09-26T08:16:59Z","published":"2024-09-26T08:16:59Z","title":"Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric\n GNNs","summary":" Geometric graph neural networks (GNNs) have emerged as powerful tools for\nmodeling molecular geometry. However, they encounter limitations in effectively\ncapturing long-range interactions in large molecular systems. To address this\nchallenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs\nto expand the scope of their capabilities by incorporating mesh points\nalongside atoms and reimaging traditional mathematical operations in a\ntrainable manner. Neural P$^3$M exhibits flexibility across a wide range of\nmolecular systems and demonstrates remarkable accuracy in predicting energies\nand forces, outperforming on benchmarks such as the MD22 dataset. It also\nachieves an average improvement of 22% on the OE62 dataset while integrating\nwith various architectures.\n","authors":["Yusong Wang","Chaoran Cheng","Shaoning Li","Yuxuan Ren","Bin Shao","Ge Liu","Pheng-Ann Heng","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.17622v1.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.16959v2","updated":"2024-09-26T08:12:59Z","published":"2024-06-21T03:21:22Z","title":"Recurrent Stochastic Configuration Networks for Temporal Data Analytics","summary":" Temporal data modelling techniques with neural networks are useful in many\ndomain applications, including time-series forecasting and control engineering.\nThis paper aims at developing a recurrent version of stochastic configuration\nnetworks (RSCNs) for problem solving, where we have no underlying assumption on\nthe dynamic orders of the input variables. Given a collection of historical\ndata, we first build an initial RSCN model in the light of a supervisory\nmechanism, followed by an online update of the output weights by using a\nprojection algorithm. Some theoretical results are established, including the\necho state property, the universal approximation property of RSCNs for both the\noffline and online learnings, and the convergence of the output weights. The\nproposed RSCN model is remarkably distinguished from the well-known echo state\nnetworks (ESNs) in terms of the way of assigning the input random weight matrix\nand a special structure of the random feedback matrix. A comprehensive\ncomparison study among the long short-term memory (LSTM) network, the original\nESN, and several state-of-the-art ESN methods such as the simple cycle\nreservoir (SCR), the polynomial ESN (PESN), the leaky-integrator ESN (LIESN)\nand RSCN is carried out. Numerical results clearly indicate that the proposed\nRSCN performs favourably over all of the datasets.\n","authors":["Dianhui Wang","Gang Dang"],"pdf_url":"https://arxiv.org/pdf/2406.16959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v1","updated":"2024-09-26T08:03:19Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic dataset that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16341v2","updated":"2024-09-26T07:54:10Z","published":"2024-09-24T17:20:02Z","title":"Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs","summary":" Training large language models (LLMs) for external tool usage is a rapidly\nexpanding field, with recent research focusing on generating synthetic data to\naddress the shortage of available data. However, the absence of systematic data\nquality checks poses complications for properly training and testing models. To\nthat end, we propose two approaches for assessing the reliability of data for\ntraining LLMs to use external tools. The first approach uses intuitive,\nhuman-defined correctness criteria. The second approach uses a model-driven\nassessment with in-context evaluation. We conduct a thorough evaluation of data\nquality on two popular benchmarks, followed by an extrinsic evaluation that\nshowcases the impact of data quality on model performance. Our results\ndemonstrate that models trained on high-quality data outperform those trained\non unvalidated data, even when trained with a smaller quantity of data. These\nfindings empirically support the significance of assessing and ensuring the\nreliability of training data for tool-using LLMs.\n","authors":["Shadi Iskander","Nachshon Cohen","Zohar Karnin","Ori Shapira","Sofia Tolmach"],"pdf_url":"https://arxiv.org/pdf/2409.16341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04428v2","updated":"2024-09-26T07:53:04Z","published":"2024-09-06T17:48:44Z","title":"Hybrid Spiking Neural Networks for Low-Power Intra-Cortical\n Brain-Machine Interfaces","summary":" Intra-cortical brain-machine interfaces (iBMIs) have the potential to\ndramatically improve the lives of people with paraplegia by restoring their\nability to perform daily activities. However, current iBMIs suffer from\nscalability and mobility limitations due to bulky hardware and wiring. Wireless\niBMIs offer a solution but are constrained by a limited data rate. To overcome\nthis challenge, we are investigating hybrid spiking neural networks for\nembedded neural decoding in wireless iBMIs. The networks consist of a temporal\nconvolution-based compression followed by recurrent processing and a final\ninterpolation back to the original sequence length. As recurrent units, we\nexplore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons,\nand a combination of both - spiking GRUs (sGRUs) and analyze their differences\nin terms of accuracy, footprint, and activation sparsity. To that end, we train\ndecoders on the \"Nonhuman Primate Reaching with Multichannel Sensorimotor\nCortex Electrophysiology\" dataset and evaluate it using the NeuroBench\nframework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural\nDecoding. Our approach achieves high accuracy in predicting velocities of\nprimate reaching movements from multichannel primary motor cortex recordings\nwhile maintaining a low number of synaptic operations, surpassing the current\nbaseline models in the NeuroBench framework. This work highlights the potential\nof hybrid neural networks to facilitate wireless iBMIs with high decoding\nprecision and a substantial increase in the number of monitored neurons, paving\nthe way toward more advanced neuroprosthetic technologies.\n","authors":["Alexandru Vasilache","Jann Krausse","Klaus Knobloch","Juergen Becker"],"pdf_url":"https://arxiv.org/pdf/2409.04428v2.pdf","comment":"This work has been accepted at the 2024 IEEE Biomedical Circuits and\n Systems Conference"},{"id":"http://arxiv.org/abs/2408.03944v2","updated":"2024-09-26T07:47:50Z","published":"2024-07-22T03:56:27Z","title":"Improving Fast Adversarial Training Paradigm: An Example Taxonomy\n Perspective","summary":" While adversarial training is an effective defense method against adversarial\nattacks, it notably increases the training cost. To this end, fast adversarial\ntraining (FAT) is presented for efficient training and has become a hot\nresearch topic. However, FAT suffers from catastrophic overfitting, which leads\nto a performance drop compared with multi-step adversarial training. However,\nthe cause of catastrophic overfitting remains unclear and lacks exploration. In\nthis paper, we present an example taxonomy in FAT, which identifies that\ncatastrophic overfitting is caused by the imbalance between the inner and outer\noptimization in FAT. Furthermore, we investigated the impact of varying degrees\nof training loss, revealing a correlation between training loss and\ncatastrophic overfitting. Based on these observations, we redesign the loss\nfunction in FAT with the proposed dynamic label relaxation to concentrate the\nloss range and reduce the impact of misclassified examples. Meanwhile, we\nintroduce batch momentum initialization to enhance the diversity to prevent\ncatastrophic overfitting in an efficient manner. Furthermore, we also propose\nCatastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate\ntraining strategy for examples based on their loss degree. Our proposed method,\nnamed example taxonomy aware FAT (ETA), establishes an improved paradigm for\nFAT. Experiment results demonstrate our ETA achieves state-of-the-art\nperformance. Comprehensive experiments on four standard datasets demonstrate\nthe competitiveness of our proposed method.\n","authors":["Jie Gui","Chengze Jiang","Minjing Dong","Kun Tong","Xinli Shi","Yuan Yan Tang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03944v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.17605v1","updated":"2024-09-26T07:43:12Z","published":"2024-09-26T07:43:12Z","title":"Good Data Is All Imitation Learning Needs","summary":" In this paper, we address the limitations of traditional teacher-student\nmodels, imitation learning, and behaviour cloning in the context of\nAutonomous/Automated Driving Systems (ADS), where these methods often struggle\nwith incomplete coverage of real-world scenarios. To enhance the robustness of\nsuch models, we introduce the use of Counterfactual Explanations (CFEs) as a\nnovel data augmentation technique for end-to-end ADS. CFEs, by generating\ntraining samples near decision boundaries through minimal input modifications,\nlead to a more comprehensive representation of expert driver strategies,\nparticularly in safety-critical scenarios. This approach can therefore help\nimprove the model's ability to handle rare and challenging driving events, such\nas anticipating darting out pedestrians, ultimately leading to safer and more\ntrustworthy decision-making for ADS. Our experiments in the CARLA simulator\ndemonstrate that CF-Driver outperforms the current state-of-the-art method,\nachieving a higher driving score and lower infraction rates. Specifically,\nCF-Driver attains a driving score of 84.2, surpassing the previous best model\nby 15.02 percentage points. These results highlight the effectiveness of\nincorporating CFEs in training end-to-end ADS. To foster further research, the\nCF-Driver code is made publicly available.\n","authors":["Amir Samadi","Konstantinos Koufos","Kurt Debattista","Mehrdad Dianati"],"pdf_url":"https://arxiv.org/pdf/2409.17605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17604v1","updated":"2024-09-26T07:40:47Z","published":"2024-09-26T07:40:47Z","title":"RmGPT: Rotating Machinery Generative Pretrained Model","summary":" In industry, the reliability of rotating machinery is critical for production\nefficiency and safety. Current methods of Prognostics and Health Management\n(PHM) often rely on task-specific models, which face significant challenges in\nhandling diverse datasets with varying signal characteristics, fault modes and\noperating conditions. Inspired by advancements in generative pretrained models,\nwe propose RmGPT, a unified model for diagnosis and prognosis tasks. RmGPT\nintroduces a novel token-based framework, incorporating Signal Tokens, Prompt\nTokens, Time-Frequency Task Tokens and Fault Tokens to handle heterogeneous\ndata within a unified model architecture. We leverage self-supervised learning\nfor robust feature extraction and introduce a next signal token prediction\npretraining strategy, alongside efficient prompt learning for task-specific\nadaptation. Extensive experiments demonstrate that RmGPT significantly\noutperforms state-of-the-art algorithms, achieving near-perfect accuracy in\ndiagnosis tasks and exceptionally low errors in prognosis tasks. Notably, RmGPT\nexcels in few-shot learning scenarios, achieving 92% accuracy in 16-class\none-shot experiments, highlighting its adaptability and robustness. This work\nestablishes RmGPT as a powerful PHM foundation model for rotating machinery,\nadvancing the scalability and generalizability of PHM solutions.\n","authors":["Yilin Wang","Yifei Yu","Kong Sun","Peixuan Lei","Yuxuan Zhang","Enrico Zio","Aiguo Xia","Yuanxiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16206v2","updated":"2024-09-26T07:32:09Z","published":"2024-05-25T12:35:31Z","title":"GlycanML: A Multi-Task and Multi-Structure Benchmark for Glycan Machine\n Learning","summary":" Glycans are basic biomolecules and perform essential functions within living\norganisms. The rapid increase of functional glycan data provides a good\nopportunity for machine learning solutions to glycan understanding. However,\nthere still lacks a standard machine learning benchmark for glycan function\nprediction. In this work, we fill this blank by building a comprehensive\nbenchmark for Glycan Machine Learning (GlycanML). The GlycanML benchmark\nconsists of diverse types of tasks including glycan taxonomy prediction, glycan\nimmunogenicity prediction, glycosylation type prediction, and protein-glycan\ninteraction prediction. Glycans can be represented by both sequences and graphs\nin GlycanML, which enables us to extensively evaluate sequence-based models and\ngraph neural networks (GNNs) on benchmark tasks. Furthermore, by concurrently\nperforming eight glycan taxonomy prediction tasks, we introduce the\nGlycanML-MTL testbed for multi-task learning (MTL) algorithms. Experimental\nresults show the superiority of modeling glycans with multi-relational GNNs,\nand suitable MTL methods can further boost model performance. We provide all\ndatasets and source codes at https://github.com/GlycanML/GlycanML and maintain\na leaderboard at https://GlycanML.github.io/project\n","authors":["Minghao Xu","Yunteng Geng","Yihang Zhang","Ling Yang","Jian Tang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.16206v2.pdf","comment":"Research project paper. All code and data are released"},{"id":"http://arxiv.org/abs/2409.17592v1","updated":"2024-09-26T07:19:12Z","published":"2024-09-26T07:19:12Z","title":"Deep Manifold Part 1: Anatomy of Neural Network Manifold","summary":" Based on the numerical manifold method principle, we developed a mathematical\nframework of a neural network manifold: Deep Manifold and discovered that\nneural networks: 1) is numerical computation combining forward and inverse; 2)\nhave near infinite degrees of freedom; 3) exponential learning capacity with\ndepth; 4) have self-progressing boundary conditions; 5) has training hidden\nbottleneck. We also define two concepts: neural network learning space and deep\nmanifold space and introduce two concepts: neural network intrinsic pathway and\nfixed point. We raise three fundamental questions: 1). What is the training\ncompletion definition; 2). where is the deep learning convergence point (neural\nnetwork fixed point); 3). How important is token timestamp in training data\ngiven negative time is critical in inverse problem.\n","authors":["Max Y. Ma","Gen-Hua Shi"],"pdf_url":"https://arxiv.org/pdf/2409.17592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17591v1","updated":"2024-09-26T07:16:38Z","published":"2024-09-26T07:16:38Z","title":"Conjugate Bayesian Two-step Change Point Detection for Hawkes Process","summary":" The Bayesian two-step change point detection method is popular for the Hawkes\nprocess due to its simplicity and intuitiveness. However, the non-conjugacy\nbetween the point process likelihood and the prior requires most existing\nBayesian two-step change point detection methods to rely on non-conjugate\ninference methods. These methods lack analytical expressions, leading to low\ncomputational efficiency and impeding timely change point detection. To address\nthis issue, this work employs data augmentation to propose a conjugate Bayesian\ntwo-step change point detection method for the Hawkes process, which proves to\nbe more accurate and efficient. Extensive experiments on both synthetic and\nreal data demonstrate the superior effectiveness and efficiency of our method\ncompared to baseline methods. Additionally, we conduct ablation studies to\nexplore the robustness of our method concerning various hyperparameters. Our\ncode is publicly available at https://github.com/Aurora2050/CoBay-CPD.\n","authors":["Zeyue Zhang","Xiaoling Lu","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17591v1.pdf","comment":"10 pages, accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17587v1","updated":"2024-09-26T07:07:08Z","published":"2024-09-26T07:07:08Z","title":"Multimodal Banking Dataset: Understanding Client Needs through Event\n Sequences","summary":" Financial organizations collect a huge amount of data about clients that\ntypically has a temporal (sequential) structure and is collected from various\nsources (modalities). Due to privacy issues, there are no large-scale\nopen-source multimodal datasets of event sequences, which significantly limits\nthe research in this area. In this paper, we present the industrial-scale\npublicly available multimodal banking dataset, MBD, that contains more than\n1.5M corporate clients with several modalities: 950M bank transactions, 1B geo\nposition events, 5M embeddings of dialogues with technical support and monthly\naggregated purchases of four bank's products. All entries are properly\nanonymized from real proprietary bank data. Using this dataset, we introduce a\nnovel benchmark with two business tasks: campaigning (purchase prediction in\nthe next month) and matching of clients. We provide numerical results that\ndemonstrate the superiority of our multi-modal baselines over single-modal\ntechniques for each task. As a result, the proposed dataset can open new\nperspectives and facilitate the future development of practically important\nlarge-scale multimodal algorithms for event sequences.\n HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD\n Github Link: https://github.com/Dzhambo/MBD\n","authors":["Mollaev Dzhambulat","Alexander Kostin","Postnova Maria","Ivan Karpukhin","Ivan A Kireev","Gleb Gusev","Andrey Savchenko"],"pdf_url":"https://arxiv.org/pdf/2409.17587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06379v3","updated":"2024-09-26T07:05:47Z","published":"2023-10-10T07:43:41Z","title":"Understanding the Expressivity and Trainability of Fourier Neural\n Operator: A Mean-Field Perspective","summary":" In this paper, we explores the expressivity and trainability of the Fourier\nNeural Operator (FNO). We establish a mean-field theory for the FNO, analyzing\nthe behavior of the random FNO from an edge of chaos perspective. Our\ninvestigation into the expressivity of a random FNO involves examining the\nordered-chaos phase transition of the network based on the weight distribution.\nThis phase transition demonstrates characteristics unique to the FNO, induced\nby mode truncation, while also showcasing similarities to those of densely\nconnected networks. Furthermore, we identify a connection between expressivity\nand trainability: the ordered and chaotic phases correspond to regions of\nvanishing and exploding gradients, respectively. This finding provides a\npractical prerequisite for the stable training of the FNO. Our experimental\nresults corroborate our theoretical findings.\n","authors":["Takeshi Koshizuka","Masahiro Fujisawa","Yusuke Tanaka","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2310.06379v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2409.17582v1","updated":"2024-09-26T07:01:06Z","published":"2024-09-26T07:01:06Z","title":"Multiplicative Logit Adjustment Approximates Neural-Collapse-Aware\n Decision Boundary Adjustment","summary":" Real-world data distributions are often highly skewed. This has spurred a\ngrowing body of research on long-tailed recognition to address this imbalance\nin training classification models. Among the methods studied, multiplicative\nlogit adjustment (MLA) stands out as a simple and effective method. However, it\nlacks theoretical guarantees, which raises concerns about the optimality of its\nadjustment method. We provide a theoretical justification for the effectiveness\nof MLA with the following two-step theory. First, we develop a theory that\nadjusts optimal decision boundaries by estimating feature spread on the basis\nof neural collapse. Then, we demonstrate that MLA approximates this optimal\nmethod. Additionally, through experiments on long-tailed datasets, we\nillustrate the practical usefulness of MLA under more realistic conditions. We\nalso offer experimental insights to guide the tuning of MLA's hyperparameters.\n","authors":["Naoya Hasegawa","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17567v1","updated":"2024-09-26T06:28:56Z","published":"2024-09-26T06:28:56Z","title":"Derandomizing Multi-Distribution Learning","summary":" Multi-distribution or collaborative learning involves learning a single\npredictor that works well across multiple data distributions, using samples\nfrom each during training. Recent research on multi-distribution learning,\nfocusing on binary loss and finite VC dimension classes, has shown near-optimal\nsample complexity that is achieved with oracle efficient algorithms. That is,\nthese algorithms are computationally efficient given an efficient ERM for the\nclass. Unlike in classical PAC learning, where the optimal sample complexity is\nachieved with deterministic predictors, current multi-distribution learning\nalgorithms output randomized predictors. This raises the question: can these\nalgorithms be derandomized to produce a deterministic predictor for multiple\ndistributions? Through a reduction to discrepancy minimization, we show that\nderandomizing multi-distribution learning is computationally hard, even when\nERM is computationally efficient. On the positive side, we identify a\nstructural condition enabling an efficient black-box reduction, converting\nexisting randomized multi-distribution predictors into deterministic ones.\n","authors":["Kasper Green Larsen","Omar Montasser","Nikita Zhivotovskiy"],"pdf_url":"https://arxiv.org/pdf/2409.17567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16997v2","updated":"2024-09-26T06:13:04Z","published":"2024-09-25T15:02:25Z","title":"INT-FlashAttention: Enabling Flash Attention for INT8 Quantization","summary":" As the foundation of large language models (LLMs), self-attention module\nfaces the challenge of quadratic time and memory complexity with respect to\nsequence length. FlashAttention accelerates attention computation and reduces\nits memory usage by leveraging the GPU memory hierarchy. A promising research\ndirection is to integrate FlashAttention with quantization methods. This paper\nintroduces INT-FlashAttention, the first INT8 quantization architecture\ncompatible with the forward workflow of FlashAttention, which significantly\nimproves the inference speed of FlashAttention on Ampere GPUs. We implement our\nINT-FlashAttention prototype with fully INT8 activations and general\nmatrix-multiplication (GEMM) kernels, making it the first attention operator\nwith fully INT8 input. As a general token-level post-training quantization\nframework, INT-FlashAttention is also compatible with other data formats like\nINT4, etc. Experimental results show INT-FlashAttention achieves 72% faster\ninference speed and 82% smaller quantization error compared to standard\nFlashAttention with FP16 and FP8 data format.\n","authors":["Shimao Chen","Zirui Liu","Zhiying Wu","Ce Zheng","Peizhuang Cong","Zihan Jiang","Yuhan Wu","Lei Su","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2409.16997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17557v1","updated":"2024-09-26T06:10:29Z","published":"2024-09-26T06:10:29Z","title":"Joint Source-Channel Coding: Fundamentals and Recent Progress in\n Practical Designs","summary":" Semantic- and task-oriented communication has emerged as a promising approach\nto reducing the latency and bandwidth requirements of next-generation mobile\nnetworks by transmitting only the most relevant information needed to complete\na specific task at the receiver. This is particularly advantageous for\nmachine-oriented communication of high data rate content, such as images and\nvideos, where the goal is rapid and accurate inference, rather than perfect\nsignal reconstruction. While semantic- and task-oriented compression can be\nimplemented in conventional communication systems, joint source-channel coding\n(JSCC) offers an alternative end-to-end approach by optimizing compression and\nchannel coding together, or even directly mapping the source signal to the\nmodulated waveform. Although all digital communication systems today rely on\nseparation, thanks to its modularity, JSCC is known to achieve higher\nperformance in finite blocklength scenarios, and to avoid cliff and the\nlevelling-off effects in time-varying channel scenarios. This article provides\nan overview of the information theoretic foundations of JSCC, surveys practical\nJSCC designs over the decades, and discusses the reasons for their limited\nadoption in practical systems. We then examine the recent resurgence of JSCC,\ndriven by the integration of deep learning techniques, particularly through\nDeepJSCC, highlighting its many surprising advantages in various scenarios.\nFinally, we discuss why it may be time to reconsider today's strictly separate\narchitectures, and reintroduce JSCC to enable high-fidelity, low-latency\ncommunications in critical applications such as autonomous driving, drone\nsurveillance, or wearable systems.\n","authors":["Deniz Gündüz","Michèle A. Wigger","Tze-Yang Tung","Ping Zhang","Yong Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.17557v1.pdf","comment":"Under review for possible publication"},{"id":"http://arxiv.org/abs/2307.08038v2","updated":"2024-09-26T06:02:24Z","published":"2023-07-16T13:34:44Z","title":"Bivariate DeepKriging for Large-scale Spatial Interpolation of Wind\n Fields","summary":" High spatial resolution wind data are essential for a wide range of\napplications in climate, oceanographic and meteorological studies. Large-scale\nspatial interpolation or downscaling of bivariate wind fields having velocity\nin two dimensions is a challenging task because wind data tend to be\nnon-Gaussian with high spatial variability and heterogeneity. In spatial\nstatistics, cokriging is commonly used for predicting bivariate spatial fields.\nHowever, the cokriging predictor is not optimal except for Gaussian processes.\nAdditionally, cokriging is computationally prohibitive for large datasets. In\nthis paper, we propose a method, called bivariate DeepKriging, which is a\nspatially dependent deep neural network (DNN) with an embedding layer\nconstructed by spatial radial basis functions for bivariate spatial data\nprediction. We then develop a distribution-free uncertainty quantification\nmethod based on bootstrap and ensemble DNN. Our proposed approach outperforms\nthe traditional cokriging predictor with commonly used covariance functions,\nsuch as the linear model of co-regionalization and flexible bivariate Mat\\'ern\ncovariance. We demonstrate the computational efficiency and scalability of the\nproposed DNN model, with computations that are, on average, 20 times faster\nthan those of conventional techniques. We apply the bivariate DeepKriging\nmethod to the wind data over the Middle East region at 506,771 locations. The\nprediction performance of the proposed method is superior over the cokriging\npredictors and dramatically reduces computation time.\n","authors":["Pratik Nag","Ying Sun","Brian J Reich"],"pdf_url":"https://arxiv.org/pdf/2307.08038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17555v1","updated":"2024-09-26T05:57:35Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code will be\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v1.pdf","comment":"Accepted to NeurIPS 2024. The source code will be available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2406.14990v2","updated":"2024-09-26T05:51:20Z","published":"2024-06-21T09:03:37Z","title":"Learning Variable Compliance Control From a Few Demonstrations for\n Bimanual Robot with Haptic Feedback Teleoperation System","summary":" Automating dexterous, contact-rich manipulation tasks using rigid robots is a\nsignificant challenge in robotics. Rigid robots, defined by their actuation\nthrough position commands, face issues of excessive contact forces due to their\ninability to adapt to contact with the environment, potentially causing damage.\nWhile compliance control schemes have been introduced to mitigate these issues\nby controlling forces via external sensors, they are hampered by the need for\nfine-tuning task-specific controller parameters. Learning from Demonstrations\n(LfD) offers an intuitive alternative, allowing robots to learn manipulations\nthrough observed actions. In this work, we introduce a novel system to enhance\nthe teaching of dexterous, contact-rich manipulations to rigid robots. Our\nsystem is twofold: firstly, it incorporates a teleoperation interface utilizing\nVirtual Reality (VR) controllers, designed to provide an intuitive and\ncost-effective method for task demonstration with haptic feedback. Secondly, we\npresent Comp-ACT (Compliance Control via Action Chunking with Transformers), a\nmethod that leverages the demonstrations to learn variable compliance control\nfrom a few demonstrations. Our methods have been validated across various\ncomplex contact-rich manipulation tasks using single-arm and bimanual robot\nsetups in simulated and real-world environments, demonstrating the\neffectiveness of our system in teaching robots dexterous manipulations with\nenhanced adaptability and safety. Code available at:\nhttps://github.com/omron-sinicx/CompACT\n","authors":["Tatsuya Kamijo","Cristian C. Beltran-Hernandez","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2406.14990v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2409.17550v1","updated":"2024-09-26T05:39:52Z","published":"2024-09-26T05:39:52Z","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective\n Adaptation of Audio and Video Diffusion Models for Joint Generation","summary":" In this work, we build a simple but strong baseline for sounding video\ngeneration. Given base diffusion models for audio and video, we integrate them\nwith additional modules into a single model and train it to make the model\njointly generate audio and video. To enhance alignment between audio-video\npairs, we introduce two novel mechanisms in our model. The first one is\ntimestep adjustment, which provides different timestep information to each base\nmodel. It is designed to align how samples are generated along with timesteps\nacross modalities. The second one is a new design of the additional modules,\ntermed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE,\ncross-modal information is embedded as if it represents temporal position\ninformation, and the embeddings are fed into the model like positional\nencoding. Compared with the popular cross-attention mechanism, CMC-PE provides\na better inductive bias for temporal alignment in the generated data.\nExperimental results validate the effectiveness of the two newly introduced\nmechanisms and also demonstrate that our method outperforms existing methods.\n","authors":["Masato Ishii","Akio Hayakawa","Takashi Shibuya","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.17550v1.pdf","comment":"The source code will be released soon"},{"id":"http://arxiv.org/abs/2409.17546v1","updated":"2024-09-26T05:25:25Z","published":"2024-09-26T05:25:25Z","title":"MASSFormer: Mobility-Aware Spectrum Sensing using Transformer-Driven\n Tiered Structure","summary":" In this paper, we develop a novel mobility-aware transformer-driven tiered\nstructure (MASSFormer) based cooperative spectrum sensing method that\neffectively models the spatio-temporal dynamics of user movements. Unlike\nexisting methods, our method considers a dynamic scenario involving mobile\nprimary users (PUs) and secondary users (SUs)and addresses the complexities\nintroduced by user mobility. The transformer architecture utilizes an attention\nmechanism, enabling the proposed method to adeptly model the temporal dynamics\nof user mobility by effectively capturing long-range dependencies within the\ninput data. The proposed method first computes tokens from the sequence of\ncovariance matrices (CMs) for each SU and processes them in parallel using the\nSUtransformer network to learn the spatio-temporal features at SUlevel.\nSubsequently, the collaborative transformer network learns the group-level PU\nstate from all SU-level feature representations. The attention-based sequence\npooling method followed by the transformer encoder adjusts the contributions of\nall tokens. The main goal of predicting the PU states at each SU-level and\ngroup-level is to improve detection performance even more. We conducted a\nsufficient amount of simulations and compared the detection performance of\ndifferent SS methods. The proposed method is tested under imperfect reporting\nchannel scenarios to show robustness. The efficacy of our method is validated\nwith the simulation results demonstrating its higher performance compared with\nexisting methods in terms of detection probability, sensing error, and\nclassification accuracy.\n","authors":["Dimpal Janu","Sandeep Mandia","Kuldeep Singh","Sandeep Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.17546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17544v1","updated":"2024-09-26T05:22:16Z","published":"2024-09-26T05:22:16Z","title":"Optimizing the Induced Correlation in Omnibus Joint Graph Embeddings","summary":" Theoretical and empirical evidence suggests that joint graph embedding\nalgorithms induce correlation across the networks in the embedding space. In\nthe Omnibus joint graph embedding framework, previous results explicitly\ndelineated the dual effects of the algorithm-induced and model-inherent\ncorrelations on the correlation across the embedded networks. Accounting for\nand mitigating the algorithm-induced correlation is key to subsequent\ninference, as sub-optimal Omnibus matrix constructions have been demonstrated\nto lead to loss in inference fidelity. This work presents the first efforts to\nautomate the Omnibus construction in order to address two key questions in this\njoint embedding framework: the correlation-to-OMNI problem and the flat\ncorrelation problem. In the flat correlation problem, we seek to understand the\nminimum algorithm-induced flat correlation (i.e., the same across all graph\npairs) produced by a generalized Omnibus embedding. Working in a subspace of\nthe fully general Omnibus matrices, we prove both a lower bound for this flat\ncorrelation and that the classical Omnibus construction induces the maximal\nflat correlation. In the correlation-to-OMNI problem, we present an algorithm\n-- named corr2Omni -- that, from a given matrix of estimated pairwise graph\ncorrelations, estimates the matrix of generalized Omnibus weights that induces\noptimal correlation in the embedding space. Moreover, in both simulated and\nreal data settings, we demonstrate the increased effectiveness of our corr2Omni\nalgorithm versus the classical Omnibus construction.\n","authors":["Konstantinos Pantazis","Michael Trosset","William N. Frost","Carey E. Priebe","Vince Lyzinski"],"pdf_url":"https://arxiv.org/pdf/2409.17544v1.pdf","comment":"34 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13689v3","updated":"2024-09-26T04:54:43Z","published":"2024-08-24T23:20:38Z","title":"Decentralised Variational Inference Frameworks for Multi-object Tracking\n on Sensor Network","summary":" This paper tackles the challenge of multi-sensor multi-object tracking by\nproposing various decentralised Variational Inference (VI) schemes that match\nthe tracking performance of centralised sensor fusion with only local message\nexchanges among neighboring sensors. We first establish a centralised VI sensor\nfusion scheme as a benchmark and analyse the limitations of its decentralised\ncounterpart, which requires sensors to await consensus at each VI iteration.\nTherefore, we propose a decentralised gradient-based VI framework that\noptimises the Locally Maximised Evidence Lower Bound (LM-ELBO) instead of the\nstandard ELBO, which reduces the parameter search space and enables faster\nconvergence, making it particularly beneficial for decentralised tracking.This\nproposed framework is inherently self-evolving, improving with advancements in\ndecentralised optimisation techniques for convergence guarantees and\nefficiency. Further, we enhance the convergence speed of proposed decentralised\nschemes using natural gradients and gradient tracking strategies. Results\nverify that our decentralised VI schemes are empirically equivalent to\ncentralised fusion in tracking performance. Notably, the decentralised natural\ngradient VI method is the most communication-efficient, with communication\ncosts comparable to suboptimal decentralised strategies while delivering\nnotably higher tracking accuracy.\n","authors":["Qing Li","Runze Gan","Simon Godsill"],"pdf_url":"https://arxiv.org/pdf/2408.13689v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15734v2","updated":"2024-09-26T04:37:48Z","published":"2024-09-24T04:39:47Z","title":"Trust-Region Sequential Quadratic Programming for Stochastic\n Optimization with Random Models","summary":" In this work, we consider solving optimization problems with a stochastic\nobjective and deterministic equality constraints. We propose a Trust-Region\nSequential Quadratic Programming method to find both first- and second-order\nstationary points. Our method utilizes a random model to represent the\nobjective function, which is constructed from stochastic observations of the\nobjective and is designed to satisfy proper adaptive accuracy conditions with a\nhigh but fixed probability. To converge to first-order stationary points, our\nmethod computes a gradient step in each iteration defined by minimizing a\nquadratic approximation of the objective subject to a (relaxed) linear\napproximation of the problem constraints and a trust-region constraint. To\nconverge to second-order stationary points, our method additionally computes an\neigen step to explore the negative curvature of the reduced Hessian matrix, as\nwell as a second-order correction step to address the potential Maratos effect,\nwhich arises due to the nonlinearity of the problem constraints. Such an effect\nmay impede the method from moving away from saddle points. Both gradient and\neigen step computations leverage a novel parameter-free decomposition of the\nstep and the trust-region radius, accounting for the proportions among the\nfeasibility residual, optimality residual, and negative curvature. We establish\nglobal almost sure first- and second-order convergence guarantees for our\nmethod, and present computational results on CUTEst problems, regression\nproblems, and saddle-point problems to demonstrate its superiority over\nexisting line-search-based stochastic methods.\n","authors":["Yuchen Fang","Sen Na","Michael W. Mahoney","Mladen Kolar"],"pdf_url":"https://arxiv.org/pdf/2409.15734v2.pdf","comment":"41 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.05316v3","updated":"2024-09-26T03:54:15Z","published":"2024-06-08T01:32:44Z","title":"CMamba: Channel Correlation Enhanced State Space Models for Multivariate\n Time Series Forecasting","summary":" Recent advancements in multivariate time series forecasting have been\npropelled by Linear-based, Transformer-based, and Convolution-based models,\nwith Transformer-based architectures gaining prominence for their efficacy in\ntemporal and cross-channel mixing. More recently, Mamba, a state space model,\nhas emerged with robust sequence and feature mixing capabilities. However, the\nsuitability of the vanilla Mamba design for time series forecasting remains an\nopen question, particularly due to its inadequate handling of cross-channel\ndependencies. Capturing cross-channel dependencies is critical in enhancing the\nperformance of multivariate time series prediction. Recent findings show that\nself-attention excels in capturing cross-channel dependencies, whereas other\nsimpler mechanisms, such as MLP, may degrade model performance. This is\ncounterintuitive, as MLP, being a learnable architecture, should theoretically\ncapture both correlations and irrelevances, potentially leading to neutral or\nimproved performance. Diving into the self-attention mechanism, we attribute\nthe observed degradation in MLP performance to its lack of data dependence and\nglobal receptive field, which result in MLP's lack of generalization ability.\nBased on the above insights, we introduce a refined Mamba variant tailored for\ntime series forecasting. Our proposed model, \\textbf{CMamba}, incorporates a\nmodified Mamba (M-Mamba) module for temporal dependencies modeling, a global\ndata-dependent MLP (GDD-MLP) to effectively capture cross-channel dependencies,\nand a Channel Mixup mechanism to mitigate overfitting. Comprehensive\nexperiments conducted on seven real-world datasets demonstrate the efficacy of\nour model in improving forecasting performance.\n","authors":["Chaolv Zeng","Zhanyu Liu","Guanjie Zheng","Linghe Kong"],"pdf_url":"https://arxiv.org/pdf/2406.05316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17517v1","updated":"2024-09-26T03:52:41Z","published":"2024-09-26T03:52:41Z","title":"Dataset Distillation-based Hybrid Federated Learning on Non-IID Data","summary":" In federated learning, the heterogeneity of client data has a great impact on\nthe performance of model training. Many heterogeneity issues in this process\nare raised by non-independently and identically distributed (Non-IID) data.\nThis study focuses on the issue of label distribution skew. To address it, we\npropose a hybrid federated learning framework called HFLDD, which integrates\ndataset distillation to generate approximately independent and equally\ndistributed (IID) data, thereby improving the performance of model training.\nParticularly, we partition the clients into heterogeneous clusters, where the\ndata labels among different clients within a cluster are unbalanced while the\ndata labels among different clusters are balanced. The cluster headers collect\ndistilled data from the corresponding cluster members, and conduct model\ntraining in collaboration with the server. This training process is like\ntraditional federated learning on IID data, and hence effectively alleviates\nthe impact of Non-IID data on model training. Furthermore, we compare our\nproposed method with typical baseline methods on public datasets. Experimental\nresults demonstrate that when the data labels are severely imbalanced, the\nproposed HFLDD outperforms the baseline methods in terms of both test accuracy\nand communication cost.\n","authors":["Xiufang Shi","Wei Zhang","Mincheng Wu","Guangyi Liu","Zhenyu Wen","Shibo He","Tejal Shah","Rajiv Ranjan"],"pdf_url":"https://arxiv.org/pdf/2409.17517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11337v3","updated":"2024-09-26T03:51:19Z","published":"2023-02-18T07:40:03Z","title":"Bayesian Matrix Decomposition and Applications","summary":" The sole aim of this book is to give a self-contained introduction to\nconcepts and mathematical tools in Bayesian matrix decomposition in order to\nseamlessly introduce matrix decomposition techniques and their applications in\nsubsequent sections. However, we clearly realize our inability to cover all the\nuseful and interesting results concerning Bayesian matrix decomposition and\ngiven the paucity of scope to present this discussion, e.g., the separated\nanalysis of variational inference for conducting the optimization. We refer the\nreader to literature in the field of Bayesian analysis for a more detailed\nintroduction to the related fields.\n This book is primarily a summary of purpose, significance of important\nBayesian matrix decomposition methods, e.g., real-valued decomposition,\nnonnegative matrix factorization, Bayesian interpolative decomposition, and the\norigin and complexity of the methods which shed light on their applications.\nThe mathematical prerequisite is a first course in statistics and linear\nalgebra. Other than this modest background, the development is self-contained,\nwith rigorous proof provided throughout.\n","authors":["Jun Lu"],"pdf_url":"https://arxiv.org/pdf/2302.11337v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17516v1","updated":"2024-09-26T03:50:55Z","published":"2024-09-26T03:50:55Z","title":"Functional Classification of Spiking Signal Data Using Artificial\n Intelligence Techniques: A Review","summary":" Human brain neuron activities are incredibly significant nowadays. Neuronal\nbehavior is assessed by analyzing signal data such as electroencephalography\n(EEG), which can offer scientists valuable information about diseases and\nhuman-computer interaction. One of the difficulties researchers confront while\nevaluating these signals is the existence of large volumes of spike data.\nSpikes are some considerable parts of signal data that can happen as a\nconsequence of vital biomarkers or physical issues such as electrode movements.\nHence, distinguishing types of spikes is important. From this spot, the spike\nclassification concept commences. Previously, researchers classified spikes\nmanually. The manual classification was not precise enough as it involves\nextensive analysis. Consequently, Artificial Intelligence (AI) was introduced\ninto neuroscience to assist clinicians in classifying spikes correctly. This\nreview discusses the importance and use of AI in spike classification, focusing\non the recognition of neural activity noises. The task is divided into three\nmain components: preprocessing, classification, and evaluation. Existing\nmethods are introduced and their importance is determined. The review also\nhighlights the need for more efficient algorithms. The primary goal is to\nprovide a perspective on spike classification for future research and provide a\ncomprehensive understanding of the methodologies and issues involved. The\nreview organizes materials in the spike classification field for future\nstudies. In this work, numerous studies were extracted from different\ndatabases. The PRISMA-related research guidelines were then used to choose\npapers. Then, research studies based on spike classification using machine\nlearning and deep learning approaches with effective preprocessing were\nselected.\n","authors":["Danial Sharifrazi","Nouman Javed","Javad Hassannataj Joloudari","Roohallah Alizadehsani","Prasad N. Paradkar","Ru-San Tan","U. Rajendra Acharya","Asim Bhatti"],"pdf_url":"https://arxiv.org/pdf/2409.17516v1.pdf","comment":"8 figures, 32 pages"},{"id":"http://arxiv.org/abs/2409.17513v1","updated":"2024-09-26T03:48:47Z","published":"2024-09-26T03:48:47Z","title":"Comparing Unidirectional, Bidirectional, and Word2vec Models for\n Discovering Vulnerabilities in Compiled Lifted Code","summary":" Ransomware and other forms of malware cause significant financial and\noperational damage to organizations by exploiting long-standing and often\ndifficult-to-detect software vulnerabilities. To detect vulnerabilities such as\nbuffer overflows in compiled code, this research investigates the application\nof unidirectional transformer-based embeddings, specifically GPT-2. Using a\ndataset of LLVM functions, we trained a GPT-2 model to generate embeddings,\nwhich were subsequently used to build LSTM neural networks to differentiate\nbetween vulnerable and non-vulnerable code. Our study reveals that embeddings\nfrom the GPT-2 model significantly outperform those from bidirectional models\nof BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%.\nLSTM neural networks were developed with both frozen and unfrozen embedding\nmodel layers. The model with the highest performance was achieved when the\nembedding layers were unfrozen. Further, the research finds that, in exploring\nthe impact of different optimizers within this domain, the SGD optimizer\ndemonstrates superior performance over Adam. Overall, these findings reveal\nimportant insights into the potential of unidirectional transformer-based\napproaches in enhancing cybersecurity defenses.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2409.17513v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17510v1","updated":"2024-09-26T03:40:12Z","published":"2024-09-26T03:40:12Z","title":"NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human\n Connectomes","summary":" Although modern imaging technologies allow us to study connectivity between\ntwo distinct brain regions in-vivo, an in-depth understanding of how anatomical\nstructure supports brain function and how spontaneous functional fluctuations\nemerge remarkable cognition is still elusive. Meanwhile, tremendous efforts\nhave been made in the realm of machine learning to establish the nonlinear\nmapping between neuroimaging data and phenotypic traits. However, the absence\nof neuroscience insight in the current approaches poses significant challenges\nin understanding cognitive behavior from transient neural activities. To\naddress this challenge, we put the spotlight on the coupling mechanism of\nstructural connectivity (SC) and functional connectivity (FC) by formulating\nsuch network neuroscience question into an expressive graph representation\nlearning problem for high-order topology. Specifically, we introduce the\nconcept of topological detour to characterize how a ubiquitous instance of FC\n(direct link) is supported by neural pathways (detour) physically wired by SC,\nwhich forms a cyclic loop interacted by brain structure and function. In the\nclich\\'e of machine learning, the multi-hop detour pathway underlying SC-FC\ncoupling allows us to devise a novel multi-head self-attention mechanism within\nTransformer to capture multi-modal feature representation from paired graphs of\nSC and FC. Taken together, we propose a biological-inspired deep model, coined\nas NeuroPath, to find putative connectomic feature representations from the\nunprecedented amount of neuroimages, which can be plugged into various\ndownstream applications such as task recognition and disease diagnosis. We have\nevaluated NeuroPath on large-scale public datasets including HCP and UK Biobank\nunder supervised and zero-shot learning, where the state-of-the-art performance\nby our NeuroPath indicates great potential in network neuroscience.\n","authors":["Ziquan Wei","Tingting Dan","Jiaqi Ding","Paul J Laurienti","Guorong Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17510v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17508v1","updated":"2024-09-26T03:33:26Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization, recent advances primarily focus on improving the LLM\ncomponents, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector. Extensive ablation experiments\nvalidate the effectiveness of introducing CMoE under any configuration, with up\nto an average 8% performance gains. We further provide interpretation analysis\nof the tug-of-war problem from the perspective of gradient optimization and\nparameter statistics. Compared to previous state-of-the-art medical MLLMs,\nUni-Med achieves competitive or superior evaluation metrics on diverse tasks.\nCode, data and model will be soon available at GitHub.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17505v1","updated":"2024-09-26T03:24:59Z","published":"2024-09-26T03:24:59Z","title":"Sequential Kernelized Stein Discrepancy","summary":" We present a sequential version of the kernelized Stein discrepancy, which\nallows for conducting goodness-of-fit tests for unnormalized densities that are\ncontinuously monitored and adaptively stopped. That is, the sample size need\nnot be fixed prior to data collection; the practitioner can choose whether to\nstop the test or continue to gather evidence at any time while controlling the\nfalse discovery rate. In stark contrast to related literature, we do not impose\nuniform boundedness on the Stein kernel. Instead, we exploit the potential\nboundedness of the Stein kernel at arbitrary point evaluations to define test\nmartingales, that give way to the subsequent novel sequential tests. We prove\nthe validity of the test, as well as an asymptotic lower bound for the\nlogarithmic growth of the wealth process under the alternative. We further\nillustrate the empirical performance of the test with a variety of\ndistributions, including restricted Boltzmann machines.\n","authors":["Diego Martinez-Taboada","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2409.17505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17504v1","updated":"2024-09-26T03:22:09Z","published":"2024-09-26T03:22:09Z","title":"HaloScope: Harnessing Unlabeled LLM Generations for Hallucination\n Detection","summary":" The surge in applications of large language models (LLMs) has prompted\nconcerns about the generation of misleading or fabricated information, known as\nhallucinations. Therefore, detecting hallucinations has become critical to\nmaintaining trust in LLM-generated content. A primary challenge in learning a\ntruthfulness classifier is the lack of a large amount of labeled truthful and\nhallucinated data. To address the challenge, we introduce HaloScope, a novel\nlearning framework that leverages the unlabeled LLM generations in the wild for\nhallucination detection. Such unlabeled data arises freely upon deploying LLMs\nin the open world, and consists of both truthful and hallucinated information.\nTo harness the unlabeled data, we present an automated membership estimation\nscore for distinguishing between truthful and untruthful generations within\nunlabeled mixture data, thereby enabling the training of a binary truthfulness\nclassifier on top. Importantly, our framework does not require extra data\ncollection and human annotations, offering strong flexibility and practicality\nfor real-world applications. Extensive experiments show that HaloScope can\nachieve superior hallucination detection performance, outperforming the\ncompetitive rivals by a significant margin. Code is available at\nhttps://github.com/deeplearningwisc/haloscope.\n","authors":["Xuefeng Du","Chaowei Xiao","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.17504v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17502v1","updated":"2024-09-26T03:20:09Z","published":"2024-09-26T03:20:09Z","title":"Broadcast Product: Shape-aligned Element-wise Multiplication and Beyond","summary":" We propose a new operator defined between two tensors, the broadcast product.\nThe broadcast product calculates the Hadamard product after duplicating\nelements to align the shapes of the two tensors. Complex tensor operations in\nlibraries like \\texttt{numpy} can be succinctly represented as mathematical\nexpressions using the broadcast product. Finally, we propose a novel tensor\ndecomposition using the broadcast product, highlighting its potential\napplications in dimensionality reduction.\n","authors":["Yusuke Matsui","Tatsuya Yokota"],"pdf_url":"https://arxiv.org/pdf/2409.17502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17499v1","updated":"2024-09-26T03:12:20Z","published":"2024-09-26T03:12:20Z","title":"Does Worst-Performing Agent Lead the Pack? Analyzing Agent Dynamics in\n Unified Distributed SGD","summary":" Distributed learning is essential to train machine learning algorithms across\nheterogeneous agents while maintaining data privacy. We conduct an asymptotic\nanalysis of Unified Distributed SGD (UD-SGD), exploring a variety of\ncommunication patterns, including decentralized SGD and local SGD within\nFederated Learning (FL), as well as the increasing communication interval in\nthe FL setting. In this study, we assess how different sampling strategies,\nsuch as i.i.d. sampling, shuffling, and Markovian sampling, affect the\nconvergence speed of UD-SGD by considering the impact of agent dynamics on the\nlimiting covariance matrix as described in the Central Limit Theorem (CLT). Our\nfindings not only support existing theories on linear speedup and asymptotic\nnetwork independence, but also theoretically and empirically show how efficient\nsampling strategies employed by individual agents contribute to overall\nconvergence in UD-SGD. Simulations reveal that a few agents using highly\nefficient sampling can achieve or surpass the performance of the majority\nemploying moderately improved strategies, providing new insights beyond\ntraditional analyses focusing on the worst-performing agent.\n","authors":["Jie Hu","Yi-Ting Ma","Do Young Eun"],"pdf_url":"https://arxiv.org/pdf/2409.17499v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.14578v4","updated":"2024-09-26T02:59:44Z","published":"2024-05-23T13:52:36Z","title":"Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling","summary":" In current deep learning tasks, Adam style optimizers such as Adam, Adagrad,\nRMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style\noptimizers. These optimizers typically update model parameters using the sign\nof gradients, resulting in more stable convergence curves. The learning rate\nand the batch size are the most critical hyperparameters for optimizers, which\nrequire careful tuning to enable effective convergence. Previous research has\nshown that the optimal learning rate increases linearly or follows similar\nrules with batch size for SGD style optimizers. However, this conclusion is not\napplicable to Adam style optimizers. In this paper, we elucidate the connection\nbetween optimal learning rates and batch sizes for Adam style optimizers\nthrough both theoretical analysis and extensive experiments. First, we raise\nthe scaling law between batch sizes and optimal learning rates in the sign of\ngradient case, in which we prove that the optimal learning rate first rises and\nthen falls as the batch size increases. Moreover, the peak value of the surge\nwill gradually move toward the larger batch size as training progresses.\nSecond, we conducted experiments on various CV and NLP tasks and verified the\ncorrectness of the scaling law.\n","authors":["Shuaipeng Li","Penghao Zhao","Hailin Zhang","Xingwu Sun","Hao Wu","Dian Jiao","Weiyan Wang","Chengjun Liu","Zheng Fang","Jinbao Xue","Yangyu Tao","Bin Cui","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14578v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2409.17490v1","updated":"2024-09-26T02:54:19Z","published":"2024-09-26T02:54:19Z","title":"MathDSL: A Domain-Specific Language for Concise Mathematical Solutions\n Via Program Synthesis","summary":" We present MathDSL, a Domain-Specific Language (DSL) for mathematical\nequation solving, which, when deployed in program synthesis models, outperforms\nstate-of-the-art reinforcement-learning-based methods. We also introduce a\nquantitative metric for measuring the conciseness of a mathematical solution\nand demonstrate the improvement in the quality of generated solutions compared\nto other methods. Our system demonstrates that a program synthesis system\n(DreamCoder) using MathDSL can generate programs that solve linear equations\nwith greater accuracy and conciseness than using reinforcement learning\nsystems. Additionally, we demonstrate that if we use the action spaces of\nprevious reinforcement learning systems as DSLs, MathDSL outperforms the\naction-space-DSLs. We use DreamCoder to store equation-solving strategies as\nlearned abstractions in its program library and demonstrate that by using\nMathDSL, these can be converted into human-interpretable solution strategies\nthat could have applications in mathematical education.\n","authors":["Sagnik Anupam","Maddy Bowers","Omar Costilla-Reyes","Armando Solar-Lezama"],"pdf_url":"https://arxiv.org/pdf/2409.17490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04585v2","updated":"2024-09-26T02:53:15Z","published":"2024-01-09T14:42:49Z","title":"EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization\n of Diffusion Models","summary":" Diffusion models have achieved great success in image generation tasks\nthrough iterative noise estimation. However, the heavy denoising process and\ncomplex neural networks hinder their low-latency applications in real-world\nscenarios. Quantization can effectively reduce model complexity, and\npost-training quantization (PTQ), which does not require fine-tuning, is highly\npromising for compressing and accelerating diffusion models. Unfortunately, we\nfind that due to the highly dynamic distribution of activations in different\ndenoising steps, existing PTQ methods for diffusion models suffer from\ndistribution mismatch issues at both calibration sample level and\nreconstruction output level, which makes the performance far from satisfactory,\nespecially in low-bit cases. In this paper, we propose Enhanced Distribution\nAlignment for Post-Training Quantization of Diffusion Models (EDA-DM) to\naddress the above issues. Specifically, at the calibration sample level, we\nselect calibration samples based on the density and variety in the latent\nspace, thus facilitating the alignment of their distribution with the overall\nsamples; and at the reconstruction output level, we modify the loss of block\nreconstruction with the losses of layers, aligning the outputs of quantized\nmodel and full-precision model at different network granularity. Extensive\nexperiments demonstrate that EDA-DM significantly outperforms the existing PTQ\nmethods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and\ndifferent datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO).\n","authors":["Xuewen Liu","Zhikai Li","Junrui Xiao","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2401.04585v2.pdf","comment":"Code: http://github.com/BienLuky/EDA-DM"}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17678v1","updated":"2024-09-26T09:37:04Z","published":"2024-09-26T09:37:04Z","title":"Modeling the Popularity of Events on Web by Sparsity and\n Mutual-Excitation Guided Graph Neural Network","summary":" The content of a webpage described or posted an event in the cyberspace\ninevitably reflects viewpoints, values and trends of the physical society.\nMapping an event on web to the popularity score plays a pivot role to sense the\nsocial trends from the cyberspace. However, the complex semantic correspondence\nbetween texts and images, as well as the implicit text-image-popularity mapping\nmechanics pose a significant challenge to this non-trivial task. In this paper,\nwe address this problem from a viewpoint of understanding the interpretable\nmapping mechanics. Concretely, we organize the keywords from different events\ninto an unified graph. The unified graph facilitates to model the popularity of\nevents via two-level mappings, i.e., the self excitation and the mutual\nexcitation. The self-excitation assumes that each keyword forms the popularity\nwhile the mutual-excitation models that two keywords would excite each other to\ndetermine the popularity of an event. Specifically, we use Graph Neural Network\n(GNN) as the backbone to model the self-excitation, the mutual excitation and\nthe context of images into a sparse and deep factor model. Besides, to our best\nknowledge, we release a challenge web event dataset for the popularity\nprediction task. The experimental results on three public datasets demonstrate\nthat our method achieves significant improvements and outperforms the\nstate-of-the-art methods. Dataset is publicly available at:\nhttps://github.com/pangjunbiao/Hot-events-dataset.\n","authors":["Jiaxin Deng","Linlin Jia","Junbiao Pang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17596v1","updated":"2024-09-26T07:22:38Z","published":"2024-09-26T07:22:38Z","title":"Subjective and Objective Quality-of-Experience Evaluation Study for Live\n Video Streaming","summary":" In recent years, live video streaming has gained widespread popularity across\nvarious social media platforms. Quality of experience (QoE), which reflects\nend-users' satisfaction and overall experience, plays a critical role for media\nservice providers to optimize large-scale live compression and transmission\nstrategies to achieve perceptually optimal rate-distortion trade-off. Although\nmany QoE metrics for video-on-demand (VoD) have been proposed, there remain\nsignificant challenges in developing QoE metrics for live video streaming. To\nbridge this gap, we conduct a comprehensive study of subjective and objective\nQoE evaluations for live video streaming. For the subjective QoE study, we\nintroduce the first live video streaming QoE dataset, TaoLive QoE, which\nconsists of $42$ source videos collected from real live broadcasts and $1,155$\ncorresponding distorted ones degraded due to a variety of streaming\ndistortions, including conventional streaming distortions such as compression,\nstalling, as well as live streaming-specific distortions like frame skipping,\nvariable frame rate, etc. Subsequently, a human study was conducted to derive\nsubjective QoE scores of videos in the TaoLive QoE dataset. For the objective\nQoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well\nas publicly available QoE datasets for VoD scenarios, highlighting that current\nmodels struggle to accurately assess video QoE, particularly for live content.\nHence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates\nmulti-scale semantic features and optical flow-based motion features to\npredicting a retrospective QoE score, eliminating reliance on statistical\nquality of service (QoS) features.\n","authors":["Zehao Zhu","Wei Sun","Jun Jia","Wei Wu","Sibin Deng","Kai Li","Ying Chen","Xiongkuo Min","Jia Wang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.17596v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17550v1","updated":"2024-09-26T05:39:52Z","published":"2024-09-26T05:39:52Z","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective\n Adaptation of Audio and Video Diffusion Models for Joint Generation","summary":" In this work, we build a simple but strong baseline for sounding video\ngeneration. Given base diffusion models for audio and video, we integrate them\nwith additional modules into a single model and train it to make the model\njointly generate audio and video. To enhance alignment between audio-video\npairs, we introduce two novel mechanisms in our model. The first one is\ntimestep adjustment, which provides different timestep information to each base\nmodel. It is designed to align how samples are generated along with timesteps\nacross modalities. The second one is a new design of the additional modules,\ntermed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE,\ncross-modal information is embedded as if it represents temporal position\ninformation, and the embeddings are fed into the model like positional\nencoding. Compared with the popular cross-attention mechanism, CMC-PE provides\na better inductive bias for temporal alignment in the generated data.\nExperimental results validate the effectiveness of the two newly introduced\nmechanisms and also demonstrate that our method outperforms existing methods.\n","authors":["Masato Ishii","Akio Hayakawa","Takashi Shibuya","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.17550v1.pdf","comment":"The source code will be released soon"},{"id":"http://arxiv.org/abs/2408.00970v2","updated":"2024-09-26T03:56:00Z","published":"2024-08-02T01:30:18Z","title":"Multimodal Fusion via Hypergraph Autoencoder and Contrastive Learning\n for Emotion Recognition in Conversation","summary":" Multimodal emotion recognition in conversation (MERC) seeks to identify the\nspeakers' emotions expressed in each utterance, offering significant potential\nacross diverse fields. The challenge of MERC lies in balancing speaker modeling\nand context modeling, encompassing both long-distance and short-distance\ncontexts, as well as addressing the complexity of multimodal information\nfusion. Recent research adopts graph-based methods to model intricate\nconversational relationships effectively. Nevertheless, the majority of these\nmethods utilize a fixed fully connected structure to link all utterances,\nrelying on convolution to interpret complex context. This approach can\ninherently heighten the redundancy in contextual messages and excessive graph\nnetwork smoothing, particularly in the context of long-distance conversations.\nTo address this issue, we propose a framework that dynamically adjusts\nhypergraph connections by variational hypergraph autoencoder (VHGAE), and\nemploys contrastive learning to mitigate uncertainty factors during the\nreconstruction process. Experimental results demonstrate the effectiveness of\nour proposal against the state-of-the-art methods on IEMOCAP and MELD datasets.\nWe release the code to support the reproducibility of this work at\nhttps://github.com/yzjred/-HAUCL.\n","authors":["Zijian Yi","Ziming Zhao","Zhishu Shen","Tiehua Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.00970v2.pdf","comment":"Accepted by ACM MULTIMEDIA 2024"},{"id":"http://arxiv.org/abs/2404.09245v2","updated":"2024-09-26T01:25:22Z","published":"2024-04-14T13:14:13Z","title":"Arena: A Patch-of-Interest ViT Inference Acceleration System for\n Edge-Assisted Video Analytics","summary":" The advent of edge computing has made real-time intelligent video analytics\nfeasible. Previous works, based on traditional model architecture (e.g., CNN,\nRNN, etc.), employ various strategies to filter out non-region-of-interest\ncontent to minimize bandwidth and computation consumption but show inferior\nperformance in adverse environments. Recently, visual foundation models based\non transformers have shown great performance in adverse environments due to\ntheir amazing generalization capability. However, they require a large amount\nof computation power, which limits their applications in real-time intelligent\nvideo analytics. In this paper, we find visual foundation models like Vision\nTransformer (ViT) also have a dedicated acceleration mechanism for video\nanalytics. To this end, we introduce Arena, an end-to-end edge-assisted video\ninference acceleration system based on ViT. We leverage the capability of ViT\nthat can be accelerated through token pruning by only offloading and feeding\nPatches-of-Interest to the downstream models. Additionally, we design an\nadaptive keyframe inference switching algorithm tailored to different videos,\ncapable of adapting to the current video content to jointly optimize accuracy\nand bandwidth. Through extensive experiments, our findings reveal that Arena\ncan boost inference speeds by up to 1.58\\(\\times\\) and 1.82\\(\\times\\) on\naverage while consuming only 47\\% and 31\\% of the bandwidth, respectively, all\nwith high inference accuracy.\n","authors":["Haosong Peng","Wei Feng","Hao Li","Yufeng Zhan","Ren Jin","Yuanqing Xia"],"pdf_url":"https://arxiv.org/pdf/2404.09245v2.pdf","comment":null}],"Robotics":[{"id":"http://arxiv.org/abs/2409.18122v1","updated":"2024-09-26T17:58:05Z","published":"2024-09-26T17:58:05Z","title":"RT-GuIDE: Real-Time Gaussian splatting for Information-Driven\n Exploration","summary":" We propose a framework for active mapping and exploration that leverages\nGaussian splatting for constructing information-rich maps. Further, we develop\na parallelized motion planning algorithm that can exploit the Gaussian map for\nreal-time navigation. The Gaussian map constructed onboard the robot is\noptimized for both photometric and geometric quality while enabling real-time\nsituational awareness for autonomy. We show through simulation experiments that\nour method is competitive with approaches that use alternate information gain\nmetrics, while being orders of magnitude faster to compute. In real-world\nexperiments, our algorithm achieves better map quality (10% higher Peak\nSignal-to-Noise Ratio (PSNR) and 30% higher geometric reconstruction accuracy)\nthan Gaussian maps constructed by traditional exploration baselines. Experiment\nvideos and more details can be found on our project page:\nhttps://tyuezhan.github.io/RT_GuIDE/\n","authors":["Yuezhan Tao","Dexter Ong","Varun Murali","Igor Spasojevic","Pratik Chaudhari","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.18122v1.pdf","comment":"Submitted to ICRA2025"},{"id":"http://arxiv.org/abs/2409.18121v1","updated":"2024-09-26T17:57:16Z","published":"2024-09-26T17:57:16Z","title":"Robot See Robot Do: Imitating Articulated Object Manipulation with\n Monocular 4D Reconstruction","summary":" Humans can learn to manipulate new objects by simply watching others;\nproviding robots with the ability to learn from such demonstrations would\nenable a natural interface specifying new behaviors. This work develops Robot\nSee Robot Do (RSRD), a method for imitating articulated object manipulation\nfrom a single monocular RGB human demonstration given a single static\nmulti-view object scan. We first propose 4D Differentiable Part Models\n(4D-DPM), a method for recovering 3D part motion from a monocular video with\ndifferentiable rendering. This analysis-by-synthesis approach uses part-centric\nfeature fields in an iterative optimization which enables the use of geometric\nregularizers to recover 3D motions from only a single video. Given this 4D\nreconstruction, the robot replicates object trajectories by planning bimanual\narm motions that induce the demonstrated object part motion. By representing\ndemonstrations as part-centric trajectories, RSRD focuses on replicating the\ndemonstration's intended behavior while considering the robot's own\nmorphological limits, rather than attempting to reproduce the hand's motion. We\nevaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part\ntrajectories and RSRD's physical execution performance on 9 objects across 10\ntrials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of\n87% success rate, for a total end-to-end success rate of 60% across 90 trials.\nNotably, this is accomplished using only feature fields distilled from large\npretrained vision models -- without any task-specific training, fine-tuning,\ndataset collection, or annotation. Project page:\nhttps://robot-see-robot-do.github.io\n","authors":["Justin Kerr","Chung Min Kim","Mingxuan Wu","Brent Yi","Qianqian Wang","Ken Goldberg","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2409.18121v1.pdf","comment":"CoRL 2024, Project page: https://robot-see-robot-do.github.io"},{"id":"http://arxiv.org/abs/2409.18120v1","updated":"2024-09-26T17:57:15Z","published":"2024-09-26T17:57:15Z","title":"EvMAPPER: High Altitude Orthomapping with Event Cameras","summary":" Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to\ncollect images about the world below. One of the most successful applications\nof UAVs is to generate orthomosaics or orthomaps, in which a series of images\nare integrated together to develop a larger map. However, the use of CMOS-based\ncameras with global or rolling shutters mean that orthomaps are vulnerable to\nchallenging light conditions, motion blur, and high-speed motion of\nindependently moving objects under the camera. Event cameras are less sensitive\nto these issues, as their pixels are able to trigger asynchronously on\nbrightness changes. This work introduces the first orthomosaic approach using\nevent cameras. In contrast to existing methods relying only on CMOS cameras,\nour approach enables map generation even in challenging light conditions,\nincluding direct sunlight and after sunset.\n","authors":["Fernando Cladera","Kenneth Chaney","M. Ani Hsieh","Camillo J. Taylor","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.18120v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18108v1","updated":"2024-09-26T17:51:31Z","published":"2024-09-26T17:51:31Z","title":"Language-Embedded Gaussian Splats (LEGS): Incrementally Building\n Room-Scale Representations with a Mobile Robot","summary":" Building semantic 3D maps is valuable for searching for objects of interest\nin offices, warehouses, stores, and homes. We present a mapping system that\nincrementally builds a Language-Embedded Gaussian Splat (LEGS): a detailed 3D\nscene representation that encodes both appearance and semantics in a unified\nrepresentation. LEGS is trained online as a robot traverses its environment to\nenable localization of open-vocabulary object queries. We evaluate LEGS on 4\nroom-scale scenes where we query for objects in the scene to assess how LEGS\ncan capture semantic meaning. We compare LEGS to LERF and find that while both\nsystems have comparable object query success rates, LEGS trains over 3.5x\nfaster than LERF. Results suggest that a multi-camera setup and incremental\nbundle adjustment can boost visual reconstruction quality in constrained robot\ntrajectories, and suggest LEGS can localize open-vocabulary and long-tail\nobject queries with up to 66% accuracy.\n","authors":["Justin Yu","Kush Hari","Kishore Srinivas","Karim El-Refai","Adam Rashid","Chung Min Kim","Justin Kerr","Richard Cheng","Muhammad Zubair Irshad","Ashwin Balakrishna","Thomas Kollar","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2409.18108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18098v1","updated":"2024-09-26T17:41:35Z","published":"2024-09-26T17:41:35Z","title":"StackGen: Generating Stable Structures from Silhouettes via Diffusion","summary":" Humans naturally obtain intuition about the interactions between and the\nstability of rigid objects by observing and interacting with the world. It is\nthis intuition that governs the way in which we regularly configure objects in\nour environment, allowing us to build complex structures from simple, everyday\nobjects. Robotic agents, on the other hand, traditionally require an explicit\nmodel of the world that includes the detailed geometry of each object and an\nanalytical model of the environment dynamics, which are difficult to scale and\npreclude generalization. Instead, robots would benefit from an awareness of\nintuitive physics that enables them to similarly reason over the stable\ninteraction of objects in their environment. Towards that goal, we propose\nStackGen, a diffusion model that generates diverse stable configurations of\nbuilding blocks matching a target silhouette. To demonstrate the capability of\nthe method, we evaluate it in a simulated environment and deploy it in the real\nsetting using a robotic arm to assemble structures generated by the model.\n","authors":["Luzhe Sun","Takuma Yoneda","Samuel W. Wheeler","Tianchong Jiang","Matthew R. Walter"],"pdf_url":"https://arxiv.org/pdf/2409.18098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18097v1","updated":"2024-09-26T17:41:04Z","published":"2024-09-26T17:41:04Z","title":"A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale\n Autonomous Vehicle","summary":" In recent years, several competitions have highlighted the need to\ninvestigate vision-based solutions to address scenarios with functional\ninsufficiencies in perception, world modeling and localization. This article\npresents the Vision-based Lane Keeping System (VbLKS) developed by the\nDEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022.\nThe main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied\nVbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a\ntailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading\nError (LHE), is estimated at a constant lookahead distance employing a\nConvolutional Neural Network (CNN). A training strategy for a compact CNN is\nproposed, emphasizing data generation and augmentation on simulated camera\nimages from a 3D Gazebo simulator, and enabling real-time operation on\nlow-level hardware. A tailored PP-based lateral controller equipped with a\nderivative action and a PP-based velocity reference generation are implemented.\nTuning ranges are established through a systematic time-delay stability\nanalysis. Validation in a representative controlled laboratory setting is\nprovided.\n","authors":["Antonio Gallina","Matteo Grandin","Angelo Cenedese","Mattia Bruschetta"],"pdf_url":"https://arxiv.org/pdf/2409.18097v1.pdf","comment":"16 pages, 23 figures"},{"id":"http://arxiv.org/abs/2409.18092v1","updated":"2024-09-26T17:39:05Z","published":"2024-09-26T17:39:05Z","title":"DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion\n Probabilistic Models","summary":" Perception systems play a crucial role in autonomous driving, incorporating\nmultiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors\nare widely used to capture sparse point clouds of the vehicle's surroundings.\nHowever, such systems struggle to perceive occluded areas and gaps in the scene\ndue to the sparsity of these point clouds and their lack of semantics. To\naddress these challenges, Semantic Scene Completion (SSC) jointly predicts\nunobserved geometry and semantics in the scene given raw LiDAR measurements,\naiming for a more complete scene representation. Building on promising results\nof diffusion models in image generation and super-resolution tasks, we propose\ntheir extension to SSC by implementing the noising and denoising diffusion\nprocesses in the point and semantic spaces individually. To control the\ngeneration, we employ semantic LiDAR point clouds as conditional input and\ndesign local and global regularization losses to stabilize the denoising\nprocess. We evaluate our approach on autonomous driving datasets and our\napproach outperforms the state-of-the-art for SSC.\n","authors":["Helin Cao","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2409.18092v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.18084v1","updated":"2024-09-26T17:27:15Z","published":"2024-09-26T17:27:15Z","title":"GSON: A Group-based Social Navigation Framework with Large Multimodal\n Model","summary":" As the number of service robots and autonomous vehicles in human-centered\nenvironments grows, their requirements go beyond simply navigating to a\ndestination. They must also take into account dynamic social contexts and\nensure respect and comfort for others in shared spaces, which poses significant\nchallenges for perception and planning. In this paper, we present a group-based\nsocial navigation framework GSON to enable mobile robots to perceive and\nexploit the social group of their surroundings by leveling the visual reasoning\ncapability of the Large Multimodal Model (LMM). For perception, we apply visual\nprompting techniques to zero-shot extract the social relationship among\npedestrians and combine the result with a robust pedestrian detection and\ntracking pipeline to alleviate the problem of low inference speed of the LMM.\nGiven the perception result, the planning system is designed to avoid\ndisrupting the current social structure. We adopt a social structure-based\nmid-level planner as a bridge between global path planning and local motion\nplanning to preserve the global context and reactive response. The proposed\nmethod is validated on real-world mobile robot navigation tasks involving\ncomplex social structure understanding and reasoning. Experimental results\ndemonstrate the effectiveness of the system in these scenarios compared with\nseveral baselines.\n","authors":["Shangyi Luo","Ji Zhu","Peng Sun","Yuhong Deng","Cunjun Yu","Anxing Xiao","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18082v1","updated":"2024-09-26T17:26:16Z","published":"2024-09-26T17:26:16Z","title":"SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language\n Models for Robotic Garment Manipulation","summary":" Automating garment manipulation poses a significant challenge for assistive\nrobotics due to the diverse and deformable nature of garments. Traditional\napproaches typically require separate models for each garment type, which\nlimits scalability and adaptability. In contrast, this paper presents a unified\napproach using vision-language models (VLMs) to improve keypoint prediction\nacross various garment categories. By interpreting both visual and semantic\ninformation, our model enables robots to manage different garment states with a\nsingle model. We created a large-scale synthetic dataset using advanced\nsimulation techniques, allowing scalable training without extensive real-world\ndata. Experimental results indicate that the VLM-based method significantly\nenhances keypoint detection accuracy and task success rates, providing a more\nflexible and general solution for robotic garment manipulation. In addition,\nthis research also underscores the potential of VLMs to unify various garment\nmanipulation tasks within a single framework, paving the way for broader\napplications in home automation and assistive robotics for future.\n","authors":["Xin Li","Siyuan Huang","Qiaojun Yu","Zhengkai Jiang","Ce Hao","Yimeng Zhu","Hongsheng Li","Peng Gao","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18053v1","updated":"2024-09-26T16:58:04Z","published":"2024-09-26T16:58:04Z","title":"DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving","summary":" We present a novel autonomous driving framework, DualAD, designed to imitate\nhuman reasoning during driving. DualAD comprises two layers: a rule-based\nmotion planner at the bottom layer that handles routine driving tasks requiring\nminimal reasoning, and an upper layer featuring a rule-based text encoder that\nconverts driving scenarios from absolute states into text description. This\ntext is then processed by a large language model (LLM) to make driving\ndecisions. The upper layer intervenes in the bottom layer's decisions when\npotential danger is detected, mimicking human reasoning in critical situations.\nClosed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained\nmodel, significantly outperforms rule-based motion planners that lack reasoning\nabilities. Our experiments also highlight the effectiveness of the text\nencoder, which considerably enhances the model's scenario understanding.\nAdditionally, the integrated DualAD model improves with stronger LLMs,\nindicating the framework's potential for further enhancement. We make code and\nbenchmarks publicly available.\n","authors":["Dingrui Wang","Marc Kaufeld","Johannes Betz"],"pdf_url":"https://arxiv.org/pdf/2409.18053v1.pdf","comment":"Autonomous Driving, Large Language Models (LLMs), Human Reasoning,\n Critical Scenario"},{"id":"http://arxiv.org/abs/2409.18052v1","updated":"2024-09-26T16:55:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems - which account for almost all current\nAI - can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborates on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18047v1","updated":"2024-09-26T16:48:21Z","published":"2024-09-26T16:48:21Z","title":"HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams","summary":" This paper presents a novel approach to multi-robot planning and\ncollaboration. We demonstrate a cognitive strategy for robots in human-robot\nteams that incorporates metacognition, natural language communication, and\nexplainability. The system is embodied using the HARMONIC architecture that\nflexibly integrates cognitive and control capabilities across the team. We\nevaluate our approach through simulation experiments involving a joint search\ntask by a team of heterogeneous robots (a UGV and a drone) and a human. We\ndetail the system's handling of complex, real-world scenarios, effective action\ncoordination between robots with different capabilities, and natural\nhuman-robot communication. This work demonstrates that the robots' ability to\nreason about plans, goals, and attitudes, and to provide explanations for\nactions and decisions are essential prerequisites for realistic human-robot\nteaming.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18047v1.pdf","comment":"Submitted to ICRA 2025 Conference, Atlanta, GA, USA"},{"id":"http://arxiv.org/abs/2409.18038v1","updated":"2024-09-26T16:42:53Z","published":"2024-09-26T16:42:53Z","title":"MMDVS-LF: A Multi-Modal Dynamic-Vision-Sensor Line Following Dataset","summary":" Dynamic Vision Sensors (DVS), offer a unique advantage in control\napplications, due to their high temporal resolution, and asynchronous\nevent-based data. Still, their adoption in machine learning algorithms remains\nlimited. To address this gap, and promote the development of models that\nleverage the specific characteristics of DVS data, we introduce the Multi-Modal\nDynamic-Vision-Sensor Line Following dataset (MMDVS-LF). This comprehensive\ndataset, is the first to integrate multiple sensor modalities, including DVS\nrecordings, RGB video, odometry, and Inertial Measurement Unit (IMU) data, from\na small-scale standardized vehicle. Additionally, the dataset includes\neye-tracking and demographic data of drivers performing a Line Following task\non a track. With its diverse range of data, MMDVS-LF opens new opportunities\nfor developing deep learning algorithms, and conducting data science projects\nacross various domains, supporting innovation in autonomous systems and control\napplications.\n","authors":["Felix Resch","Mónika Farsang","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2409.18038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18037v1","updated":"2024-09-26T16:42:13Z","published":"2024-09-26T16:42:13Z","title":"HARMONIC: A Framework for Explanatory Cognitive Robots","summary":" We present HARMONIC, a framework for implementing cognitive robots that\ntransforms general-purpose robots into trusted teammates capable of complex\ndecision-making, natural communication and human-level explanation. The\nframework supports interoperability between a strategic (cognitive) layer for\nhigh-level decision-making and a tactical (robot) layer for low-level control\nand execution. We describe the core features of the framework and our initial\nimplementation, in which HARMONIC was deployed on a simulated UGV and drone\ninvolved in a multi-robot search and retrieval task.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18037v1.pdf","comment":"Accepted for presentation at ICRA@40. 23-26 September 2024,\n Rotterdam, Netherlands"},{"id":"http://arxiv.org/abs/2409.18031v1","updated":"2024-09-26T16:38:44Z","published":"2024-09-26T16:38:44Z","title":"Reasoning Multi-Agent Behavioral Topology for Interactive Autonomous\n Driving","summary":" Autonomous driving system aims for safe and social-consistent driving through\nthe behavioral integration among interactive agents. However, challenges remain\ndue to multi-agent scene uncertainty and heterogeneous interaction. Current\ndense and sparse behavioral representations struggle with inefficiency and\ninconsistency in multi-agent modeling, leading to instability of collective\nbehavioral patterns when integrating prediction and planning (IPP). To address\nthis, we initiate a topological formation that serves as a compliant behavioral\nforeground to guide downstream trajectory generations. Specifically, we\nintroduce Behavioral Topology (BeTop), a pivotal topological formulation that\nexplicitly represents the consensual behavioral pattern among multi-agent\nfuture. BeTop is derived from braid theory to distill compliant interactive\ntopology from multi-agent future trajectories. A synergistic learning framework\n(BeTopNet) supervised by BeTop facilitates the consistency of behavior\nprediction and planning within the predicted topology priors. Through imitative\ncontingency learning, BeTop also effectively manages behavioral uncertainty for\nprediction and planning. Extensive verification on large-scale real-world\ndatasets, including nuPlan and WOMD, demonstrates that BeTop achieves\nstate-of-the-art performance in both prediction and planning tasks. Further\nvalidations on the proposed interactive scenario benchmark showcase planning\ncompliance in interactive cases.\n","authors":["Haochen Liu","Li Chen","Yu Qiao","Chen Lv","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2409.18031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.18026v1","updated":"2024-09-26T16:33:16Z","published":"2024-09-26T16:33:16Z","title":"ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty\n Learning","summary":" Vision-centric semantic occupancy prediction plays a crucial role in\nautonomous driving, which requires accurate and reliable predictions from\nlow-cost sensors. Although having notably narrowed the accuracy gap with LiDAR,\nthere is still few research effort to explore the reliability in predicting\nsemantic occupancy from camera. In this paper, we conduct a comprehensive\nevaluation of existing semantic occupancy prediction models from a reliability\nperspective for the first time. Despite the gradual alignment of camera-based\nmodels with LiDAR in term of accuracy, a significant reliability gap persists.\nTo addresses this concern, we propose ReliOcc, a method designed to enhance the\nreliability of camera-based occupancy networks. ReliOcc provides a\nplug-and-play scheme for existing models, which integrates hybrid uncertainty\nfrom individual voxels with sampling-based noise and relative voxels through\nmix-up learning. Besides, an uncertainty-aware calibration strategy is devised\nto further enhance model reliability in offline mode. Extensive experiments\nunder various settings demonstrate that ReliOcc significantly enhances model\nreliability while maintaining the accuracy of both geometric and semantic\npredictions. Importantly, our proposed approach exhibits robustness to sensor\nfailures and out of domain noises during inference.\n","authors":["Song Wang","Zhongdao Wang","Jiawei Yu","Wentong Li","Bailan Feng","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.18026v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2409.18009v1","updated":"2024-09-26T16:19:37Z","published":"2024-09-26T16:19:37Z","title":"Control Industrial Automation System with Large Language Models","summary":" Traditional industrial automation systems require specialized expertise to\noperate and complex reprogramming to adapt to new processes. Large language\nmodels offer the intelligence to make them more flexible and easier to use.\nHowever, LLMs' application in industrial settings is underexplored. This paper\nintroduces a framework for integrating LLMs to achieve end-to-end control of\nindustrial automation systems. At the core of the framework are an agent system\ndesigned for industrial tasks, a structured prompting method, and an\nevent-driven information modeling mechanism that provides real-time data for\nLLM inference. The framework supplies LLMs with real-time events on different\ncontext semantic levels, allowing them to interpret the information, generate\nproduction plans, and control operations on the automation system. It also\nsupports structured dataset creation for fine-tuning on this downstream\napplication of LLMs. Our contribution includes a formal system design,\nproof-of-concept implementation, and a method for generating task-specific\ndatasets for LLM fine-tuning and testing. This approach enables a more adaptive\nautomation system that can respond to spontaneous events, while allowing easier\noperation and configuration through natural language for more intuitive\nhuman-machine interaction. We provide demo videos and detailed data on GitHub:\nhttps://github.com/YuchenXia/LLM4IAS\n","authors":["Yuchen Xia","Nasser Jazdi","Jize Zhang","Chaitanya Shah","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2409.18009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08113v3","updated":"2024-09-26T16:14:54Z","published":"2024-06-12T11:50:51Z","title":"Valeo4Cast: A Modular Approach to End-to-End Forecasting","summary":" Motion forecasting is crucial in autonomous driving systems to anticipate the\nfuture trajectories of surrounding agents such as pedestrians, vehicles, and\ntraffic signals. In end-to-end forecasting, the model must jointly detect and\ntrack from sensor data (cameras or LiDARs) the past trajectories of the\ndifferent elements of the scene and predict their future locations. We depart\nfrom the current trend of tackling this task via end-to-end training from\nperception to forecasting, and instead use a modular approach. We individually\nbuild and train detection, tracking and forecasting modules. We then only use\nconsecutive finetuning steps to integrate the modules better and alleviate\ncompounding errors. We conduct an in-depth study on the finetuning strategies\nand it reveals that our simple yet effective approach significantly improves\nperformance on the end-to-end forecasting benchmark. Consequently, our solution\nranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82\nmAPf. We surpass forecasting results by +17.1 points over last year's winner\nand by +13.3 points over this year's runner-up. This remarkable performance in\nforecasting can be explained by our modular paradigm, which integrates\nfinetuning strategies and significantly outperforms the end-to-end-trained\ncounterparts. The code, model weights and results are made available\nhttps://github.com/valeoai/valeo4cast.\n","authors":["Yihong Xu","Éloi Zablocki","Alexandre Boulch","Gilles Puy","Mickael Chen","Florent Bartoccioni","Nermin Samet","Oriane Siméoni","Spyros Gidaris","Tuan-Hung Vu","Andrei Bursuc","Eduardo Valle","Renaud Marlet","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2406.08113v3.pdf","comment":"Winning solution of the Argoverse 2 \"Unified Detection, Tracking, and\n Forecasting\" challenge; work accepted at Road++ ECCVW 2024"},{"id":"http://arxiv.org/abs/2409.17995v1","updated":"2024-09-26T16:07:20Z","published":"2024-09-26T16:07:20Z","title":"Joint Localization and Planning using Diffusion","summary":" Diffusion models have been successfully applied to robotics problems such as\nmanipulation and vehicle path planning. In this work, we explore their\napplication to end-to-end navigation -- including both perception and planning\n-- by considering the problem of jointly performing global localization and\npath planning in known but arbitrary 2D environments. In particular, we\nintroduce a diffusion model which produces collision-free paths in a global\nreference frame given an egocentric LIDAR scan, an arbitrary map, and a desired\ngoal position. To this end, we implement diffusion in the space of paths in\nSE(2), and describe how to condition the denoising process on both obstacles\nand sensor observations. In our evaluation, we show that the proposed\nconditioning techniques enable generalization to realistic maps of considerably\ndifferent appearance than the training environment, demonstrate our model's\nability to accurately describe ambiguous solutions, and run extensive\nsimulation experiments showcasing our model's use as a real-time, end-to-end\nlocalization and planning stack.\n","authors":["L. Lao Beyer","S. Karaman"],"pdf_url":"https://arxiv.org/pdf/2409.17995v1.pdf","comment":"7 pages, 9 figures. Submitted to ICRA 2025, under review"},{"id":"http://arxiv.org/abs/2409.17992v1","updated":"2024-09-26T16:02:25Z","published":"2024-09-26T16:02:25Z","title":"LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged\n Robots","summary":" Reinforcement Learning (RL) has shown its remarkable and generalizable\ncapability in legged locomotion through sim-to-real transfer. However, while\nadaptive methods like domain randomization are expected to make policy more\nrobust to diverse environments, such comprehensiveness potentially detracts\nfrom the policy's performance in any specific environment according to the No\nFree Lunch theorem, leading to a suboptimal solution once deployed in the real\nworld. To address this issue, we propose a lifelong policy adaptation framework\nnamed LoopSR, which utilizes a transformer-based encoder to project real-world\ntrajectories into a latent space, and accordingly reconstruct the real-world\nenvironments back in simulation for further improvement. Autoencoder\narchitecture and contrastive learning methods are adopted to better extract the\ncharacteristics of real-world dynamics. The simulation parameters for continual\ntraining are derived by combining predicted parameters from the decoder with\nretrieved parameters from the simulation trajectory dataset. By leveraging the\ncontinual training, LoopSR achieves superior data efficiency compared with\nstrong baselines, with only a limited amount of data to yield eminent\nperformance in both sim-to-sim and sim-to-real experiments.\n","authors":["Peilin Wu","Weiji Xie","Jiahang Cao","Hang Lai","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17992v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.17988v1","updated":"2024-09-26T15:57:20Z","published":"2024-09-26T15:57:20Z","title":"Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or\n Low-light Conditions","summary":" The stark contrast in the design philosophy of an event camera makes it\nparticularly ideal for operating under high-speed, high dynamic range and\nlow-light conditions, where standard cameras underperform. Nonetheless, event\ncameras still suffer from some amount of motion blur, especially under these\nchallenging conditions, in contrary to what most think. This is attributed to\nthe limited bandwidth of the event sensor pixel, which is mostly proportional\nto the light intensity. Thus, to ensure that event cameras can truly excel in\nsuch conditions where it has an edge over standard cameras, it is crucial to\naccount for event motion blur in downstream applications, especially\nreconstruction. However, none of the recent works on reconstructing Neural\nRadiance Fields (NeRFs) from events, nor event simulators, have considered the\nfull effects of event motion blur. To this end, we propose, Deblur e-NeRF, a\nnovel method to directly and effectively reconstruct blur-minimal NeRFs from\nmotion-blurred events generated under high-speed motion or low-light\nconditions. The core component of this work is a physically-accurate pixel\nbandwidth model proposed to account for event motion blur under arbitrary speed\nand lighting conditions. We also introduce a novel threshold-normalized total\nvariation loss to improve the regularization of large textureless patches.\nExperiments on real and novel realistically simulated sequences verify our\neffectiveness. Our code, event simulator and synthetic event dataset will be\nopen-sourced.\n","authors":["Weng Fei Low","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17988v1.pdf","comment":"Accepted to ECCV 2024. Project website is accessible at\n https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with\n arXiv:2006.07722 by other authors"},{"id":"http://arxiv.org/abs/2312.14950v2","updated":"2024-09-26T15:45:13Z","published":"2023-12-08T15:57:18Z","title":"TypeFly: Flying Drones with Large Language Model","summary":" Recent advancements in robot control using large language models (LLMs) have\ndemonstrated significant potential, primarily due to LLMs' capabilities to\nunderstand natural language commands and generate executable plans in various\nlanguages. However, in real-time and interactive applications involving mobile\nrobots, particularly drones, the sequential token generation process inherent\nto LLMs introduces substantial latency, i.e. response time, in control plan\ngeneration.\n In this paper, we present a system called ChatFly that tackles this problem\nusing a combination of a novel programming language called MiniSpec and its\nruntime to reduce the plan generation time and drone response time. That is,\ninstead of asking an LLM to write a program (robotic plan) in the popular but\nverbose Python, ChatFly gets it to do it in MiniSpec specially designed for\ntoken efficiency and stream interpretation. Using a set of challenging drone\ntasks, we show that design choices made by ChatFly can reduce up to 62%\nresponse time and provide a more consistent user experience, enabling\nresponsive and intelligent LLM-based drone control with efficient completion.\n","authors":["Guojun Chen","Xiaojing Yu","Neiwen Ling","Lin Zhong"],"pdf_url":"https://arxiv.org/pdf/2312.14950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14115v4","updated":"2024-09-26T15:30:00Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Visual Question Answering for Autonomous Driving","summary":" We introduce LingoQA, a novel dataset and benchmark for visual question\nanswering in autonomous driving. The dataset contains 28K unique short video\nscenarios, and 419K annotations. Evaluating state-of-the-art vision-language\nmodels on our benchmark shows that their performance is below human\ncapabilities, with GPT-4V responding truthfully to 59.6% of the questions\ncompared to 96.6% for humans. For evaluation, we propose a truthfulness\nclassifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation\ncoefficient to human evaluations, surpassing existing techniques like METEOR,\nBLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run\nextensive ablation studies to understand its performance. We release our\ndataset and benchmark as an evaluation platform for vision-language models in\nautonomous driving.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v4.pdf","comment":"Accepted to ECCV 2024. Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2409.17896v1","updated":"2024-09-26T14:47:14Z","published":"2024-09-26T14:47:14Z","title":"Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV\n Attitude Control Under Varying Wind Conditions","summary":" This paper evaluates and compares the performance of model-free and\nmodel-based reinforcement learning for the attitude control of fixed-wing\nunmanned aerial vehicles using PID as a reference point. The comparison focuses\non their ability to handle varying flight dynamics and wind disturbances in a\nsimulated environment. Our results show that the Temporal Difference Model\nPredictive Control agent outperforms both the PID controller and other\nmodel-free reinforcement learning methods in terms of tracking accuracy and\nrobustness over different reference difficulties, particularly in nonlinear\nflight regimes. Furthermore, we introduce actuation fluctuation as a key metric\nto assess energy efficiency and actuator wear, and we test two different\napproaches from the literature: action variation penalty and conditioning for\naction policy smoothness. We also evaluate all control methods when subject to\nstochastic turbulence and gusts separately, so as to measure their effects on\ntracking performance, observe their limitations and outline their implications\non the Markov decision process formalism.\n","authors":["David Olivares","Pierre Fournier","Pavan Vasishta","Julien Marzat"],"pdf_url":"https://arxiv.org/pdf/2409.17896v1.pdf","comment":"Published at ICINCO 2024"},{"id":"http://arxiv.org/abs/2404.00769v2","updated":"2024-09-26T14:18:03Z","published":"2024-03-31T18:51:52Z","title":"An Active Perception Game for Robust Information Gathering","summary":" Active perception approaches select future viewpoints by using some estimate\nof the information gain. An inaccurate estimate can be detrimental in critical\nsituations, e.g., locating a person in distress. However the true information\ngained can only be calculated post hoc, i.e., after the observation is\nrealized. We present an approach for estimating the discrepancy between the\ninformation gain (which is the average over putative future observations) and\nthe true information gain. The key idea is to analyze the mathematical\nrelationship between active perception and the estimation error of the\ninformation gain in a game-theoretic setting. Using this, we develop an online\nestimation approach that achieves sub-linear regret (in the number of\ntime-steps) for the estimation of the true information gain and reduces the\nsub-optimality of active perception systems.\n We demonstrate our approach for active perception using a comprehensive set\nof experiments on: (a) different types of environments, including a quadrotor\nin a photorealistic simulation, real-world robotic data, and real-world\nexperiments with ground robots exploring indoor and outdoor scenes; (b)\ndifferent types of robotic perception data; and (c) different map\nrepresentations. On average, our approach reduces information gain estimation\nerrors by 42%, increases the information gain by 7%, PSNR by 5%, and semantic\naccuracy (measured as the number of objects that are localized correctly) by\n6%. In real-world experiments with a Jackal ground robot, our approach\ndemonstrated complex trajectories to explore occluded regions.\n","authors":["Siming He","Yuezhan Tao","Igor Spasojevic","Vijay Kumar","Pratik Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2404.00769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04693v2","updated":"2024-09-26T13:53:33Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A Colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, fusing data from these two types\nof sensors is poorly performed in many existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v2.pdf","comment":"2024 IEEE International Conference on Robotics and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17798v1","updated":"2024-09-26T12:47:36Z","published":"2024-09-26T12:47:36Z","title":"Swarm-LIO2: Decentralized, Efficient LiDAR-inertial Odometry for UAV\n Swarms","summary":" Aerial swarm systems possess immense potential in various aspects, such as\ncooperative exploration, target tracking, search and rescue. Efficient,\naccurate self and mutual state estimation are the critical preconditions for\ncompleting these swarm tasks, which remain challenging research topics. This\npaper proposes Swarm-LIO2: a fully decentralized, plug-and-play,\ncomputationally efficient, and bandwidth-efficient LiDAR-inertial odometry for\naerial swarm systems. Swarm-LIO2 uses a decentralized, plug-and-play network as\nthe communication infrastructure. Only bandwidth-efficient and low-dimensional\ninformation is exchanged, including identity, ego-state, mutual observation\nmeasurements, and global extrinsic transformations. To support the\nplug-and-play of new teammate participants, Swarm-LIO2 detects potential\nteammate UAVs and initializes the temporal offset and global extrinsic\ntransformation all automatically. To enhance the initialization efficiency,\nnovel reflectivity-based UAV detection, trajectory matching, and factor graph\noptimization methods are proposed. For state estimation, Swarm-LIO2 fuses\nLiDAR, IMU, and mutual observation measurements within an efficient ESIKF\nframework, with careful compensation of temporal delay and modeling of\nmeasurements to enhance the accuracy and consistency.\n","authors":["Fangcheng Zhu","Yunfan Ren","Longji Yin","Fanze Kong","Qingbo Liu","Ruize Xue","Wenyi Liu","Yixi Cai","Guozheng Lu","Haotian Li","Fu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17798v1.pdf","comment":"23 Pages"},{"id":"http://arxiv.org/abs/2404.06926v2","updated":"2024-09-26T12:31:23Z","published":"2024-04-10T11:24:34Z","title":"Gaussian-LIC: Real-Time Photo-Realistic SLAM with Gaussian Splatting and\n LiDAR-Inertial-Camera Fusion","summary":" In this paper, we present a real-time photo-realistic SLAM method based on\nmarrying Gaussian Splatting with LiDAR-Inertial-Camera SLAM. Most existing\nradiance-field-based SLAM systems mainly focus on bounded indoor environments,\nequipped with RGB-D or RGB sensors. However, they are prone to decline when\nexpanding to unbounded scenes or encountering adverse conditions, such as\nviolent motions and changing illumination. In contrast, oriented to general\nscenarios, our approach additionally tightly fuses LiDAR, IMU, and camera for\nrobust pose estimation and photo-realistic online mapping. To compensate for\nregions unobserved by the LiDAR, we propose to integrate both the triangulated\nvisual points from images and LiDAR points for initializing 3D Gaussians. In\naddition, the modeling of the sky and varying camera exposure have been\nrealized for high-quality rendering. Notably, we implement our system purely\nwith C++ and CUDA, and meticulously design a series of strategies to accelerate\nthe online optimization of the Gaussian-based scene representation. Extensive\nexperiments demonstrate that our method outperforms its counterparts while\nmaintaining real-time capability. Impressively, regarding photo-realistic\nmapping, our method with our estimated poses even surpasses all the compared\napproaches that utilize privileged ground-truth poses for mapping. Our code\nwill be released on project page https://xingxingzuo.github.io/gaussian_lic.\n","authors":["Xiaolei Lang","Laijian Li","Chenming Wu","Chen Zhao","Lina Liu","Yong Liu","Jiajun Lv","Xingxing Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.06926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07865v4","updated":"2024-09-26T12:18:49Z","published":"2024-05-13T15:53:18Z","title":"AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous\n Driving","summary":" The scale-up of autonomous vehicles depends heavily on their ability to deal\nwith anomalies, such as rare objects on the road. In order to handle such\nsituations, it is necessary to detect anomalies in the first place. Anomaly\ndetection for autonomous driving has made great progress in the past years but\nsuffers from poorly designed benchmarks with a strong focus on camera data. In\nthis work, we propose AnoVox, the largest benchmark for ANOmaly detection in\nautonomous driving to date. AnoVox incorporates large-scale multimodal sensor\ndata and spatial VOXel ground truth, allowing for the comparison of methods\nindependent of their used sensor. We propose a formal definition of normality\nand provide a compliant training dataset. AnoVox is the first benchmark to\ncontain both content and temporal anomalies.\n","authors":["Daniel Bogdoll","Iramm Hamdard","Lukas Namgyu Rößler","Felix Geisler","Muhammed Bayram","Felix Wang","Jan Imhof","Miguel de Campos","Anushervon Tabarov","Yitian Yang","Hanno Gottschalk","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2405.07865v4.pdf","comment":"Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\\\"o{\\ss}ler\n contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop"},{"id":"http://arxiv.org/abs/2409.17755v1","updated":"2024-09-26T11:40:07Z","published":"2024-09-26T11:40:07Z","title":"SECURE: Semantics-aware Embodied Conversation under Unawareness for\n Lifelong Robot Learning","summary":" This paper addresses a challenging interactive task learning scenario we call\nrearrangement under unawareness: to manipulate a rigid-body environment in a\ncontext where the robot is unaware of a concept that's key to solving the\ninstructed task. We propose SECURE, an interactive task learning framework\ndesigned to solve such problems by fixing a deficient domain model using\nembodied conversation. Through dialogue, the robot discovers and then learns to\nexploit unforeseen possibilities. Using SECURE, the robot not only learns from\nthe user's corrective feedback when it makes a mistake, but it also learns to\nmake strategic dialogue decisions for revealing useful evidence about novel\nconcepts for solving the instructed task. Together, these abilities allow the\nrobot to generalise to subsequent tasks using newly acquired knowledge. We\ndemonstrate that a robot that is semantics-aware -- that is, it exploits the\nlogical consequences of both sentence and discourse semantics in the learning\nand inference process -- learns to solve rearrangement under unawareness more\neffectively than a robot that lacks such capabilities.\n","authors":["Rimvydas Rubavicius","Peter David Fagan","Alex Lascarides","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.17755v1.pdf","comment":"10 pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.10759v2","updated":"2024-09-26T11:13:10Z","published":"2024-06-15T23:21:10Z","title":"Humanoid Parkour Learning","summary":" Parkour is a grand challenge for legged locomotion, even for quadruped\nrobots, requiring active perception and various maneuvers to overcome multiple\nchallenging obstacles. Existing methods for humanoid locomotion either optimize\na trajectory for a single parkour track or train a reinforcement learning\npolicy only to walk with a significant amount of motion references. In this\nwork, we propose a framework for learning an end-to-end vision-based\nwhole-body-control parkour policy for humanoid robots that overcomes multiple\nparkour skills without any motion prior. Using the parkour policy, the humanoid\nrobot can jump on a 0.42m platform, leap over hurdles, 0.8m gaps, and much\nmore. It can also run at 1.8m/s in the wild and walk robustly on different\nterrains. We test our policy in indoor and outdoor environments to demonstrate\nthat it can autonomously select parkour skills while following the rotation\ncommand of the joystick. We override the arm actions and show that this\nframework can easily transfer to humanoid mobile manipulation tasks. Videos can\nbe found at https://humanoid4parkour.github.io\n","authors":["Ziwen Zhuang","Shenzhe Yao","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.10759v2.pdf","comment":"Published on CoRL 2024"},{"id":"http://arxiv.org/abs/2409.17731v1","updated":"2024-09-26T11:01:00Z","published":"2024-09-26T11:01:00Z","title":"Robust Ladder Climbing with a Quadrupedal Robot","summary":" Quadruped robots are proliferating in industrial environments where they\ncarry sensor suites and serve as autonomous inspection platforms. Despite the\nadvantages of legged robots over their wheeled counterparts on rough and uneven\nterrain, they are still yet to be able to reliably negotiate ubiquitous\nfeatures of industrial infrastructure: ladders. Inability to traverse ladders\nprevents quadrupeds from inspecting dangerous locations, puts humans in harm's\nway, and reduces industrial site productivity. In this paper, we learn\nquadrupedal ladder climbing via a reinforcement learning-based control policy\nand a complementary hooked end-effector. We evaluate the robustness in\nsimulation across different ladder inclinations, rung geometries, and\ninter-rung spacings. On hardware, we demonstrate zero-shot transfer with an\noverall 90% success rate at ladder angles ranging from 70{\\deg} to 90{\\deg},\nconsistent climbing performance during unmodeled perturbations, and climbing\nspeeds 232x faster than the state of the art. This work expands the scope of\nindustrial quadruped robot applications beyond inspection on nominal terrains\nto challenging infrastructural features in the environment, highlighting\nsynergies between robot morphology and control policy when performing complex\nskills. More information can be found at the project website:\nhttps://sites.google.com/leggedrobotics.com/climbingladders.\n","authors":["Dylan Vogel","Robert Baines","Joseph Church","Julian Lotzer","Karl Werner","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2409.17731v1.pdf","comment":"Project website:\n https://sites.google.com/leggedrobotics.com/climbingladders"},{"id":"http://arxiv.org/abs/2409.17727v1","updated":"2024-09-26T10:56:35Z","published":"2024-09-26T10:56:35Z","title":"Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications","summary":" Vision language models have played a key role in extracting meaningful\nfeatures for various robotic applications. Among these, Contrastive\nLanguage-Image Pretraining (CLIP) is widely used in robotic tasks that require\nboth vision and natural language understanding. However, CLIP was trained\nsolely on static images paired with text prompts and has not yet been fully\nadapted for robotic tasks involving dynamic actions. In this paper, we\nintroduce Robotic-CLIP to enhance robotic perception capabilities. We first\ngather and label large-scale action data, and then build our Robotic-CLIP by\nfine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using\ncontrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's\nstrong image performance while gaining the ability to understand actions in\nrobotic contexts. Intensive experiments show that our Robotic-CLIP outperforms\nother CLIP-based models across various language-driven robotic tasks.\nAdditionally, we demonstrate the practical effectiveness of Robotic-CLIP in\nreal-world grasping applications.\n","authors":["Nghia Nguyen","Minh Nhat Vu","Tung D. Ta","Baoru Huang","Thieu Vo","Ngan Le","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.17727v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.17725v1","updated":"2024-09-26T10:55:07Z","published":"2024-09-26T10:55:07Z","title":"Stable Object Placement Under Geometric Uncertainty via Differentiable\n Contact Dynamics","summary":" From serving a cup of coffee to carefully rearranging delicate items, stable\nobject placement is a crucial skill for future robots. This skill is\nchallenging due to the required accuracy, which is difficult to achieve under\ngeometric uncertainty. We leverage differentiable contact dynamics to develop a\nprincipled method for stable object placement under geometric uncertainty. We\nestimate the geometric uncertainty by minimizing the discrepancy between the\nforce-torque sensor readings and the model predictions through gradient\ndescent. We further keep track of a belief over multiple possible geometric\nparameters to mitigate the gradient-based method's sensitivity to the\ninitialization. We verify our approach in the real world on various geometric\nuncertainties, including the in-hand pose uncertainty of the grasped object,\nthe object's shape uncertainty, and the environment's shape uncertainty.\n","authors":["Linfeng Li","Gang Yang","Lin Shao","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2409.17725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08160v2","updated":"2024-09-26T10:54:32Z","published":"2024-08-15T13:49:14Z","title":"General-purpose Clothes Manipulation with Semantic Keypoints","summary":" Clothes manipulation is a critical skill for household robots. Recent\nadvancements have been made in task-specific clothes manipulation, such as\nfolding, flattening, and hanging. However, due to clothes' complex geometries\nand deformability, creating a general-purpose robot system that can manipulate\na diverse range of clothes in many ways remains challenging. Since clothes are\ntypically designed with specific structures, we propose identifying these\nspecific features like ``left sleeve'' as semantic keypoints. Semantic\nkeypoints can provide semantic cues for task planning and geometric cues for\nlow-level action generation. With this insight, we develop a hierarchical\nlearning framework using the large language model (LLM) for general-purpose\nCLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation\nexperiments show that CLASP outperforms baseline methods on both seen and\nunseen tasks across various clothes manipulation tasks. Real-world experiments\nshow that CLASP can be directly deployed in the real world and applied to a\nwide variety of clothes.\n","authors":["Yuhong Deng","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.08160v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17720v1","updated":"2024-09-26T10:43:09Z","published":"2024-09-26T10:43:09Z","title":"Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations\n Between Initial and Final Scenes","summary":" With robots increasingly collaborating with humans in everyday tasks, it is\nimportant to take steps toward robotic systems capable of understanding the\nenvironment. This work focuses on scene understanding to detect pick and place\ntasks given initial and final images from the scene. To this end, a dataset is\ncollected for object detection and pick and place task detection. A YOLOv5\nnetwork is subsequently trained to detect the objects in the initial and final\nscenes. Given the detected objects and their bounding boxes, two methods are\nproposed to detect the pick and place tasks which transform the initial scene\ninto the final scene. A geometric method is proposed which tracks objects'\nmovements in the two scenes and works based on the intersection of the bounding\nboxes which moved within scenes. Contrarily, the CNN-based method utilizes a\nConvolutional Neural Network to classify objects with intersected bounding\nboxes into 5 classes, showing the spatial relationship between the involved\nobjects. The performed pick and place tasks are then derived from analyzing the\nexperiments with both scenes. Results show that the CNN-based method, using a\nVGG16 backbone, outscores the geometric method by roughly 12 percentage points\nin certain scenarios, with an overall success rate of 84.3%.\n","authors":["Seraj Ghasemi","Hamed Hosseini","MohammadHossein Koosheshi","Mehdi Tale Masouleh","Ahmad Kalhor"],"pdf_url":"https://arxiv.org/pdf/2409.17720v1.pdf","comment":"Conference Paper, ICEE 2024, 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17702v1","updated":"2024-09-26T10:16:08Z","published":"2024-09-26T10:16:08Z","title":"Episodic Memory Verbalization using Hierarchical Representations of\n Life-Long Robot Experience","summary":" Verbalization of robot experience, i.e., summarization of and question\nanswering about a robot's past, is a crucial ability for improving human-robot\ninteraction. Previous works applied rule-based systems or fine-tuned deep\nmodels to verbalize short (several-minute-long) streams of episodic data,\nlimiting generalization and transferability. In our work, we apply large\npretrained models to tackle this task with zero or few examples, and\nspecifically focus on verbalizing life-long experiences. For this, we derive a\ntree-like data structure from episodic memory (EM), with lower levels\nrepresenting raw perception and proprioception data, and higher levels\nabstracting events to natural language concepts. Given such a hierarchical\nrepresentation built from the experience stream, we apply a large language\nmodel as an agent to interactively search the EM given a user's query,\ndynamically expanding (initially collapsed) tree nodes to find the relevant\ninformation. The approach keeps computational costs low even when scaling to\nmonths of robot experience data. We evaluate our method on simulated household\nrobot data, human egocentric videos, and real-world robot recordings,\ndemonstrating its flexibility and scalability.\n","authors":["Leonard Bärmann","Chad DeChant","Joana Plewnia","Fabian Peller-Konrad","Daniel Bauer","Tamim Asfour","Alex Waibel"],"pdf_url":"https://arxiv.org/pdf/2409.17702v1.pdf","comment":"Code, data and demo videos at https://hierarchical-emv.github.io"},{"id":"http://arxiv.org/abs/2409.17680v1","updated":"2024-09-26T09:43:50Z","published":"2024-09-26T09:43:50Z","title":"Event-based Stereo Depth Estimation: A Survey","summary":" Stereopsis has widespread appeal in robotics as it is the predominant way by\nwhich living beings perceive depth to navigate our 3D world. Event cameras are\nnovel bio-inspired sensors that detect per-pixel brightness changes\nasynchronously, with very high temporal resolution and high dynamic range,\nenabling machine perception in high-speed motion and broad illumination\nconditions. The high temporal precision also benefits stereo matching, making\ndisparity (depth) estimation a popular research area for event cameras ever\nsince its inception. Over the last 30 years, the field has evolved rapidly,\nfrom low-latency, low-power circuit design to current deep learning (DL)\napproaches driven by the computer vision community. The bibliography is vast\nand difficult to navigate for non-experts due its highly interdisciplinary\nnature. Past surveys have addressed distinct aspects of this topic, in the\ncontext of applications, or focusing only on a specific class of techniques,\nbut have overlooked stereo datasets. This survey provides a comprehensive\noverview, covering both instantaneous stereo and long-term methods suitable for\nsimultaneous localization and mapping (SLAM), along with theoretical and\nempirical comparisons. It is the first to extensively review DL methods as well\nas stereo datasets, even providing practical suggestions for creating new\nbenchmarks to advance the field. The main advantages and challenges faced by\nevent-based stereo depth estimation are also discussed. Despite significant\nprogress, challenges remain in achieving optimal performance in not only\naccuracy but also efficiency, a cornerstone of event-based computing. We\nidentify several gaps and propose future research directions. We hope this\nsurvey inspires future research in this area, by serving as an accessible entry\npoint for newcomers, as well as a practical guide for seasoned researchers in\nthe community.\n","authors":["Suman Ghosh","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2409.17680v1.pdf","comment":"28 pages, 20 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.17655v1","updated":"2024-09-26T09:06:56Z","published":"2024-09-26T09:06:56Z","title":"AssistantX: An LLM-Powered Proactive Assistant in Collaborative\n Human-Populated Environment","summary":" The increasing demand for intelligent assistants in human-populated\nenvironments has motivated significant research in autonomous robotic systems.\nTraditional service robots and virtual assistants, however, struggle with\nreal-world task execution due to their limited capacity for dynamic reasoning\nand interaction, particularly when human collaboration is required. Recent\ndevelopments in Large Language Models have opened new avenues for improving\nthese systems, enabling more sophisticated reasoning and natural interaction\ncapabilities. In this paper, we introduce AssistantX, an LLM-powered proactive\nassistant designed to operate autonomously in a physical office environment.\nUnlike conventional service robots, AssistantX leverages a novel multi-agent\narchitecture, PPDR4X, which provides advanced inference capabilities and\ncomprehensive collaboration awareness. By effectively bridging the gap between\nvirtual operations and physical interactions, AssistantX demonstrates robust\nperformance in managing complex real-world scenarios. Our evaluation highlights\nthe architecture's effectiveness, showing that AssistantX can respond to clear\ninstructions, actively retrieve supplementary information from memory, and\nproactively seek collaboration from team members to ensure successful task\ncompletion. More details and videos can be found at\nhttps://assistantx-agent.github.io/AssistantX/.\n","authors":["Nan Sun","Bo Mao","Yongchang Li","Lumeng Ma","Di Guo","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17655v1.pdf","comment":"6 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2409.17652v1","updated":"2024-09-26T09:00:30Z","published":"2024-09-26T09:00:30Z","title":"FactorSim: Generative Simulation via Factorized Representation","summary":" Generating simulations to train intelligent agents in game-playing and\nrobotics from natural language input, from user input or task documentation,\nremains an open-ended challenge. Existing approaches focus on parts of this\nchallenge, such as generating reward functions or task hyperparameters. Unlike\nprevious work, we introduce FACTORSIM that generates full simulations in code\nfrom language input that can be used to train agents. Exploiting the structural\nmodularity specific to coded simulations, we propose to use a factored\npartially observable Markov decision process representation that allows us to\nreduce context dependence during each step of the generation. For evaluation,\nwe introduce a generative simulation benchmark that assesses the generated\nsimulation code's accuracy and effectiveness in facilitating zero-shot\ntransfers in reinforcement learning settings. We show that FACTORSIM\noutperforms existing methods in generating simulations regarding prompt\nalignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation.\nWe also demonstrate its effectiveness in generating robotic tasks.\n","authors":["Fan-Yun Sun","S. I. Harini","Angela Yi","Yihan Zhou","Alex Zook","Jonathan Tremblay","Logan Cross","Jiajun Wu","Nick Haber"],"pdf_url":"https://arxiv.org/pdf/2409.17652v1.pdf","comment":"neurips 2024, project website:\n https://cs.stanford.edu/~sunfanyun/factorsim/"},{"id":"http://arxiv.org/abs/2409.17641v1","updated":"2024-09-26T08:44:49Z","published":"2024-09-26T08:44:49Z","title":"AP-VLM: Active Perception Enabled by Vision-Language Models","summary":" Active perception enables robots to dynamically gather information by\nadjusting their viewpoints, a crucial capability for interacting with complex,\npartially observable environments. In this paper, we present AP-VLM, a novel\nframework that combines active perception with a Vision-Language Model (VLM) to\nguide robotic exploration and answer semantic queries. Using a 3D virtual grid\noverlaid on the scene and orientation adjustments, AP-VLM allows a robotic\nmanipulator to intelligently select optimal viewpoints and orientations to\nresolve challenging tasks, such as identifying objects in occluded or inclined\npositions. We evaluate our system on two robotic platforms: a 7-DOF Franka\nPanda and a 6-DOF UR5, across various scenes with differing object\nconfigurations. Our results demonstrate that AP-VLM significantly outperforms\npassive perception methods and baseline models, including Toward Grounded\nCommon Sense Reasoning (TGCSR), particularly in scenarios where fixed camera\nviews are inadequate. The adaptability of AP-VLM in real-world settings shows\npromise for enhancing robotic systems' understanding of complex environments,\nbridging the gap between high-level semantic reasoning and low-level control.\n","authors":["Venkatesh Sripada","Samuel Carter","Frank Guerin","Amir Ghalamzan"],"pdf_url":"https://arxiv.org/pdf/2409.17641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17630v1","updated":"2024-09-26T08:25:05Z","published":"2024-09-26T08:25:05Z","title":"System-Level Safety Monitoring and Recovery for Perception Failures in\n Autonomous Vehicles","summary":" The safety-critical nature of autonomous vehicle (AV) operation necessitates\ndevelopment of task-relevant algorithms that can reason about safety at the\nsystem level and not just at the component level. To reason about the impact of\na perception failure on the entire system performance, such task-relevant\nalgorithms must contend with various challenges: complexity of AV stacks, high\nuncertainty in the operating environments, and the need for real-time\nperformance. To overcome these challenges, in this work, we introduce a\nQ-network called SPARQ (abbreviation for Safety evaluation for Perception And\nRecovery Q-network) that evaluates the safety of a plan generated by a planning\nalgorithm, accounting for perception failures that the planning process may\nhave overlooked. This Q-network can be queried during system runtime to assess\nwhether a proposed plan is safe for execution or poses potential safety risks.\nIf a violation is detected, the network can then recommend a corrective plan\nwhile accounting for the perceptual failure. We validate our algorithm using\nthe NuPlan-Vegas dataset, demonstrating its ability to handle cases where a\nperception failure compromises a proposed plan while the corrective plan\nremains safe. We observe an overall accuracy and recall of 90% while sustaining\na frequency of 42Hz on the unseen testing dataset. We compare our performance\nto a popular reachability-based baseline and analyze some interesting\nproperties of our approach in improving the safety properties of an AV\npipeline.\n","authors":["Kaustav Chakraborty","Zeyuan Feng","Sushant Veer","Apoorva Sharma","Boris Ivanovic","Marco Pavone","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2409.17630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17624v1","updated":"2024-09-26T08:19:21Z","published":"2024-09-26T08:19:21Z","title":"HGS-Planner: Hierarchical Planning Framework for Active Scene\n Reconstruction Using 3D Gaussian Splatting","summary":" In complex missions such as search and rescue,robots must make intelligent\ndecisions in unknown environments, relying on their ability to perceive and\nunderstand their surroundings. High-quality and real-time reconstruction\nenhances situational awareness and is crucial for intelligent robotics.\nTraditional methods often struggle with poor scene representation or are too\nslow for real-time use. Inspired by the efficacy of 3D Gaussian Splatting\n(3DGS), we propose a hierarchical planning framework for fast and high-fidelity\nactive reconstruction. Our method evaluates completion and quality gain to\nadaptively guide reconstruction, integrating global and local planning for\nefficiency. Experiments in simulated and real-world environments show our\napproach outperforms existing real-time methods.\n","authors":["Zijun Xu","Rui Jin","Ke Wu","Yi Zhao","Zhiwei Zhang","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.17624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17621v1","updated":"2024-09-26T08:16:53Z","published":"2024-09-26T08:16:53Z","title":"Leveraging Semantic and Geometric Information for Zero-Shot\n Robot-to-Human Handover","summary":" Human-robot interaction (HRI) encompasses a wide range of collaborative\ntasks, with handover being one of the most fundamental. As robots become more\nintegrated into human environments, the potential for service robots to assist\nin handing objects to humans is increasingly promising. In robot-to-human (R2H)\nhandover, selecting the optimal grasp is crucial for success, as it requires\navoiding interference with the humans preferred grasp region and minimizing\nintrusion into their workspace. Existing methods either inadequately consider\ngeometric information or rely on data-driven approaches, which often struggle\nto generalize across diverse objects. To address these limitations, we propose\na novel zero-shot system that combines semantic and geometric information to\ngenerate optimal handover grasps. Our method first identifies grasp regions\nusing semantic knowledge from vision-language models (VLMs) and, by\nincorporating customized visual prompts, achieves finer granularity in region\ngrounding. A grasp is then selected based on grasp distance and approach angle\nto maximize human ease and avoid interference. We validate our approach through\nablation studies and real-world comparison experiments. Results demonstrate\nthat our system improves handover success rates and provides a more\nuser-preferred interaction experience. Videos, appendixes and more are\navailable at https://sites.google.com/view/vlm-handover/.\n","authors":["Jiangshan Liu","Wenlong Dong","Jiankun Wang","Max Q. -H. Meng"],"pdf_url":"https://arxiv.org/pdf/2409.17621v1.pdf","comment":"6 pages, 5 figures, conference"},{"id":"http://arxiv.org/abs/2409.17618v1","updated":"2024-09-26T08:10:28Z","published":"2024-09-26T08:10:28Z","title":"Learning Occlusion-aware Decision-making from Agent Interaction via\n Active Perception","summary":" Occlusion-aware decision-making is essential in autonomous driving due to the\nhigh uncertainty of various occlusions. Recent occlusion-aware decision-making\nmethods encounter issues such as high computational complexity, scenario\nscalability challenges, or reliance on limited expert data. Benefiting from\nautomatically generating data by exploration randomization, we uncover that\nreinforcement learning (RL) may show promise in occlusion-aware\ndecision-making. However, previous occlusion-aware RL faces challenges in\nexpanding to various dynamic and static occlusion scenarios, low learning\nefficiency, and lack of predictive ability. To address these issues, we\nintroduce Pad-AI, a self-reinforcing framework to learn occlusion-aware\ndecision-making through active perception. Pad-AI utilizes vectorized\nrepresentation to represent occluded environments efficiently and learns over\nthe semantic motion primitives to focus on high-level active perception\nexploration. Furthermore, Pad-AI integrates prediction and RL within a unified\nframework to provide risk-aware learning and security guarantees. Our framework\nwas tested in challenging scenarios under both dynamic and static occlusions\nand demonstrated efficient and general perception-aware exploration performance\nto other strong baselines in closed-loop evaluations.\n","authors":["Jie Jia","Yiming Shu","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.17618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17562v1","updated":"2024-09-26T06:21:52Z","published":"2024-09-26T06:21:52Z","title":"Software for the SpaceDREAM Robotic Arm","summary":" Impedance-controlled robots are widely used on Earth to perform\ninteraction-rich tasks and will be a key enabler for In-Space Servicing,\nAssembly and Manufacturing (ISAM) activities. This paper introduces the\nsoftware architecture used on the On-Board Computer (OBC) for the planned\nSpaceDREAM mission aiming to validate such robotic arm in Lower Earth Orbit\n(LEO) conducted by the German Aerospace Center (DLR) in cooperation with\nKINETIK Space GmbH and the Technical University of Munich (TUM). During the\nmission several free motion as well as contact tasks are to be performed in\norder to verify proper functionality of the robot in position and impedance\ncontrol on joint level as well as in cartesian control. The tasks are selected\nto be representative for subsequent servicing missions e.g. requiring interface\ndocking or precise manipulation.\n The software on the OBC commands the robot's joints via SpaceWire to perform\nthose mission tasks, reads camera images and data from additional sensors and\nsends telemetry data through an Ethernet link via the spacecraft down to Earth.\nIt is set up to execute a predefined mission after receiving a start signal\nfrom the spacecraft while it should be extendable to receive commands from\nEarth for later missions. Core design principle was to reuse as much existing\nsoftware and to stay as close as possible to existing robot software stacks at\nDLR. This allowed for a quick full operational start of the robot arm compared\nto a custom development of all robot software, a lower entry barrier for\nsoftware developers as well as a reuse of existing libraries. While not every\nline of code can be tested with this design, most of the software has already\nproven its functionality through daily execution on multiple robot systems.\n","authors":["Maximilian Mühlbauer","Maxime Chalon","Maximilian Ulmer","Alin Albu-Schäffer"],"pdf_url":"https://arxiv.org/pdf/2409.17562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02569v2","updated":"2024-09-26T05:57:37Z","published":"2024-04-03T08:42:36Z","title":"SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing","summary":" Cooking robots can enhance the home experience by reducing the burden of\ndaily chores. However, these robots must perform their tasks dexterously and\nsafely in shared human environments, especially when handling dangerous tools\nsuch as kitchen knives. This study focuses on enabling a robot to autonomously\nand safely learn food-cutting tasks. More specifically, our goal is to enable a\ncollaborative robot or industrial robot arm to perform food-slicing tasks by\nadapting to varying material properties using compliance control. Our approach\ninvolves using Reinforcement Learning (RL) to train a robot to compliantly\nmanipulate a knife, by reducing the contact forces exerted by the food items\nand by the cutting board. However, training the robot in the real world can be\ninefficient, and dangerous, and result in a lot of food waste. Therefore, we\nproposed SliceIt!, a framework for safely and efficiently learning robot\nfood-slicing tasks in simulation. Following a real2sim2real approach, our\nframework consists of collecting a few real food slicing data, calibrating our\ndual simulation environment (a high-fidelity cutting simulator and a robotic\nsimulator), learning compliant control policies on the calibrated simulation\nenvironment, and finally, deploying the policies on the real robot.\n","authors":["Cristian C. Beltran-Hernandez","Nicolas Erbetti","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2404.02569v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2406.14990v2","updated":"2024-09-26T05:51:20Z","published":"2024-06-21T09:03:37Z","title":"Learning Variable Compliance Control From a Few Demonstrations for\n Bimanual Robot with Haptic Feedback Teleoperation System","summary":" Automating dexterous, contact-rich manipulation tasks using rigid robots is a\nsignificant challenge in robotics. Rigid robots, defined by their actuation\nthrough position commands, face issues of excessive contact forces due to their\ninability to adapt to contact with the environment, potentially causing damage.\nWhile compliance control schemes have been introduced to mitigate these issues\nby controlling forces via external sensors, they are hampered by the need for\nfine-tuning task-specific controller parameters. Learning from Demonstrations\n(LfD) offers an intuitive alternative, allowing robots to learn manipulations\nthrough observed actions. In this work, we introduce a novel system to enhance\nthe teaching of dexterous, contact-rich manipulations to rigid robots. Our\nsystem is twofold: firstly, it incorporates a teleoperation interface utilizing\nVirtual Reality (VR) controllers, designed to provide an intuitive and\ncost-effective method for task demonstration with haptic feedback. Secondly, we\npresent Comp-ACT (Compliance Control via Action Chunking with Transformers), a\nmethod that leverages the demonstrations to learn variable compliance control\nfrom a few demonstrations. Our methods have been validated across various\ncomplex contact-rich manipulation tasks using single-arm and bimanual robot\nsetups in simulated and real-world environments, demonstrating the\neffectiveness of our system in teaching robots dexterous manipulations with\nenhanced adaptability and safety. Code available at:\nhttps://github.com/omron-sinicx/CompACT\n","authors":["Tatsuya Kamijo","Cristian C. Beltran-Hernandez","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2406.14990v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2409.17549v1","updated":"2024-09-26T05:37:52Z","published":"2024-09-26T05:37:52Z","title":"Canonical Representation and Force-Based Pretraining of 3D Tactile for\n Dexterous Visuo-Tactile Policy Learning","summary":" Tactile sensing plays a vital role in enabling robots to perform\nfine-grained, contact-rich tasks. However, the high dimensionality of tactile\ndata, due to the large coverage on dexterous hands, poses significant\nchallenges for effective tactile feature learning, especially for 3D tactile\ndata, as there are no large standardized datasets and no strong pretrained\nbackbones. To address these challenges, we propose a novel canonical\nrepresentation that reduces the difficulty of 3D tactile feature learning and\nfurther introduces a force-based self-supervised pretraining task to capture\nboth local and net force features, which are crucial for dexterous\nmanipulation. Our method achieves an average success rate of 78% across four\nfine-grained, contact-rich dexterous manipulation tasks in real-world\nexperiments, demonstrating effectiveness and robustness compared to other\nmethods. Further analysis shows that our method fully utilizes both spatial and\nforce information from 3D tactile data to accomplish the tasks. The videos can\nbe viewed at https://3dtacdex.github.io.\n","authors":["Tianhao Wu","Jinzhou Li","Jiyao Zhang","Mingdong Wu","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2409.17549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15780v3","updated":"2024-09-26T04:38:13Z","published":"2024-09-24T06:22:28Z","title":"A Learning Framework for Diverse Legged Robot Locomotion Using\n Barrier-Based Style Rewards","summary":" This work introduces a model-free reinforcement learning framework that\nenables various modes of motion (quadruped, tripod, or biped) and diverse tasks\nfor legged robot locomotion. We employ a motion-style reward based on a relaxed\nlogarithmic barrier function as a soft constraint, to bias the learning process\ntoward the desired motion style, such as gait, foot clearance, joint position,\nor body height. The predefined gait cycle is encoded in a flexible manner,\nfacilitating gait adjustments throughout the learning process. Extensive\nexperiments demonstrate that KAIST HOUND, a 45 kg robotic system, can achieve\nbiped, tripod, and quadruped locomotion using the proposed framework;\nquadrupedal capabilities include traversing uneven terrain, galloping at 4.67\nm/s, and overcoming obstacles up to 58 cm (67 cm for HOUND2); bipedal\ncapabilities include running at 3.6 m/s, carrying a 7.5 kg object, and\nascending stairs-all performed without exteroceptive input.\n","authors":["Gijeong Kim","Yong-Hoon Lee","Hae-Won Park"],"pdf_url":"https://arxiv.org/pdf/2409.15780v3.pdf","comment":"7 pages, 5 figures, Videos at https://youtu.be/JV2_HfTlOKI"},{"id":"http://arxiv.org/abs/2409.17519v1","updated":"2024-09-26T04:02:20Z","published":"2024-09-26T04:02:20Z","title":"Robotic Environmental State Recognition with Pre-Trained Vision-Language\n Models and Black-Box Optimization","summary":" In order for robots to autonomously navigate and operate in diverse\nenvironments, it is essential for them to recognize the state of their\nenvironment. On the other hand, the environmental state recognition has\ntraditionally involved distinct methods tailored to each state to be\nrecognized. In this study, we perform a unified environmental state recognition\nfor robots through the spoken language with pre-trained large-scale\nvision-language models. We apply Visual Question Answering and Image-to-Text\nRetrieval, which are tasks of Vision-Language Models. We show that with our\nmethod, it is possible to recognize not only whether a room door is\nopen/closed, but also whether a transparent door is open/closed and whether\nwater is running in a sink, without training neural networks or manual\nprogramming. In addition, the recognition accuracy can be improved by selecting\nappropriate texts from the set of prepared texts based on black-box\noptimization. For each state recognition, only the text set and its weighting\nneed to be changed, eliminating the need to prepare multiple different models\nand programs, and facilitating the management of source code and computer\nresource. We experimentally demonstrate the effectiveness of our method and\napply it to the recognition behavior on a mobile robot, Fetch.\n","authors":["Kento Kawaharazuka","Yoshiki Obinata","Naoaki Kanazawa","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2409.17519v1.pdf","comment":"Accepted at Advanced Robotics, website -\n https://haraduka.github.io/vlm-bbo/"},{"id":"http://arxiv.org/abs/2409.17497v1","updated":"2024-09-26T03:11:41Z","published":"2024-09-26T03:11:41Z","title":"Precise Interception Flight Targets by Image-based Visual Servoing of\n Multicopter","summary":" Interception of low-altitude intruding targets with low-cost drones equipped\nstrapdown camera presents a competitive option. However, the malicious\nmaneuvers by the non-cooperative target and the coupling of the camera make the\ntask challenging. To solve this problem, an Image-Based Visual Servoing (IBVS)\ncontrol algorithm based on proportional navigation guidance with field-of-view\nholding capability is designed. The proposed controller reduces the miss\ndistance while improving the stability of the visual servo system during\ninterception. Software-in-the-loop (SITL) simulation experiments show a 72.8%\nreduction in the circular error probability (CEP) compared to the most recent\nstudy. This improvement enhances interception accuracy from the decimeter to\nthe centimeter level. Real-world experiments further validate the effectiveness\nof the proposed algorithm.\n","authors":["Hailong Yan","Kun Yang","Yixiao Cheng","Zihao Wang","Dawei Li"],"pdf_url":"https://arxiv.org/pdf/2409.17497v1.pdf","comment":"9 pages, 15 figures, In the process of being submitted to the Journal\n of IEEE Transactions on Industrial Electronics"},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2409.17479v1","updated":"2024-09-26T02:30:17Z","published":"2024-09-26T02:30:17Z","title":"Traverse the Non-Traversable: Estimating Traversability for Wheeled\n Mobility on Vertically Challenging Terrain","summary":" Most traversability estimation techniques divide off-road terrain into\ntraversable (e.g., pavement, gravel, and grass) and non-traversable (e.g.,\nboulders, vegetation, and ditches) regions and then inform subsequent planners\nto produce trajectories on the traversable part. However, recent research\ndemonstrated that wheeled robots can traverse vertically challenging terrain\n(e.g., extremely rugged boulders comparable in size to the vehicles\nthemselves), which unfortunately would be deemed as non-traversable by existing\ntechniques. Motivated by such limitations, this work aims at identifying the\ntraversable from the seemingly non-traversable, vertically challenging terrain\nbased on past kinodynamic vehicle-terrain interactions in a data-driven manner.\nOur new Traverse the Non-Traversable(TNT) traversability estimator can\nefficiently guide a down-stream sampling-based planner containing a\nhigh-precision 6-DoF kinodynamic model, which becomes deployable onboard a\nsmall-scale vehicle. Additionally, the estimated traversability can also be\nused as a costmap to plan global and local paths without sampling. Our\nexperiment results show that TNT can improve planning performance, efficiency,\nand stability by 50%, 26.7%, and 9.2% respectively on a physical robot\nplatform.\n","authors":["Chenhui Pan","Aniket Datar","Anuj Pokhrel","Matthew Choulas","Mohammad Nazeri","Xuesu Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.17479v1.pdf","comment":"for associated video file, see\n https://www.youtube.com/watch?v=Shcalb8sGcA"},{"id":"http://arxiv.org/abs/2409.17470v1","updated":"2024-09-26T02:14:50Z","published":"2024-09-26T02:14:50Z","title":"Tactile Probabilistic Contact Dynamics Estimation of Unknown Objects","summary":" We study the problem of rapidly identifying contact dynamics of unknown\nobjects in partially known environments. The key innovation of our method is a\nnovel formulation of the contact dynamics estimation problem as the joint\nestimation of contact geometries and physical parameters. We leverage DeepSDF,\na compact and expressive neural-network-based geometry representation over a\ndistribution of geometries, and adopt a particle filter to estimate both the\ngeometries in contact and the physical parameters. In addition, we couple the\nestimator with an active exploration strategy that plans information-gathering\nmoves to further expedite online estimation. Through simulation and physical\nexperiments, we show that our method estimates accurate contact dynamics with\nfewer than 30 exploration moves for unknown objects touching partially known\nenvironments.\n","authors":["Jinhoo Kim","Yifan Zhu","Aaron Dollar"],"pdf_url":"https://arxiv.org/pdf/2409.17470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17469v1","updated":"2024-09-26T02:02:58Z","published":"2024-09-26T02:02:58Z","title":"Verti-Selector: Automatic Curriculum Learning for Wheeled Mobility on\n Vertically Challenging Terrain","summary":" Reinforcement Learning (RL) has the potential to enable extreme off-road\nmobility by circumventing complex kinodynamic modeling, planning, and control\nby simulated end-to-end trial-and-error learning experiences. However, most RL\nmethods are sample-inefficient when training in a large amount of manually\ndesigned simulation environments and struggle at generalizing to the real\nworld. To address these issues, we introduce Verti-Selector (VS), an automatic\ncurriculum learning framework designed to enhance learning efficiency and\ngeneralization by selectively sampling training terrain. VS prioritizes\nvertically challenging terrain with higher Temporal Difference (TD) errors when\nrevisited, thereby allowing robots to learn at the edge of their evolving\ncapabilities. By dynamically adjusting the sampling focus, VS significantly\nboosts sample efficiency and generalization within the VW-Chrono simulator\nbuilt on the Chrono multi-physics engine. Furthermore, we provide simulation\nand physical results using VS on a Verti-4-Wheeler platform. These results\ndemonstrate that VS can achieve 23.08% improvement in terms of success rate by\nefficiently sampling during training and robustly generalizing to the real\nworld.\n","authors":["Tong Xu","Chenhui Pan","Xuesu Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.17469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16162v3","updated":"2024-09-26T00:47:25Z","published":"2024-07-23T04:06:23Z","title":"Plant Robots: Harnessing Growth Actuation of Plants for Locomotion and\n Object Manipulation","summary":" Plants display physical displacements during their growth due to\nphotosynthesis, which converts light into chemical energy. This can be\ninterpreted as plants acting as actuators with a built-in power source. This\npaper presents a method to create plant robots that move and perform tasks by\nharnessing the actuation output of plants: displacement and force generated\nfrom the growing process. As the target plant, radish sprouts are employed, and\ntheir displacement and force are characterized, followed by the calculation of\npower and energy densities. Based on the characterization, two different plant\nrobots are designed and fabricated: a rotational robot and a gripper. The\nformer demonstrates ground locomotion, achieving a travel distance of 14.6 mm\nwith an average speed of 0.8 mm/h. The latter demonstrates the picking and\nplacing of an object with a 0.1-g mass by the light-controlled open-close\nmotion of plant fingers. A good agreement between the experimental and model\nvalues is observed in the specific data of the mobile robot, suggesting that\nobtaining the actuation characteristics of plants can enable the design and\nprediction of behavior in plant robots. These results pave the way for the\nrealization of novel types of environmentally friendly and sustainable robots.\n","authors":["Kazuya Murakami","Misao Sato","Momoki Kubota","Jun Shintake"],"pdf_url":"https://arxiv.org/pdf/2407.16162v3.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17443v1","updated":"2024-09-26T00:32:56Z","published":"2024-09-26T00:32:56Z","title":"Cat-and-Mouse Satellite Dynamics: Divergent Adversarial Reinforcement\n Learning for Contested Multi-Agent Space Operations","summary":" As space becomes increasingly crowded and contested, robust autonomous\ncapabilities for multi-agent environments are gaining critical importance.\nCurrent autonomous systems in space primarily rely on optimization-based path\nplanning or long-range orbital maneuvers, which have not yet proven effective\nin adversarial scenarios where one satellite is actively pursuing another. We\nintroduce Divergent Adversarial Reinforcement Learning (DARL), a two-stage\nMulti-Agent Reinforcement Learning (MARL) approach designed to train autonomous\nevasion strategies for satellites engaged with multiple adversarial spacecraft.\nOur method enhances exploration during training by promoting diverse\nadversarial strategies, leading to more robust and adaptable evader models. We\nvalidate DARL through a cat-and-mouse satellite scenario, modeled as a\npartially observable multi-agent capture the flag game where two adversarial\n`cat' spacecraft pursue a single `mouse' evader. DARL's performance is compared\nagainst several benchmarks, including an optimization-based satellite path\nplanner, demonstrating its ability to produce highly robust models for\nadversarial multi-agent space environments.\n","authors":["Cameron Mehlman","Joseph Abramov","Gregory Falco"],"pdf_url":"https://arxiv.org/pdf/2409.17443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17435v1","updated":"2024-09-26T00:05:36Z","published":"2024-09-26T00:05:36Z","title":"Active Vision Might Be All You Need: Exploring Active Vision in Bimanual\n Robotic Manipulation","summary":" Imitation learning has demonstrated significant potential in performing\nhigh-precision manipulation tasks using visual feedback from cameras. However,\nit is common practice in imitation learning for cameras to be fixed in place,\nresulting in issues like occlusion and limited field of view. Furthermore,\ncameras are often placed in broad, general locations, without an effective\nviewpoint specific to the robot's task. In this work, we investigate the\nutility of active vision (AV) for imitation learning and manipulation, in\nwhich, in addition to the manipulation policy, the robot learns an AV policy\nfrom human demonstrations to dynamically change the robot's camera viewpoint to\nobtain better information about its environment and the given task. We\nintroduce AV-ALOHA, a new bimanual teleoperation robot system with AV, an\nextension of the ALOHA 2 robot system, incorporating an additional 7-DoF robot\narm that only carries a stereo camera and is solely tasked with finding the\nbest viewpoint. This camera streams stereo video to an operator wearing a\nvirtual reality (VR) headset, allowing the operator to control the camera pose\nusing head and body movements. The system provides an immersive teleoperation\nexperience, with bimanual first-person control, enabling the operator to\ndynamically explore and search the scene and simultaneously interact with the\nenvironment. We conduct imitation learning experiments of our system both in\nreal-world and in simulation, across a variety of tasks that emphasize\nviewpoint planning. Our results demonstrate the effectiveness of human-guided\nAV for imitation learning, showing significant improvements over fixed cameras\nin tasks with limited visibility. Project website:\nhttps://soltanilara.github.io/av-aloha/\n","authors":["Ian Chuang","Andrew Lee","Dechen Gao","Iman Soltani"],"pdf_url":"https://arxiv.org/pdf/2409.17435v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18347v1","updated":"2024-09-26T23:53:23Z","published":"2024-09-26T23:53:23Z","title":"Progress Towards Submersible Microrobots: A Novel 13-mg Low-Power\n SMA-Based Actuator for Underwater Propulsion","summary":" We introduce a new low-power 13-mg microactuator driven by shape-memory alloy\n(SMA) wires for underwater operation. The development of this device was\nmotivated by the recent creation of microswimmers such as the FRISHBot,\nWaterStrider, VLEIBot, VLEIBot+, and VLEIBot++. The first four of these robots,\nranging from 30 to 90 mg, function tethered to an electrical power supply while\nthe last platform is an 810-mg fully autonomous system. These five robots are\ndriven by dry SMA-based microactuators first developed for microrobotic\ncrawlers such as the SMALLBug and SMARTI. As shown in this abstract, dry\nSMA-based actuators do not operate efficiently under water due to high\nheat-transfer rates in this medium; for example, the actuators that drive the\nVLEIBot++ require about 40 mW of average power at 1 Hz in dry air while\nrequiring about 900 mW of average power at 1 Hz in water. In contrast, the\nmicroactuator presented in this abstract consumes about 150 mW of average power\nat 1 Hz in both dry air and water; additionally, it can be excited directly\nusing an onboard battery through simple power electronics implemented on a\ncustom-built printed circuit board (PCB). This technological breakthrough was\nenabled by the integration of a soft structure that encapsulates the SMA wires\nthat drive the actuator in order to passively control the rates of heat\ntransfer. The results presented here represent preliminary, yet compelling,\nexperimental evidence that the proposed actuation approach will enable the\ndevelopment of fully autonomous and controllable submersible microswimmers. To\naccomplish this objective, we will evolve the current version of the VLEIBot++\nand introduce new bioinspired underwater propulsion mechanisms.\n","authors":["Cody R. Longwell","Conor K. Trygstad","Francisco M. F. R. Goncalves","Ke Xu","Nestor O. Perez-Arancibia"],"pdf_url":"https://arxiv.org/pdf/2409.18347v1.pdf","comment":"Presented at 40th Anniversary of the IEEE International Conference on\n Robotics and Automation (ICRA@40)"},{"id":"http://arxiv.org/abs/2409.18327v1","updated":"2024-09-26T22:45:24Z","published":"2024-09-26T22:45:24Z","title":"Accelerated gradient descent for high frequency Model Predictive Control","summary":" The recent promises of Model Predictive Control in robotics have motivated\nthe development of tailored second-order methods to solve optimal control\nproblems efficiently. While those methods benefit from strong convergence\nproperties, tailored efficient implementations are challenging to derive. In\nthis work, we study the potential effectiveness of first-order methods and show\non a torque controlled manipulator that they can equal the performances of\nsecond-order methods.\n","authors":["Jianghan Zhang","Armand Jordana","Ludovic Righetti"],"pdf_url":"https://arxiv.org/pdf/2409.18327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18313v1","updated":"2024-09-26T21:44:11Z","published":"2024-09-26T21:44:11Z","title":"Embodied-RAG: General non-parametric Embodied Memory for Retrieval and\n Generation","summary":" There is no limit to how much a robot might explore and learn, but all of\nthat knowledge needs to be searchable and actionable. Within language research,\nretrieval augmented generation (RAG) has become the workhouse of large-scale\nnon-parametric knowledge, however existing techniques do not directly transfer\nto the embodied domain, which is multimodal, data is highly correlated, and\nperception requires abstraction.\n To address these challenges, we introduce Embodied-RAG, a framework that\nenhances the foundational model of an embodied agent with a non-parametric\nmemory system capable of autonomously constructing hierarchical knowledge for\nboth navigation and language generation. Embodied-RAG handles a full range of\nspatial and semantic resolutions across diverse environments and query types,\nwhether for a specific object or a holistic description of ambiance. At its\ncore, Embodied-RAG's memory is structured as a semantic forest, storing\nlanguage descriptions at varying levels of detail. This hierarchical\norganization allows the system to efficiently generate context-sensitive\noutputs across different robotic platforms. We demonstrate that Embodied-RAG\neffectively bridges RAG to the robotics domain, successfully handling over 200\nexplanation and navigation queries across 19 environments, highlighting its\npromise for general-purpose non-parametric system for embodied agents.\n","authors":["Quanting Xie","So Yeon Min","Tianyi Zhang","Aarav Bajaj","Ruslan Salakhutdinov","Matthew Johnson-Roberson","Yonatan Bisk"],"pdf_url":"https://arxiv.org/pdf/2409.18313v1.pdf","comment":"Web: https://quanting-xie.github.io/Embodied-RAG-web/"},{"id":"http://arxiv.org/abs/2406.02722v2","updated":"2024-09-26T21:26:15Z","published":"2024-06-04T19:06:31Z","title":"Model Predictive Control for Magnetically-Actuated Cellbots","summary":" This paper presents a control framework for magnetically actuated cellbots,\nwhich combines Model Predictive Control (MPC) with Gaussian Processes (GPs) as\na disturbance estimator for precise trajectory tracking. To address the\nchallenges posed by unmodeled dynamics, we integrate data-driven modeling with\nmodel-based control to accurately track desired trajectories using relatively\nsmall data. To the best of our knowledge, this is the first work to integrate\ndata-driven modeling with model-based control for the magnetic actuation of\ncellbots. The GP effectively learns and predicts unmodeled disturbances,\nproviding uncertainty bounds as well. We validate our method through\nexperiments with cellbots, demonstrating improved trajectory tracking accuracy.\n","authors":["Mehdi Kermanshah","Logan E. Beaver","Max Sokolich","Fatma Ceren Kirmizitas","Sambeeta Das","Roberto Tron","Ron Weiss","Calin Belta"],"pdf_url":"https://arxiv.org/pdf/2406.02722v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18300v1","updated":"2024-09-26T21:15:22Z","published":"2024-09-26T21:15:22Z","title":"SOAR: Self-supervision Optimized UAV Action Recognition with Efficient\n Object-Aware Pretraining","summary":" We introduce SOAR, a novel Self-supervised pretraining algorithm for aerial\nfootage captured by Unmanned Aerial Vehicles (UAVs). We incorporate human\nobject knowledge throughout the pretraining process to enhance UAV video\npretraining efficiency and downstream action recognition performance. This is\nin contrast to prior works that primarily incorporate object information during\nthe fine-tuning stage. Specifically, we first propose a novel object-aware\nmasking strategy designed to retain the visibility of certain patches related\nto objects throughout the pretraining phase. Second, we introduce an\nobject-aware loss function that utilizes object information to adjust the\nreconstruction loss, preventing bias towards less informative background\npatches. In practice, SOAR with a vanilla ViT backbone, outperforms best UAV\naction recognition models, recording a 9.7% and 21.4% boost in top-1 accuracy\non the NEC-Drone and UAV-Human datasets, while delivering an inference speed of\n18.7ms per video, making it 2x to 5x faster. Additionally, SOAR obtains\ncomparable accuracy to prior self-supervised learning (SSL) methods while\nrequiring 87.5% less pretraining time and 25% less memory usage\n","authors":["Ruiqi Xian","Xiyang Wu","Tianrui Guan","Xijun Wang","Boqing Gong","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2409.18300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18297v1","updated":"2024-09-26T21:10:17Z","published":"2024-09-26T21:10:17Z","title":"Flat'n'Fold: A Diverse Multi-Modal Dataset for Garment Perception and\n Manipulation","summary":" We present Flat'n'Fold, a novel large-scale dataset for garment manipulation\nthat addresses critical gaps in existing datasets. Comprising 1,212 human and\n887 robot demonstrations of flattening and folding 44 unique garments across 8\ncategories, Flat'n'Fold surpasses prior datasets in size, scope, and diversity.\nOur dataset uniquely captures the entire manipulation process from crumpled to\nfolded states, providing synchronized multi-view RGB-D images, point clouds,\nand action data, including hand or gripper positions and rotations. We quantify\nthe dataset's diversity and complexity compared to existing benchmarks and show\nthat our dataset features natural and diverse manipulations of real-world\ndemonstrations of human and robot demonstrations in terms of visual and action\ninformation. To showcase Flat'n'Fold's utility, we establish new benchmarks for\ngrasping point prediction and subtask decomposition. Our evaluation of\nstate-of-the-art models on these tasks reveals significant room for\nimprovement. This underscores Flat'n'Fold's potential to drive advances in\nrobotic perception and manipulation of deformable objects. Our dataset can be\ndownloaded at https://cvas-ug.github.io/flat-n-fold\n","authors":["Lipeng Zhuang","Shiyu Fan","Yingdong Ru","Florent Audonnet","Paul Henderson","Gerardo Aragon-Camarasa"],"pdf_url":"https://arxiv.org/pdf/2409.18297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18293v1","updated":"2024-09-26T21:03:02Z","published":"2024-09-26T21:03:02Z","title":"Towards Safe and Efficient Through-the-Canopy Autonomous Fruit Counting\n with UAVs","summary":" We present an autonomous aerial system for safe and efficient\nthrough-the-canopy fruit counting. Aerial robot applications in large-scale\norchards face significant challenges due to the complexity of fine-tuning\nflight paths based on orchard layouts, canopy density, and plant variability.\nThrough-the-canopy navigation is crucial for minimizing occlusion by leaves and\nbranches but is more challenging due to the complex and dense environment\ncompared to traditional over-the-canopy flights. Our system addresses these\nchallenges by integrating: i) a high-fidelity simulation framework for\noptimizing flight trajectories, ii) a low-cost autonomy stack for canopy-level\nnavigation and data collection, and iii) a robust workflow for fruit detection\nand counting using RGB images. We validate our approach through fruit counting\nwith canopy-level aerial images and by demonstrating the autonomous navigation\ncapabilities of our experimental vehicle.\n","authors":["Teaya Yang","Roman Ibrahimov","Mark W. Mueller"],"pdf_url":"https://arxiv.org/pdf/2409.18293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13696v4","updated":"2024-09-26T20:34:51Z","published":"2024-04-21T15:50:40Z","title":"Clio: Real-time Task-Driven Open-Set 3D Scene Graphs","summary":" Modern tools for class-agnostic image segmentation (e.g., SegmentAnything)\nand open-set semantic understanding (e.g., CLIP) provide unprecedented\nopportunities for robot perception and mapping. While traditional closed-set\nmetric-semantic maps were restricted to tens or hundreds of semantic classes,\nwe can now build maps with a plethora of objects and countless semantic\nvariations. This leaves us with a fundamental question: what is the right\ngranularity for the objects (and, more generally, for the semantic concepts)\nthe robot has to include in its map representation? While related work\nimplicitly chooses a level of granularity by tuning thresholds for object\ndetection, we argue that such a choice is intrinsically task-dependent. The\nfirst contribution of this paper is to propose a task-driven 3D scene\nunderstanding problem, where the robot is given a list of tasks in natural\nlanguage and has to select the granularity and the subset of objects and scene\nstructure to retain in its map that is sufficient to complete the tasks. We\nshow that this problem can be naturally formulated using the Information\nBottleneck (IB), an established information-theoretic framework. The second\ncontribution is an algorithm for task-driven 3D scene understanding based on an\nAgglomerative IB approach, that is able to cluster 3D primitives in the\nenvironment into task-relevant objects and regions and executes incrementally.\nThe third contribution is to integrate our task-driven clustering algorithm\ninto a real-time pipeline, named Clio, that constructs a hierarchical 3D scene\ngraph of the environment online using only onboard compute, as the robot\nexplores it. Our final contribution is an extensive experimental campaign\nshowing that Clio not only allows real-time construction of compact open-set 3D\nscene graphs, but also improves the accuracy of task execution by limiting the\nmap to relevant semantic concepts.\n","authors":["Dominic Maggio","Yun Chang","Nathan Hughes","Matthew Trang","Dan Griffith","Carlyn Dougherty","Eric Cristofalo","Lukas Schmid","Luca Carlone"],"pdf_url":"https://arxiv.org/pdf/2404.13696v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18273v1","updated":"2024-09-26T20:34:39Z","published":"2024-09-26T20:34:39Z","title":"Autonomous Excavation of Challenging Terrain using Oscillatory\n Primitives and Adaptive Impedance Control","summary":" This paper addresses the challenge of autonomous excavation of challenging\nterrains, in particular those that are prone to jamming and inter-particle\nadhesion when tackled by a standard penetrate-drag-scoop motion pattern.\nInspired by human excavation strategies, our approach incorporates oscillatory\nrotation elements -- including swivel, twist, and dive motions -- to break up\ncompacted, tangled grains and reduce jamming. We also present an adaptive\nimpedance control method, the Reactive Attractor Impedance Controller (RAIC),\nthat adapts a motion trajectory to unexpected forces during loading in a manner\nthat tracks a trajectory closely when loads are low, but avoids excessive loads\nwhen significant resistance is met. Our method is evaluated on four terrains\nusing a robotic arm, demonstrating improved excavation performance across\nmultiple metrics, including volume scooped, protective stop rate, and\ntrajectory completion percentage.\n","authors":["Noah Franceschini","Pranay Thangeda","Melkior Ornik","Kris Hauser"],"pdf_url":"https://arxiv.org/pdf/2409.18273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04378v4","updated":"2024-09-26T20:32:55Z","published":"2024-05-07T15:00:19Z","title":"Splat-MOVER: Multi-Stage, Open-Vocabulary Robotic Manipulation via\n Editable Gaussian Splatting","summary":" We present Splat-MOVER, a modular robotics stack for open-vocabulary robotic\nmanipulation, which leverages the editability of Gaussian Splatting (GSplat)\nscene representations to enable multi-stage manipulation tasks. Splat-MOVER\nconsists of: (i) ASK-Splat, a GSplat representation that distills semantic and\ngrasp affordance features into the 3D scene. ASK-Splat enables geometric,\nsemantic, and affordance understanding of 3D scenes, which is critical in many\nrobotics tasks; (ii) SEE-Splat, a real-time scene-editing module using 3D\nsemantic masking and infilling to visualize the motions of objects that result\nfrom robot interactions in the real-world. SEE-Splat creates a \"digital twin\"\nof the evolving environment throughout the manipulation task; and (iii)\nGrasp-Splat, a grasp generation module that uses ASK-Splat and SEE-Splat to\npropose affordance-aligned candidate grasps for open-world objects. ASK-Splat\nis trained in real-time from RGB images in a brief scanning phase prior to\noperation, while SEE-Splat and Grasp-Splat run in real-time during operation.\nWe demonstrate the superior performance of Splat-MOVER in hardware experiments\non a Kinova robot compared to two recent baselines in four single-stage,\nopen-vocabulary manipulation tasks and in four multi-stage manipulation tasks,\nusing the edited scene to reflect changes due to prior manipulation stages,\nwhich is not possible with existing baselines. Video demonstrations and the\ncode for the project are available at https://splatmover.github.io.\n","authors":["Ola Shorinwa","Johnathan Tucker","Aliyah Smith","Aiden Swann","Timothy Chen","Roya Firoozi","Monroe Kennedy III","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2405.04378v4.pdf","comment":"https://splatmover.github.io"},{"id":"http://arxiv.org/abs/2409.18253v1","updated":"2024-09-26T19:54:24Z","published":"2024-09-26T19:54:24Z","title":"UAV-Assisted Self-Supervised Terrain Awareness for Off-Road Navigation","summary":" Terrain awareness is an essential milestone to enable truly autonomous\noff-road navigation. Accurately predicting terrain characteristics allows\noptimizing a vehicle's path against potential hazards. Recent methods use deep\nneural networks to predict traversability-related terrain properties in a\nself-supervised manner, relying on proprioception as a training signal.\nHowever, onboard cameras are inherently limited by their point-of-view relative\nto the ground, suffering from occlusions and vanishing pixel density with\ndistance. This paper introduces a novel approach for self-supervised terrain\ncharacterization using an aerial perspective from a hovering drone. We capture\nterrain-aligned images while sampling the environment with a ground vehicle,\neffectively training a simple predictor for vibrations, bumpiness, and energy\nconsumption. Our dataset includes 2.8 km of off-road data collected in forest\nenvironment, comprising 13 484 ground-based images and 12 935 aerial images.\nOur findings show that drone imagery improves terrain property prediction by\n21.37 % on the whole dataset and 37.35 % in high vegetation, compared to ground\nrobot images. We conduct ablation studies to identify the main causes of these\nperformance improvements. We also demonstrate the real-world applicability of\nour approach by scouting an unseen area with a drone, planning and executing an\noptimized path on the ground.\n","authors":["Jean-Michel Fortin","Olivier Gamache","William Fecteau","Effie Daum","William Larrivée-Hardy","François Pomerleau","Philippe Giguère"],"pdf_url":"https://arxiv.org/pdf/2409.18253v1.pdf","comment":"7 pages, 5 figures, submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2409.18231v1","updated":"2024-09-26T19:21:06Z","published":"2024-09-26T19:21:06Z","title":"ReloPush: Multi-object Rearrangement in Confined Spaces with a\n Nonholonomic Mobile Robot Pusher","summary":" We focus on the problem of rearranging a set of objects within a confined\nspace with a nonholonomically constrained mobile robot pusher. This problem is\nrelevant to many real-world domains, including warehouse automation and\nconstruction. These domains give rise to instances involving a combination of\ngeometric, kinematic, and physics constraints, which make planning particularly\nchallenging. Prior work often makes simplifying assumptions like the use of\nholonomic mobile robots or dexterous manipulators capable of unconstrained\noverhand reaching. Our key insight is we can empower even a constrained mobile\npusher to tackle complex rearrangement tasks by enabling it to modify the\nenvironment to its favor in a constraint-aware fashion. To this end, we\ndescribe a Push-Traversability graph, whose vertices represent poses that the\npusher can push objects from and edges represent optimal, kinematically\nfeasible, and stable push-rearrangements of objects. Based on this graph, we\ndevelop ReloPush, a planning framework that leverages Dubins curves and\nstandard graph search techniques to generate an efficient sequence of object\nrearrangements to be executed by the pusher. We evaluate ReloPush across a\nseries of challenging scenarios, involving the rearrangement of densely\ncluttered workspaces with up to eight objects by a 1tenth mobile robot pusher.\nReloPush exhibits orders of magnitude faster runtimes and significantly more\nrobust execution in the real world, evidenced in lower execution times and\nfewer losses of object contact, compared to two baselines lacking our proposed\ngraph structure.\n","authors":["Jeeho Ahn","Christoforos Mavrogiannis"],"pdf_url":"https://arxiv.org/pdf/2409.18231v1.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2409.14262v2","updated":"2024-09-26T19:08:40Z","published":"2024-09-21T23:06:14Z","title":"GND: Global Navigation Dataset with Multi-Modal Perception and\n Multi-Category Traversability in Outdoor Campus Environments","summary":" Navigating large-scale outdoor environments requires complex reasoning in\nterms of geometric structures, environmental semantics, and terrain\ncharacteristics, which are typically captured by onboard sensors such as LiDAR\nand cameras. While current mobile robots can navigate such environments using\npre-defined, high-precision maps based on hand-crafted rules catered for the\nspecific environment, they lack commonsense reasoning capabilities that most\nhumans possess when navigating unknown outdoor spaces. To address this gap, we\nintroduce the Global Navigation Dataset (GND), a large-scale dataset that\nintegrates multi-modal sensory data, including 3D LiDAR point clouds and RGB\nand 360-degree images, as well as multi-category traversability maps\n(pedestrian walkways, vehicle roadways, stairs, off-road terrain, and\nobstacles) from ten university campuses. These environments encompass a variety\nof parks, urban settings, elevation changes, and campus layouts of different\nscales. The dataset covers approximately 2.7km2 and includes at least 350\nbuildings in total. We also present a set of novel applications of GND to\nshowcase its utility to enable global robot navigation, such as map-based\nglobal navigation, mapless navigation, and global place recognition.\n","authors":["Jing Liang","Dibyendu Das","Daeun Song","Md Nahid Hasan Shuvo","Mohammad Durrani","Karthik Taranath","Ivan Penskiy","Dinesh Manocha","Xuesu Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.14262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18218v1","updated":"2024-09-26T18:55:38Z","published":"2024-09-26T18:55:38Z","title":"Learning to Drive via Asymmetric Self-Play","summary":" Large-scale data is crucial for learning realistic and capable driving\npolicies. However, it can be impractical to rely on scaling datasets with real\ndata alone. The majority of driving data is uninteresting, and deliberately\ncollecting new long-tail scenarios is expensive and unsafe. We propose\nasymmetric self-play to scale beyond real data with additional challenging,\nsolvable, and realistic synthetic scenarios. Our approach pairs a teacher that\nlearns to generate scenarios it can solve but the student cannot, with a\nstudent that learns to solve them. When applied to traffic simulation, we learn\nrealistic policies with significantly fewer collisions in both nominal and\nlong-tail scenarios. Our policies further zero-shot transfer to generate\ntraining data for end-to-end autonomy, significantly outperforming\nstate-of-the-art adversarial approaches, or using real data alone. For more\ninformation, visit https://waabi.ai/selfplay .\n","authors":["Chris Zhang","Sourav Biswas","Kelvin Wong","Kion Fallah","Lunjun Zhang","Dian Chen","Sergio Casas","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2409.18218v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2403.11898v2","updated":"2024-09-26T18:47:04Z","published":"2024-03-18T15:56:44Z","title":"VITaL Pretraining: Visuo-Tactile Pretraining for Tactile and Non-Tactile\n Manipulation Policies","summary":" Tactile information is a critical tool for dexterous manipulation. As humans,\nwe rely heavily on tactile information to understand objects in our\nenvironments and how to interact with them. We use touch not only to perform\nmanipulation tasks but also to learn how to perform these tasks. Therefore, to\ncreate robotic agents that can learn to complete manipulation tasks at a human\nor super-human level of performance, we need to properly incorporate tactile\ninformation into both skill execution and skill learning. In this paper, we\ninvestigate how we can incorporate tactile information into imitation learning\nplatforms to improve performance on manipulation tasks. We show that\nincorporating visuo-tactile pretraining improves imitation learning\nperformance, not only for tactile agents (policies that use tactile information\nat inference), but also for non-tactile agents (policies that do not use\ntactile information at inference). For these non-tactile agents, pretraining\nwith tactile information significantly improved performance (for example,\nimproving the accuracy on USB plugging from 20% to 85%), reaching a level on\npar with visuo-tactile agents, and even surpassing them in some cases. For\ndemonstration videos and access to our codebase, see the project website:\nhttps://sites.google.com/andrew.cmu.edu/visuo-tactile-pretraining\n","authors":["Abraham George","Selam Gano","Pranav Katragadda","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2403.11898v2.pdf","comment":"7 pages, 6 figures, submitted to ICRA 2025. Prior version named\n \"Visuo-Tactile Pretraining for Cable Plugging\""},{"id":"http://arxiv.org/abs/2311.12015v4","updated":"2024-09-26T18:35:52Z","published":"2023-11-20T18:54:39Z","title":"GPT-4V(ision) for Robotics: Multimodal Task Planning from Human\n Demonstration","summary":" We introduce a pipeline that enhances a general-purpose Vision Language\nModel, GPT-4V(ision), to facilitate one-shot visual teaching for robotic\nmanipulation. This system analyzes videos of humans performing tasks and\noutputs executable robot programs that incorporate insights into affordances.\nThe process begins with GPT-4V analyzing the videos to obtain textual\nexplanations of environmental and action details. A GPT-4-based task planner\nthen encodes these details into a symbolic task plan. Subsequently, vision\nsystems spatially and temporally ground the task plan in the videos. Objects\nare identified using an open-vocabulary object detector, and hand-object\ninteractions are analyzed to pinpoint moments of grasping and releasing. This\nspatiotemporal grounding allows for the gathering of affordance information\n(e.g., grasp types, waypoints, and body postures) critical for robot execution.\nExperiments across various scenarios demonstrate the method's efficacy in\nenabling real robots to operate from one-shot human demonstrations. Meanwhile,\nquantitative tests have revealed instances of hallucination in GPT-4V,\nhighlighting the importance of incorporating human supervision within the\npipeline. The prompts of GPT-4V/GPT-4 are available at this project page:\nhttps://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.12015v4.pdf","comment":"8 pages, 10 figures, 3 tables. Published in IEEE Robotics and\n Automation Letters (RA-L) (in press). Last updated on September 26th, 2024"},{"id":"http://arxiv.org/abs/2409.18737v1","updated":"2024-09-26T03:16:39Z","published":"2024-09-26T03:16:39Z","title":"MemFusionMap: Working Memory Fusion for Online Vectorized HD Map\n Construction","summary":" High-definition (HD) maps provide environmental information for autonomous\ndriving systems and are essential for safe planning. While existing methods\nwith single-frame input achieve impressive performance for online vectorized HD\nmap construction, they still struggle with complex scenarios and occlusions. We\npropose MemFusionMap, a novel temporal fusion model with enhanced temporal\nreasoning capabilities for online HD map construction. Specifically, we\ncontribute a working memory fusion module that improves the model's memory\ncapacity to reason across history frames. We also design a novel temporal\noverlap heatmap to explicitly inform the model about the temporal overlap\ninformation and vehicle trajectory in the Bird's Eye View space. By integrating\nthese two designs, MemFusionMap significantly outperforms existing methods\nwhile also maintaining a versatile design for scalability. We conduct extensive\nevaluation on open-source benchmarks and demonstrate a maximum improvement of\n5.4% in mAP over state-of-the-art methods. The code for MemFusionMap will be\nmade open-source upon publication of this paper.\n","authors":["Jingyu Song","Xudong Chen","Liupei Lu","Jie Li","Katherine A. Skinner"],"pdf_url":"https://arxiv.org/pdf/2409.18737v1.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2409.18097v1","updated":"2024-09-26T17:41:04Z","published":"2024-09-26T17:41:04Z","title":"A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale\n Autonomous Vehicle","summary":" In recent years, several competitions have highlighted the need to\ninvestigate vision-based solutions to address scenarios with functional\ninsufficiencies in perception, world modeling and localization. This article\npresents the Vision-based Lane Keeping System (VbLKS) developed by the\nDEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022.\nThe main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied\nVbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a\ntailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading\nError (LHE), is estimated at a constant lookahead distance employing a\nConvolutional Neural Network (CNN). A training strategy for a compact CNN is\nproposed, emphasizing data generation and augmentation on simulated camera\nimages from a 3D Gazebo simulator, and enabling real-time operation on\nlow-level hardware. A tailored PP-based lateral controller equipped with a\nderivative action and a PP-based velocity reference generation are implemented.\nTuning ranges are established through a systematic time-delay stability\nanalysis. Validation in a representative controlled laboratory setting is\nprovided.\n","authors":["Antonio Gallina","Matteo Grandin","Angelo Cenedese","Mattia Bruschetta"],"pdf_url":"https://arxiv.org/pdf/2409.18097v1.pdf","comment":"16 pages, 23 figures"},{"id":"http://arxiv.org/abs/2409.18010v1","updated":"2024-09-26T16:19:49Z","published":"2024-09-26T16:19:49Z","title":"End-to-end guarantees for indirect data-driven control of bilinear\n systems with finite stochastic data","summary":" In this paper we propose an end-to-end algorithm for indirect data-driven\ncontrol for bilinear systems with stability guarantees. We consider the case\nwhere the collected i.i.d. data is affected by probabilistic noise with\npossibly unbounded support and leverage tools from statistical learning theory\nto derive finite sample identification error bounds. To this end, we solve the\nbilinear identification problem by solving a set of linear and affine\nidentification problems, by a particular choice of a control input during the\ndata collection phase. We provide a priori as well as data-dependent finite\nsample identification error bounds on the individual matrices as well as\nellipsoidal bounds, both of which are structurally suitable for control.\nFurther, we integrate the structure of the derived identification error bounds\nin a robust controller design to obtain an exponentially stable closed-loop. By\nmeans of an extensive numerical study we showcase the interplay between the\ncontroller design and the derived identification error bounds. Moreover, we\nnote appealing connections of our results to indirect data-driven control of\ngeneral nonlinear systems through Koopman operator theory and discuss how our\nresults may be applied in this setup.\n","authors":["Nicolas Chatzikiriakos","Robin Strässer","Frank Allgöwer","Andrea Iannelli"],"pdf_url":"https://arxiv.org/pdf/2409.18010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18009v1","updated":"2024-09-26T16:19:37Z","published":"2024-09-26T16:19:37Z","title":"Control Industrial Automation System with Large Language Models","summary":" Traditional industrial automation systems require specialized expertise to\noperate and complex reprogramming to adapt to new processes. Large language\nmodels offer the intelligence to make them more flexible and easier to use.\nHowever, LLMs' application in industrial settings is underexplored. This paper\nintroduces a framework for integrating LLMs to achieve end-to-end control of\nindustrial automation systems. At the core of the framework are an agent system\ndesigned for industrial tasks, a structured prompting method, and an\nevent-driven information modeling mechanism that provides real-time data for\nLLM inference. The framework supplies LLMs with real-time events on different\ncontext semantic levels, allowing them to interpret the information, generate\nproduction plans, and control operations on the automation system. It also\nsupports structured dataset creation for fine-tuning on this downstream\napplication of LLMs. Our contribution includes a formal system design,\nproof-of-concept implementation, and a method for generating task-specific\ndatasets for LLM fine-tuning and testing. This approach enables a more adaptive\nautomation system that can respond to spontaneous events, while allowing easier\noperation and configuration through natural language for more intuitive\nhuman-machine interaction. We provide demo videos and detailed data on GitHub:\nhttps://github.com/YuchenXia/LLM4IAS\n","authors":["Yuchen Xia","Nasser Jazdi","Jize Zhang","Chaitanya Shah","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2409.18009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17997v1","updated":"2024-09-26T16:07:38Z","published":"2024-09-26T16:07:38Z","title":"Distributed Invariant Unscented Kalman Filter based on Inverse\n Covariance Intersection with Intermittent Measurements","summary":" This paper studies the problem of distributed state estimation (DSE) over\nsensor networks on matrix Lie groups, which is crucial for applications where\nsystem states evolve on Lie groups rather than vector spaces. We propose a\ndiffusion-based distributed invariant Unscented Kalman Filter using the inverse\ncovariance intersection (DIUKF-ICI) method to address target tracking in 3D\nenvironments. Unlike existing distributed UKFs confined to vector spaces, our\napproach extends the distributed UKF framework to Lie groups, enabling local\nestimates to be fused with intermediate information from neighboring agents on\nLie groups. To handle the unknown correlations across local estimates, we\nextend the ICI fusion strategy to matrix Lie groups for the first time and\nintegrate it into the diffusion algorithm. We demonstrate that the estimation\nerror of the proposed method is bounded. Additionally, the algorithm is fully\ndistributed, robust against intermittent measurements, and adaptable to\ntime-varying communication topologies. The effectiveness of the proposed method\nis validated through extensive Monte-Carlo simulations.\n","authors":["Zhian Ruan","Yizhi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17988v1","updated":"2024-09-26T15:57:20Z","published":"2024-09-26T15:57:20Z","title":"Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or\n Low-light Conditions","summary":" The stark contrast in the design philosophy of an event camera makes it\nparticularly ideal for operating under high-speed, high dynamic range and\nlow-light conditions, where standard cameras underperform. Nonetheless, event\ncameras still suffer from some amount of motion blur, especially under these\nchallenging conditions, in contrary to what most think. This is attributed to\nthe limited bandwidth of the event sensor pixel, which is mostly proportional\nto the light intensity. Thus, to ensure that event cameras can truly excel in\nsuch conditions where it has an edge over standard cameras, it is crucial to\naccount for event motion blur in downstream applications, especially\nreconstruction. However, none of the recent works on reconstructing Neural\nRadiance Fields (NeRFs) from events, nor event simulators, have considered the\nfull effects of event motion blur. To this end, we propose, Deblur e-NeRF, a\nnovel method to directly and effectively reconstruct blur-minimal NeRFs from\nmotion-blurred events generated under high-speed motion or low-light\nconditions. The core component of this work is a physically-accurate pixel\nbandwidth model proposed to account for event motion blur under arbitrary speed\nand lighting conditions. We also introduce a novel threshold-normalized total\nvariation loss to improve the regularization of large textureless patches.\nExperiments on real and novel realistically simulated sequences verify our\neffectiveness. Our code, event simulator and synthetic event dataset will be\nopen-sourced.\n","authors":["Weng Fei Low","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17988v1.pdf","comment":"Accepted to ECCV 2024. Project website is accessible at\n https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with\n arXiv:2006.07722 by other authors"},{"id":"http://arxiv.org/abs/2409.17931v1","updated":"2024-09-26T15:08:38Z","published":"2024-09-26T15:08:38Z","title":"Intelligent Energy Management: Remaining Useful Life Prediction and\n Charging Automation System Comprised of Deep Learning and the Internet of\n Things","summary":" Remaining Useful Life (RUL) of battery is an important parameter to know the\nbattery's remaining life and need for recharge. The goal of this research\nproject is to develop machine learning-based models for the battery RUL\ndataset. Different ML models are developed to classify the RUL of the vehicle,\nand the IoT (Internet of Things) concept is simulated for automating the\ncharging system and managing any faults aligning. The graphs plotted depict the\nrelationship between various vehicle parameters using the Blynk IoT platform.\nResults show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent\nUnit (GRU), and hybrid model developed could classify RUL into three classes\nwith 99% more accuracy. The data is fed using the tkinter GUI for simulating\nartificial intelligence (AI)-based charging, and with a pyserial backend, data\ncan be entered into the Esp-32 microcontroller for making charge discharge\npossible with the model's predictions. Also, with an IoT system, the charging\ncan be disconnected, monitored, and analyzed for automation. The results show\nthat an accuracy of 99% can be obtained on models MLP, catboost model and\nsimilar accuracy on GRU model can be obtained, and finally relay-based\ntriggering can be made by prediction through the model used for automating the\ncharging and energy-saving mechanism. By showcasing an exemplary Blynk\nplatform-based monitoring and automation phenomenon, we further present\ninnovative ways of monitoring parameters and automating the system.\n","authors":["Biplov Paneru","Bishwash Paneru","DP Sharma Mainali"],"pdf_url":"https://arxiv.org/pdf/2409.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17916v1","updated":"2024-09-26T15:02:11Z","published":"2024-09-26T15:02:11Z","title":"Observer-Based Discontinuous Communication in the Secondary Control of\n AC Microgrids","summary":" This paper proposes an observer-based event-driven approach to decrease the\noveruse of communication networks. The suggested approach aims to estimate the\nrequired data for sharing between units in line with as much communication\nreduction as possible. In other words, the proposed approach effectively\ndetermines which state variables should be shared (observer concept) among the\nunits during specific time intervals (event-triggered concept). This strategy\nsignificantly reduces the overall communication load. It is shown that the\nestimation error remains bounded and Zeno behavior, characterized by an endless\nnumber of transmissions occurring within a limited time frame, does not occur.\nThe proposed methodology can be systematically applied to any\ncommunication-based secondary controller in alternating current (AC)\nmicrogrids. Simulation results demonstrate a high degree of precision in\nestimating the states under the proposed approach. Also, the secondary\ncontroller performance under the proposed method is evaluated in\nMATLAB/Simulink environment.\n","authors":["Shahabeddin Najafi","Yazdan Batmani","Pouya Shafiee","Charalambos Konstantinou"],"pdf_url":"https://arxiv.org/pdf/2409.17916v1.pdf","comment":"2024 IEEE PES Innovative Smart Grid Technologies Europe (ISGT Europe)"},{"id":"http://arxiv.org/abs/2409.17907v1","updated":"2024-09-26T14:52:51Z","published":"2024-09-26T14:52:51Z","title":"PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR","summary":" LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous\ndriving, offering precise 3D spatial information. Previous signal attacks\nagainst LiDAR systems mainly exploit laser signals. In this paper, we\ninvestigate the possibility of cross-modality signal injection attacks, i.e.,\ninjecting intentional electromagnetic interference (IEMI) to manipulate LiDAR\noutput. Our insight is that the internal modules of a LiDAR, i.e., the laser\nreceiving circuit, the monitoring sensors, and the beam-steering modules, even\nwith strict electromagnetic compatibility (EMC) testing, can still couple with\nthe IEMI attack signals and result in the malfunction of LiDAR systems. Based\non the above attack surfaces, we propose the PhantomLiDAR attack, which\nmanipulates LiDAR output in terms of Points Interference, Points Injection,\nPoints Removal, and even LiDAR Power-Off. We evaluate and demonstrate the\neffectiveness of PhantomLiDAR with both simulated and real-world experiments on\nfive COTS LiDAR systems. We also conduct feasibility experiments in real-world\nmoving scenarios. We provide potential defense measures that can be implemented\nat both the sensor level and the vehicle system level to mitigate the risks\nassociated with IEMI attacks. Video demonstrations can be viewed at\nhttps://sites.google.com/view/phantomlidar.\n","authors":["Zizhi Jin","Qinhong Jiang","Xuancun Lu","Chen Yan","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16899v2","updated":"2024-09-26T14:47:38Z","published":"2024-08-29T20:53:03Z","title":"Network-aware Recommender System via Online Feedback Optimization","summary":" Personalized content on social platforms can exacerbate negative phenomena\nsuch as polarization, partly due to the feedback interactions between\nrecommendations and the users. In this paper, we present a control-theoretic\nrecommender system that explicitly accounts for this feedback loop to mitigate\npolarization. Our approach extends online feedback optimization - a control\nparadigm for steady-state optimization of dynamical systems - to develop a\nrecommender system that trades off users engagement and polarization reduction,\nwhile relying solely on online click data. We establish theoretical guarantees\nfor optimality and stability of the proposed design and validate its\neffectiveness via numerical experiments with a user population governed by\nFriedkin-Johnsen dynamics. Our results show these \"network-aware\"\nrecommendations can significantly reduce polarization while maintaining high\nlevels of user engagement.\n","authors":["Sanjay Chandrasekaran","Giulia De Pasquale","Giuseppe Belgioioso","Florian Dörfler"],"pdf_url":"https://arxiv.org/pdf/2408.16899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17896v1","updated":"2024-09-26T14:47:14Z","published":"2024-09-26T14:47:14Z","title":"Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV\n Attitude Control Under Varying Wind Conditions","summary":" This paper evaluates and compares the performance of model-free and\nmodel-based reinforcement learning for the attitude control of fixed-wing\nunmanned aerial vehicles using PID as a reference point. The comparison focuses\non their ability to handle varying flight dynamics and wind disturbances in a\nsimulated environment. Our results show that the Temporal Difference Model\nPredictive Control agent outperforms both the PID controller and other\nmodel-free reinforcement learning methods in terms of tracking accuracy and\nrobustness over different reference difficulties, particularly in nonlinear\nflight regimes. Furthermore, we introduce actuation fluctuation as a key metric\nto assess energy efficiency and actuator wear, and we test two different\napproaches from the literature: action variation penalty and conditioning for\naction policy smoothness. We also evaluate all control methods when subject to\nstochastic turbulence and gusts separately, so as to measure their effects on\ntracking performance, observe their limitations and outline their implications\non the Markov decision process formalism.\n","authors":["David Olivares","Pierre Fournier","Pavan Vasishta","Julien Marzat"],"pdf_url":"https://arxiv.org/pdf/2409.17896v1.pdf","comment":"Published at ICINCO 2024"},{"id":"http://arxiv.org/abs/2409.17881v1","updated":"2024-09-26T14:28:20Z","published":"2024-09-26T14:28:20Z","title":"Discontinuous Reception with Adjustable Inactivity Timer for IIoT","summary":" Discontinuous reception (DRX) is a key technology for reducing the energy\nconsumption of industrial Internet of Things (IIoT) devices. Specifically, DRX\nallows the devices to operate in a low-power mode when no data reception is\nscheduled, and its effectiveness depends on the proper configuration of the DRX\nparameters. In this paper, we characterize the DRX process departing from a\nsemi-Markov chain modeling. We detail two ways to set DRX parameters to\nminimize the device power consumption while meeting a mean delay constraint.\nThe first method exhaustively searches for the optimal configuration. In\ncontrast, the second method uses a low-complexity metaheuristic to find a\nsub-optimal configuration, thus considering ideal and practical DRX\nconfigurations. Notably, within the DRX parameters, the inactivity timer (IT)\nis a caution time that specifies how long a device remains active after the\nlast information exchange. Traditionally, a device implementing DRX will\nrestart the IT after each data reception as a precedent to a low-power mode.\nThe usual approach lies in restarting the IT whenever new data is received\nduring this cautious period, which might sometimes needlessly extend the active\ntime. Herein, we propose a more efficient method in which the transmit base\nstation (BS) explicitly indicates restarting the timer through the control\nchannel only when appropriate. The decision is taken based on the BS's\nknowledge about its buffer status. We consider Poisson and bursty traffic\nmodels, which are typical in IIoT setups, and verify the suitability of our\nproposal for reducing the energy consumption of the devices without\nsignificantly compromising the communication latency through extensive\nnumerical simulations. Specifically, energy-saving gains of up to 30% can be\nobtained regardless of the arrival rate and delay constraints.\n","authors":["David E. Ruíz-Guirola","Carlos A. Rodríguez-López","Onel L. A. López","Samuel Montejo-Sánchez","Vitalio Alfonso Reguera","Matti Latva-aho"],"pdf_url":"https://arxiv.org/pdf/2409.17881v1.pdf","comment":"IEEE Transactions on Industrial Informatics (2024)"},{"id":"http://arxiv.org/abs/2409.03175v2","updated":"2024-09-26T12:05:28Z","published":"2024-09-05T02:08:30Z","title":"Data-based approaches to learning and control by similarity between\n heterogeneous systems","summary":" This paper proposes basic definitions of similarity and similarity indexes\nbetween admissible behaviors of heterogeneous host and guest systems and\nfurther presents a similarity-based learning control framework by exploiting\nthe offline sampled data. By exploring helpful geometric properties of the\nadmissible behavior and decomposing it into the subspace and offset components,\nthe similarity indexes between two admissible behaviors are defined as the\nprincipal angles between their corresponding subspace components. By\nreconstructing the admissible behaviors leveraging sampled data, an efficient\nstrategy for calculating the similarity indexes is developed, based on which a\nsimilarity-based learning control framework is proposed. It is shown that, with\nthe application of similarity-based learning control, the host system can\ndirectly accomplish the same control tasks by utilizing the successful\nexperience provided by the guest system, without having to undergo the\ntrial-and-error process. All results in this paper are supported by simulation\nexamples.\n","authors":["Chenchao Wang","Deyuan Meng"],"pdf_url":"https://arxiv.org/pdf/2409.03175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17720v1","updated":"2024-09-26T10:43:09Z","published":"2024-09-26T10:43:09Z","title":"Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations\n Between Initial and Final Scenes","summary":" With robots increasingly collaborating with humans in everyday tasks, it is\nimportant to take steps toward robotic systems capable of understanding the\nenvironment. This work focuses on scene understanding to detect pick and place\ntasks given initial and final images from the scene. To this end, a dataset is\ncollected for object detection and pick and place task detection. A YOLOv5\nnetwork is subsequently trained to detect the objects in the initial and final\nscenes. Given the detected objects and their bounding boxes, two methods are\nproposed to detect the pick and place tasks which transform the initial scene\ninto the final scene. A geometric method is proposed which tracks objects'\nmovements in the two scenes and works based on the intersection of the bounding\nboxes which moved within scenes. Contrarily, the CNN-based method utilizes a\nConvolutional Neural Network to classify objects with intersected bounding\nboxes into 5 classes, showing the spatial relationship between the involved\nobjects. The performed pick and place tasks are then derived from analyzing the\nexperiments with both scenes. Results show that the CNN-based method, using a\nVGG16 backbone, outscores the geometric method by roughly 12 percentage points\nin certain scenarios, with an overall success rate of 84.3%.\n","authors":["Seraj Ghasemi","Hamed Hosseini","MohammadHossein Koosheshi","Mehdi Tale Masouleh","Ahmad Kalhor"],"pdf_url":"https://arxiv.org/pdf/2409.17720v1.pdf","comment":"Conference Paper, ICEE 2024, 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17705v1","updated":"2024-09-26T10:21:39Z","published":"2024-09-26T10:21:39Z","title":"On the Output Redundancy of LTI Systems: A Geometric Approach with\n Application to Privacy","summary":" This paper examines the properties of output-redundant systems, that is,\nsystems possessing a larger number of outputs than inputs, through the lenses\nof the geometric approach of Wonham et al. We begin by formulating a simple\noutput allocation synthesis problem, which involves ``concealing\" input\ninformation from a malicious eavesdropper having access to the system output,\nwhile still allowing for a legitimate user to reconstruct it. It is shown that\nthe solvability of this problem requires the availability of a redundant set of\noutputs. This very problem is instrumental to unveiling the fundamental\ngeometric properties of output-redundant systems, which form the basis for our\nsubsequent constructions and results. As a direct application, we demonstrate\nhow output allocation can be employed to effectively protect the information of\ninput information from certain output eavesdroppers with guaranteed results.\n","authors":["Guitao Yang","Alexander J. Gallo","Angelo Barboni","Riccardo M. G. Ferrari","Andrea Serrani","Thomas Parisini"],"pdf_url":"https://arxiv.org/pdf/2409.17705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10668v2","updated":"2024-09-26T09:38:01Z","published":"2024-02-16T13:19:04Z","title":"Data-Driven Abstractions for Control Systems via Random Exploration","summary":" At the intersection of dynamical systems, control theory, and formal methods\nlies the construction of symbolic abstractions: these typically represent\nsimpler, finite-state models whose behavior mimics that of an underlying\nconcrete system but are easier to analyse. Building an abstraction usually\nrequires an accurate knowledge of the underlying model: this knowledge may be\ncostly to gather, especially in real-life applications. We aim to bridge this\ngap by building abstractions based on sampling finite length trajectories. To\nrefine a controller built for the abstraction to one for the concrete system,\nwe newly define a notion of probabilistic alternating simulation, and provide\nProbably Approximately Correct (PAC) guarantees that the constructed\nabstraction includes all behaviors of the concrete system and that it is\nsuitable for control design, for arbitrarily long time horizons, leveraging\nscenario theory. Our method is then tested on several numerical benchmarks.\n","authors":["Rudi Coppola","Andrea Peruffo","Manuel Mazo Jr"],"pdf_url":"https://arxiv.org/pdf/2402.10668v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17672v1","updated":"2024-09-26T09:31:42Z","published":"2024-09-26T09:31:42Z","title":"Semantic model for the description of energy data in the Module Type\n Package","summary":" Modular production systems that employ the Module Type Package (MTP) to\ndescribe module interfaces can, at present, only communicate energy data\nthrough proprietary solutions. Due to this limitation, users face additional\neffort when calculating energy KPIs for modules or determining the energy\nefficiency of modules. To address this issue, we present a model that\nfacilitates energy data to be described semantically and uniformly in the MTP\non the basis of an industrial standard (OPC 34100). MTPs incorporating this\nmodel can transmit semantically consistent energy data from modules to the\nprocess control system, making the data available for further applications,\nsuch as monitoring or optimization.\n","authors":["Leif-Thore Reiche","Felix Gehlhoff","Alexander Fay"],"pdf_url":"https://arxiv.org/pdf/2409.17672v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17613v1","updated":"2024-09-26T08:03:33Z","published":"2024-09-26T08:03:33Z","title":"Stereographic Projection of Probabilistic Frequency-Domain Uncertainty","summary":" This paper investigates the stereographic projection of points along the\nNyquist plots of single input single output (SISO) linear time invariant (LTI)\nsystems subject to probabilistic uncertainty. At each frequency, there\ncorresponds a complex-valued random variable with given probability\ndistribution in the complex plane. The chordal distance between the\nstereographic projections of this complex value and the corresponding value for\na nominal model, as per the well-known Nu-Gap metric of Vinnicombe, is also a\nrandom quantity. The main result provides the cumulative density function (CDF)\nof the chordal distance at a given frequency. Such a stochastic distance\nframework opens up a fresh and a fertile research direction on probabilistic\nrobust control theory.\n","authors":["Anton Nystrom","Venkatraman Renganathan","Michael Cantoni"],"pdf_url":"https://arxiv.org/pdf/2409.17613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04436v2","updated":"2024-09-26T07:43:44Z","published":"2023-09-30T12:54:55Z","title":"Adaptive Control of an Inverted Pendulum by a Reinforcement\n Learning-based LQR Method","summary":" Inverted pendulums constitute one of the popular systems for benchmarking\ncontrol algorithms. Several methods have been proposed for the control of this\nsystem, the majority of which rely on the availability of a mathematical model.\nHowever, deriving a mathematical model using physical parameters or system\nidentification techniques requires manual effort. Moreover, the designed\ncontrollers may perform poorly if system parameters change. To mitigate these\nproblems, recently, some studies used Reinforcement Learning (RL) based\napproaches for the control of inverted pendulum systems. Unfortunately, these\nmethods suffer from slow convergence and local minimum problems. Moreover, they\nmay require hyperparameter tuning which complicates the design process\nsignificantly. To alleviate these problems, the present study proposes an\nLQR-based RL method for adaptive balancing control of an inverted pendulum. As\nshown by numerical experiments, the algorithm stabilizes the system very fast\nwithout requiring a mathematical model or extensive hyperparameter tuning. In\naddition, it can adapt to parametric changes online.\n","authors":["Ugur Yildiran"],"pdf_url":"https://arxiv.org/pdf/2310.04436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15958v2","updated":"2024-09-26T06:41:11Z","published":"2024-03-23T23:27:37Z","title":"Convection-Enabled Boundary Control of a 2D Channel Flow","summary":" Nonlinear convection, the source of turbulence in fluid flows, may hold the\nkey to stabilizing turbulence by solving a specific cubic polynomial equation.\nWe consider the incompressible Navier-Stokes equations in a two-dimensional\nchannel. The tangential and normal velocities are assumed to be periodic in the\nstreamwise direction. The pressure difference between the left and right ends\nof the channel is constant. Moreover, we consider no-slip boundary conditions,\nthat is, zero tangential velocity, at the top and bottom walls of the channel,\nand normal velocity actuation at the top and bottom walls. We design the\nboundary control inputs to achieve global exponential stabilization, in the L2\nsense, of a chosen Poiseuille equilibrium profile for an arbitrarily large\nReynolds number. The key idea behind our approach is to select the boundary\ncontrollers such that they have zero spatial mean (to guarantee mass\nconservation) but non-zero spatial cubic mean. We reveal that, because of\nconvection, the time derivative of the L2 energy of the regulation error is a\ncubic polynomial in the cubic mean of the boundary inputs. Regulation is then\nachieved by solving a specific cubic equation, using the Cardano root formula.\nThe results are illustrated via a numerical example.\n","authors":["Mohamed Camil Belhadjoudja","Miroslav Krstic","Emmanuel Witrant"],"pdf_url":"https://arxiv.org/pdf/2403.15958v2.pdf","comment":"To be presented at the 63rd IEEE Conference on Decision and Control\n (CDC 2024)"},{"id":"http://arxiv.org/abs/2409.13624v2","updated":"2024-09-26T06:40:18Z","published":"2024-09-20T16:34:31Z","title":"Safe stabilization using generalized Lyapunov barrier function","summary":" This paper addresses the safe stabilization problem, focusing on controlling\nthe system state to the origin while avoiding entry into unsafe state sets. The\ncurrent methods for solving this issue rely on smooth Lyapunov and barrier\nfunctions, which do not always ensure the existence of an effective controller\neven when such smooth functions are created. To tackle this challenge, we\nintroduce the concept of a generalized (nonsmooth) Lyapunov barrier function\n(GenLBF), which guarantees the existence of a safe and stable controller. We\noutline a systematic approach for constructing a GenLBF, including a technique\nfor efficiently calculating the upper generalized derivative of the GenLBF.\nUsing the constructed GenLBF, we propose a method for certifying safe\nstabilization of autonomous systems and design a piecewise continuous feedback\ncontrol to achieve safe stabilization of non-autonomous systems. A general\ncontroller refinement strategy is further proposed to help the state trajectory\nescape from undesired local points occurring in systems with special physical\nstructure. A thorough theoretical analysis demonstrates the effectiveness of\nour method in addressing the safe stabilization problem for systems with single\nor multiple bounded unsafe state sets. Extensive simulations of linear and\nnonlinear systems further illustrate the efficacy of the proposed method and\nits superiority over the smooth control Lyapunov barrier function method.\n","authors":["Jianglin Lan","Eldert van Henten","Peter Groot Koerkamp","Congcong Sun"],"pdf_url":"https://arxiv.org/pdf/2409.13624v2.pdf","comment":"19 pages, 14 figures, under review by a journal"},{"id":"http://arxiv.org/abs/2409.17500v1","updated":"2024-09-26T03:12:53Z","published":"2024-09-26T03:12:53Z","title":"GLinSAT: The General Linear Satisfiability Neural Network Layer By\n Accelerated Gradient Descent","summary":" Ensuring that the outputs of neural networks satisfy specific constraints is\ncrucial for applying neural networks to real-life decision-making problems. In\nthis paper, we consider making a batch of neural network outputs satisfy\nbounded and general linear constraints. We first reformulate the neural network\noutput projection problem as an entropy-regularized linear programming problem.\nWe show that such a problem can be equivalently transformed into an\nunconstrained convex optimization problem with Lipschitz continuous gradient\naccording to the duality theorem. Then, based on an accelerated gradient\ndescent algorithm with numerical performance enhancement, we present our\narchitecture, GLinSAT, to solve the problem. To the best of our knowledge, this\nis the first general linear satisfiability layer in which all the operations\nare differentiable and matrix-factorization-free. Despite the fact that we can\nexplicitly perform backpropagation based on automatic differentiation\nmechanism, we also provide an alternative approach in GLinSAT to calculate the\nderivatives based on implicit differentiation of the optimality condition.\nExperimental results on constrained traveling salesman problems, partial graph\nmatching with outliers, predictive portfolio allocation and power system unit\ncommitment demonstrate the advantages of GLinSAT over existing satisfiability\nlayers.\n","authors":["Hongtai Zeng","Chao Yang","Yanzhen Zhou","Cheng Yang","Qinglai Guo"],"pdf_url":"https://arxiv.org/pdf/2409.17500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2409.17488v1","updated":"2024-09-26T02:50:32Z","published":"2024-09-26T02:50:32Z","title":"Optimal control of stochastic reaction networks with entropic control\n cost and emergence of mode-switching strategies","summary":" Controlling the stochastic dynamics of biological populations is a challenge\nthat arises across various biological contexts. However, these dynamics are\ninherently nonlinear and involve a discrete state space, i.e., the number of\nmolecules, cells, or organisms. Additionally, the possibility of extinction has\na significant impact on both the dynamics and control strategies, particularly\nwhen the population size is small. These factors hamper the direct application\nof conventional control theories to biological systems. To address these\nchallenges, we formulate the optimal control problem for stochastic population\ndynamics by utilizing a control cost function based on the Kullback-Leibler\ndivergence. This approach naturally accounts for population-specific factors\nand simplifies the complex nonlinear Hamilton-Jacobi-Bellman equation into a\nlinear form, facilitating efficient computation of optimal solutions. We\ndemonstrate the effectiveness of our approach by applying it to the control of\ninteracting random walkers, Moran processes, and SIR models, and observe the\nmode-switching phenomena in the control strategies. Our approach provides new\nopportunities for applying control theory to a wide range of biological\nproblems.\n","authors":["Shuhei A. Horiguchi","Tetsuya J. Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2409.17488v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.15139v2","updated":"2024-09-26T00:58:52Z","published":"2024-09-23T15:42:53Z","title":"The Top Manifold Connectedness of Quantum Control Landscapes","summary":" The control of quantum systems has been proven to possess trap-free\noptimization landscapes under the satisfaction of proper assumptions. However,\nmany details of the landscape geometry and their influence on search efficiency\nstill need to be fully understood. This paper numerically explores the\npath-connectedness of globally optimal control solutions forming the top\nmanifold of the landscape. We randomly sample a plurality of optimal controls\nin the top manifold to assess the existence of a continuous path at the top of\nthe landscape that connects two arbitrary optimal solutions. It is shown that\nfor different quantum control objectives including state-to-state transition\nprobabilities, observable expectation values and unitary transformations, such\na continuous path can be readily found, implying that these top manifolds are\nfundamentally path-connected. The significance of the latter conjecture lies in\nseeking locations in the top manifold where an ancillary objective can also be\noptimized while maintaining the full optimality of the original objective that\ndefined the landscape.\n","authors":["Yidian Fan","Re-Bing Wu","Tak-San Ho","Gaurav V. Bhole","Herschel Rabitz"],"pdf_url":"https://arxiv.org/pdf/2409.15139v2.pdf","comment":"34 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.18317v1","updated":"2024-09-26T21:51:49Z","published":"2024-09-26T21:51:49Z","title":"Survey of Moving Target Defense in Power Grids: Design Principles,\n Tradeoffs, and Future Directions","summary":" Moving target defense (MTD) in power grids is an emerging defense technique\nthat has gained prominence in the recent past. It aims to solve the\nlong-standing problem of securing the power grid against stealthy attacks. The\nkey idea behind MTD is to introduce periodic/event-triggered controlled changes\nto the power grid's SCADA network/physical plant, thereby invalidating the\nknowledge attackers use for crafting stealthy attacks. In this paper, we\nprovide a comprehensive overview of this topic and classify the different ways\nin which MTD is implemented in power grids. We further introduce the guiding\nprinciples behind the design of MTD, key performance metrics, and the\nassociated trade-offs in MTD and identify the future development of MTD for\npower grid security.\n","authors":["Subhash Lakshminarayana","Yexiang Chen","Charalambos Konstantinou","Daisuke Mashima","Anurag K. Srivastava"],"pdf_url":"https://arxiv.org/pdf/2409.18317v1.pdf","comment":"10 pages, 3 figures, survey"},{"id":"http://arxiv.org/abs/2402.06778v2","updated":"2024-09-26T21:30:35Z","published":"2024-02-09T20:49:51Z","title":"Distributed Quasi-Newton Method for Multi-Agent Optimization","summary":" We present a distributed quasi-Newton (DQN) method, which enables a group of\nagents to compute an optimal solution of a separable multi-agent optimization\nproblem locally using an approximation of the curvature of the aggregate\nobjective function. Each agent computes a descent direction from its local\nestimate of the aggregate Hessian, obtained from quasi-Newton approximation\nschemes using the gradient of its local objective function. Moreover, we\nintroduce a distributed quasi-Newton method for equality-constrained\noptimization (EC-DQN), where each agent takes Karush-Kuhn-Tucker-like update\nsteps to compute an optimal solution. In our algorithms, each agent\ncommunicates with its one-hop neighbors over a peer-to-peer communication\nnetwork to compute a common solution. We prove convergence of our algorithms to\na stationary point of the optimization problem. In addition, we demonstrate the\ncompetitive empirical convergence of our algorithm in both well-conditioned and\nill-conditioned optimization problems, in terms of the computation time and\ncommunication cost incurred by each agent for convergence, compared to existing\ndistributed first-order and second-order methods. Particularly, in\nill-conditioned problems, our algorithms achieve a faster computation time for\nconvergence, while requiring a lower communication cost, across a range of\ncommunication networks with different degrees of connectedness.\n","authors":["Ola Shorinwa","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2402.06778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18304v1","updated":"2024-09-26T21:22:10Z","published":"2024-09-26T21:22:10Z","title":"Multi-platoon car-following models with flexible platoon sizes and\n communication levels","summary":" In this paper, we extend a single platoon car-following (CF) model to some\nmulti-platoon CF models for connected and autonomous vehicles (CAVs) with\nflexible platoon size and communication level. Specifically, we consider\nforward and backward communication methods between platoons with delays. Some\ngeneral results of linear stability are mathematically proven, and numerical\nsimulations are performed to illustrate the effects of platoon sizes and\ncommunication levels, as well as to demonstrate the potential for stabilizing\nhuman-driven vehicles (HDVs) in mixed traffic conditions. The simulation\nresults are consistent with theoretical analysis, and demonstrate that in the\nring road scenario, CAV platoons can stabilize certain percentage of HDVs. This\npaper can provide suggestions for the design of communication system of\nautonomous vehicles (AVs), and management of mixed traffic flow of CAVs and\nHDVs.\n","authors":["Shouwei Hui","Michael Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18304v1.pdf","comment":"Preprint for IEEE"},{"id":"http://arxiv.org/abs/2409.18298v1","updated":"2024-09-26T21:10:50Z","published":"2024-09-26T21:10:50Z","title":"Causality-based Subject and Task Fingerprints using fMRI Time-series\n Data","summary":" Recently, there has been a revived interest in system neuroscience causation\nmodels due to their unique capability to unravel complex relationships in\nmulti-scale brain networks. In this paper, our goal is to verify the\nfeasibility and effectiveness of using a causality-based approach for fMRI\nfingerprinting. Specifically, we propose an innovative method that utilizes the\ncausal dynamics activities of the brain to identify the unique cognitive\npatterns of individuals (e.g., subject fingerprint) and fMRI tasks (e.g., task\nfingerprint). The key novelty of our approach stems from the development of a\ntwo-timescale linear state-space model to extract 'spatio-temporal' (aka\ncausal) signatures from an individual's fMRI time series data. To the best of\nour knowledge, we pioneer and subsequently quantify, in this paper, the concept\nof 'causal fingerprint.' Our method is well-separated from other fingerprint\nstudies as we quantify fingerprints from a cause-and-effect perspective, which\nare then incorporated with a modal decomposition and projection method to\nperform subject identification and a GNN-based (Graph Neural Network) model to\nperform task identification. Finally, we show that the experimental results and\ncomparisons with non-causality-based methods demonstrate the effectiveness of\nthe proposed methods. We visualize the obtained causal signatures and discuss\ntheir biological relevance in light of the existing understanding of brain\nfunctionalities. Collectively, our work paves the way for further studies on\ncausal fingerprints with potential applications in both healthy controls and\nneurodegenerative diseases.\n","authors":["Dachuan Song","Li Shen","Duy Duong-Tran","Xuan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18289v1","updated":"2024-09-26T21:00:45Z","published":"2024-09-26T21:00:45Z","title":"Criticality and Safety Margins for Reinforcement Learning","summary":" State of the art reinforcement learning methods sometimes encounter unsafe\nsituations. Identifying when these situations occur is of interest both for\npost-hoc analysis and during deployment, where it might be advantageous to call\nout to a human overseer for help. Efforts to gauge the criticality of different\npoints in time have been developed, but their accuracy is not well established\ndue to a lack of ground truth, and they are not designed to be easily\ninterpretable by end users. Therefore, we seek to define a criticality\nframework with both a quantifiable ground truth and a clear significance to\nusers. We introduce true criticality as the expected drop in reward when an\nagent deviates from its policy for n consecutive random actions. We also\nintroduce the concept of proxy criticality, a low-overhead metric that has a\nstatistically monotonic relationship to true criticality. Safety margins make\nthese interpretable, when defined as the number of random actions for which\nperformance loss will not exceed some tolerance with high confidence. We\ndemonstrate this approach in several environment-agent combinations; for an A3C\nagent in an Atari Beamrider environment, the lowest 5% of safety margins\ncontain 47% of agent losses; i.e., supervising only 5% of decisions could\npotentially prevent roughly half of an agent's errors. This criticality\nframework measures the potential impacts of bad decisions, even before those\ndecisions are made, allowing for more effective debugging and oversight of\nautonomous agents.\n","authors":["Alexander Grushin","Walt Woods","Alvaro Velasquez","Simon Khan"],"pdf_url":"https://arxiv.org/pdf/2409.18289v1.pdf","comment":"17 pages, 10 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2409.18281v1","updated":"2024-09-26T20:50:00Z","published":"2024-09-26T20:50:00Z","title":"Optimizing Downlink C-NOMA Transmission with Movable Antennas: A\n DDPG-based Approach","summary":" This paper analyzes a downlink C-NOMA scenario where a base station (BS) is\ndeployed to serve a pair of users equipped with movable antenna (MA)\ntechnology. The user with better channel conditions with the BS will be able to\ntransmit the signal to the other user providing an extra transmission resource\nand enhancing performance. Both users are equipped with a receiving MA each and\na transmitting MA for the relaying user. In this regard, we formulate an\noptimization problem with the objective of maximizing the achievable sum rate\nby jointly determining the beamforming vector at the BS, the transmit power at\nthe device and the positions of the MAs while meeting the quality of service\n(QoS) constraints. Due to the non-convex structure of the formulated problem\nand the randomness in the channels we adopt a deep deterministic policy\ngradient (DDPG) approach, a reinforcement learning (RL) algorithm capable of\ndealing with continuous state and action spaces. Numerical results demonstrate\nthe superiority of the presented model compared to the other benchmark schemes\nshowing gains reaching 45% compared to the NOMA enabled MA scheme and 60%\ncompared to C-NOMA model with fixed antennas. The solution approach showed 93%\naccuracy compared to the optimal solution.\n","authors":["Ali Amhaz","Mohamed Elhattab","Chadi Assi","Sanaa Sharafeddine"],"pdf_url":"https://arxiv.org/pdf/2409.18281v1.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2407.00312v2","updated":"2024-09-26T17:55:37Z","published":"2024-06-29T04:29:03Z","title":"UDC: A Unified Neural Divide-and-Conquer Framework for Large-Scale\n Combinatorial Optimization Problems","summary":" Single-stage neural combinatorial optimization solvers have achieved\nnear-optimal results on various small-scale combinatorial optimization (CO)\nproblems without needing expert knowledge. However, these solvers exhibit\nsignificant performance degradation when applied to large-scale CO problems.\nRecently, two-stage neural methods with divide-and-conquer strategies have\nshown efficiency in addressing large-scale CO problems. Nevertheless, the\nperformance of these methods highly relies on problem-specific heuristics in\neither the divide or the conquer procedure, which limits their applicability to\ngeneral CO problems. Moreover, these methods employ separate training schemes\nand ignore the interdependencies between the dividing and conquering\nstrategies, which often leads to sub-optimal solutions. To tackle these\ndrawbacks, this article develops a unified neural divide-and-conquer framework\n(i.e., UDC) for solving general large-scale CO problems. UDC offers a\nDivide-Conquer-Reunion (DCR) training method to eliminate the negative impact\nof a sub-optimal dividing policy. Employing a high-efficiency Graph Neural\nNetwork (GNN) for global instance dividing and a fixed-length sub-path solver\nfor conquering divided sub-problems, the proposed UDC framework demonstrates\nextensive applicability, achieving superior performance in 10 representative\nlarge-scale CO problems. The code is available at\nhttps://github.com/CIAM-Group/NCO_code/tree/main/single_objective/UDC-Large-scale-CO-master.\n","authors":["Zhi Zheng","Changliang Zhou","Tong Xialiang","Mingxuan Yuan","Zhenkun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.00312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18101v1","updated":"2024-09-26T17:44:52Z","published":"2024-09-26T17:44:52Z","title":"AI-Powered Augmented Reality for Satellite Assembly, Integration and\n Test","summary":" The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is\nset to transform satellite Assembly, Integration, and Testing (AIT) processes\nby enhancing precision, minimizing human error, and improving operational\nefficiency in cleanroom environments. This paper presents a technical\ndescription of the European Space Agency's (ESA) project \"AI for AR in\nSatellite AIT,\" which combines real-time computer vision and AR systems to\nassist technicians during satellite assembly. Leveraging Microsoft HoloLens 2\nas the AR interface, the system delivers context-aware instructions and\nreal-time feedback, tackling the complexities of object recognition and 6D pose\nestimation in AIT workflows. All AI models demonstrated over 70% accuracy, with\nthe detection model exceeding 95% accuracy, indicating a high level of\nperformance and reliability. A key contribution of this work lies in the\neffective use of synthetic data for training AI models in AR applications,\naddressing the significant challenges of obtaining real-world datasets in\nhighly dynamic satellite environments, as well as the creation of the Segmented\nAnything Model for Automatic Labelling (SAMAL), which facilitates the automatic\nannotation of real data, achieving speeds up to 20 times faster than manual\nhuman annotation. The findings demonstrate the efficacy of AI-driven AR systems\nin automating critical satellite assembly tasks, setting a foundation for\nfuture innovations in the space industry.\n","authors":["Alvaro Patricio","Joao Valente","Atabak Dehban","Ines Cadilha","Daniel Reis","Rodrigo Ventura"],"pdf_url":"https://arxiv.org/pdf/2409.18101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18099v1","updated":"2024-09-26T17:44:20Z","published":"2024-09-26T17:44:20Z","title":"EfficientCrackNet: A Lightweight Model for Crack Segmentation","summary":" Crack detection, particularly from pavement images, presents a formidable\nchallenge in the domain of computer vision due to several inherent complexities\nsuch as intensity inhomogeneity, intricate topologies, low contrast, and noisy\nbackgrounds. Automated crack detection is crucial for maintaining the\nstructural integrity of essential infrastructures, including buildings,\npavements, and bridges. Existing lightweight methods often face challenges\nincluding computational inefficiency, complex crack patterns, and difficult\nbackgrounds, leading to inaccurate detection and impracticality for real-world\napplications. To address these limitations, we propose EfficientCrackNet, a\nlightweight hybrid model combining Convolutional Neural Networks (CNNs) and\ntransformers for precise crack segmentation. EfficientCrackNet integrates\ndepthwise separable convolutions (DSC) layers and MobileViT block to capture\nboth global and local features. The model employs an Edge Extraction Method\n(EEM) and for efficient crack edge detection without pretraining, and\nUltra-Lightweight Subspace Attention Module (ULSAM) to enhance feature\nextraction. Extensive experiments on three benchmark datasets Crack500,\nDeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior\nperformance compared to existing lightweight models, while requiring only 0.26M\nparameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance\nbetween accuracy and computational efficiency, outperforming state-of-the-art\nlightweight models, and providing a robust and adaptable solution for\nreal-world crack segmentation.\n","authors":["Abid Hasan Zim","Aquib Iqbal","Zaid Al-Huda","Asad Malik","Minoru Kuribayash"],"pdf_url":"https://arxiv.org/pdf/2409.18099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12822v3","updated":"2024-09-26T17:39:44Z","published":"2024-06-18T17:43:47Z","title":"Is It Good Data for Multilingual Instruction Tuning or Just Bad\n Multilingual Evaluation for Large Language Models?","summary":" Multilingual large language models are designed, claimed, and expected to\ncater to speakers of varied languages. We hypothesise that the current\npractices of fine-tuning and evaluating these models may not perfectly align\nwith this objective owing to a heavy reliance on translation, which cannot\ncover language-specific knowledge but can introduce translation defects. It\nremains unknown whether the nature of the instruction data has an impact on the\nmodel output; conversely, it is questionable whether translated test sets can\ncapture such nuances. Due to the often coupled practices of using translated\ndata in both stages, such imperfections could have been overlooked. This work\ninvestigates these issues using controlled native or translated data during the\ninstruction tuning and evaluation stages. We show that native or generation\nbenchmarks reveal a notable difference between native and translated\ninstruction data especially when model performance is high, whereas other types\nof test sets cannot. The comparison between round-trip and single-pass\ntranslations reflects the importance of knowledge from language-native\nresources. Finally, we demonstrate that regularization is beneficial to\nbridging this gap on structured but not generative tasks.\n","authors":["Pinzhen Chen","Simon Yu","Zhicheng Guo","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2406.12822v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18092v1","updated":"2024-09-26T17:39:05Z","published":"2024-09-26T17:39:05Z","title":"DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion\n Probabilistic Models","summary":" Perception systems play a crucial role in autonomous driving, incorporating\nmultiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors\nare widely used to capture sparse point clouds of the vehicle's surroundings.\nHowever, such systems struggle to perceive occluded areas and gaps in the scene\ndue to the sparsity of these point clouds and their lack of semantics. To\naddress these challenges, Semantic Scene Completion (SSC) jointly predicts\nunobserved geometry and semantics in the scene given raw LiDAR measurements,\naiming for a more complete scene representation. Building on promising results\nof diffusion models in image generation and super-resolution tasks, we propose\ntheir extension to SSC by implementing the noising and denoising diffusion\nprocesses in the point and semantic spaces individually. To control the\ngeneration, we employ semantic LiDAR point clouds as conditional input and\ndesign local and global regularization losses to stabilize the denoising\nprocess. We evaluate our approach on autonomous driving datasets and our\napproach outperforms the state-of-the-art for SSC.\n","authors":["Helin Cao","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2409.18092v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.16898v2","updated":"2024-09-26T17:38:14Z","published":"2024-09-25T13:08:10Z","title":"AI-driven View Guidance System in Intra-cardiac Echocardiography Imaging","summary":" Intra-cardiac Echocardiography (ICE) is a crucial imaging modality used in\nelectrophysiology (EP) and structural heart disease (SHD) interventions,\nproviding real-time, high-resolution views from within the heart. Despite its\nadvantages, effective manipulation of the ICE catheter requires significant\nexpertise, which can lead to inconsistent outcomes, particularly among less\nexperienced operators. To address this challenge, we propose an AI-driven\nclosed-loop view guidance system with human-in-the-loop feedback, designed to\nassist users in navigating ICE imaging without requiring specialized knowledge.\nOur method models the relative position and orientation vectors between\narbitrary views and clinically defined ICE views in a spatial coordinate\nsystem, guiding users on how to manipulate the ICE catheter to transition from\nthe current view to the desired view over time. Operating in a closed-loop\nconfiguration, the system continuously predicts and updates the necessary\ncatheter manipulations, ensuring seamless integration into existing clinical\nworkflows. The effectiveness of the proposed system is demonstrated through a\nsimulation-based evaluation, achieving an 89% success rate with the 6532 test\ndataset, highlighting its potential to improve the accuracy and efficiency of\nICE imaging procedures.\n","authors":["Jaeyoung Huh","Paul Klein","Gareth Funka-Lea","Puneet Sharma","Ankur Kapoor","Young-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2409.16898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18084v1","updated":"2024-09-26T17:27:15Z","published":"2024-09-26T17:27:15Z","title":"GSON: A Group-based Social Navigation Framework with Large Multimodal\n Model","summary":" As the number of service robots and autonomous vehicles in human-centered\nenvironments grows, their requirements go beyond simply navigating to a\ndestination. They must also take into account dynamic social contexts and\nensure respect and comfort for others in shared spaces, which poses significant\nchallenges for perception and planning. In this paper, we present a group-based\nsocial navigation framework GSON to enable mobile robots to perceive and\nexploit the social group of their surroundings by leveling the visual reasoning\ncapability of the Large Multimodal Model (LMM). For perception, we apply visual\nprompting techniques to zero-shot extract the social relationship among\npedestrians and combine the result with a robust pedestrian detection and\ntracking pipeline to alleviate the problem of low inference speed of the LMM.\nGiven the perception result, the planning system is designed to avoid\ndisrupting the current social structure. We adopt a social structure-based\nmid-level planner as a bridge between global path planning and local motion\nplanning to preserve the global context and reactive response. The proposed\nmethod is validated on real-world mobile robot navigation tasks involving\ncomplex social structure understanding and reasoning. Experimental results\ndemonstrate the effectiveness of the system in these scenarios compared with\nseveral baselines.\n","authors":["Shangyi Luo","Ji Zhu","Peng Sun","Yuhong Deng","Cunjun Yu","Anxing Xiao","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18082v1","updated":"2024-09-26T17:26:16Z","published":"2024-09-26T17:26:16Z","title":"SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language\n Models for Robotic Garment Manipulation","summary":" Automating garment manipulation poses a significant challenge for assistive\nrobotics due to the diverse and deformable nature of garments. Traditional\napproaches typically require separate models for each garment type, which\nlimits scalability and adaptability. In contrast, this paper presents a unified\napproach using vision-language models (VLMs) to improve keypoint prediction\nacross various garment categories. By interpreting both visual and semantic\ninformation, our model enables robots to manage different garment states with a\nsingle model. We created a large-scale synthetic dataset using advanced\nsimulation techniques, allowing scalable training without extensive real-world\ndata. Experimental results indicate that the VLM-based method significantly\nenhances keypoint detection accuracy and task success rates, providing a more\nflexible and general solution for robotic garment manipulation. In addition,\nthis research also underscores the potential of VLMs to unify various garment\nmanipulation tasks within a single framework, paving the way for broader\napplications in home automation and assistive robotics for future.\n","authors":["Xin Li","Siyuan Huang","Qiaojun Yu","Zhengkai Jiang","Ce Hao","Yimeng Zhu","Hongsheng Li","Peng Gao","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18071v1","updated":"2024-09-26T17:18:39Z","published":"2024-09-26T17:18:39Z","title":"FreeEdit: Mask-free Reference-based Image Editing with Multi-modal\n Instruction","summary":" Introducing user-specified visual concepts in image editing is highly\npractical as these concepts convey the user's intent more precisely than\ntext-based descriptions. We propose FreeEdit, a novel approach for achieving\nsuch reference-based image editing, which can accurately reproduce the visual\nconcept from the reference image based on user-friendly language instructions.\nOur approach leverages the multi-modal instruction encoder to encode language\ninstructions to guide the editing process. This implicit way of locating the\nediting area eliminates the need for manual editing masks. To enhance the\nreconstruction of reference details, we introduce the Decoupled Residual\nReferAttention (DRRA) module. This module is designed to integrate fine-grained\nreference features extracted by a detail extractor into the image editing\nprocess in a residual way without interfering with the original self-attention.\nGiven that existing datasets are unsuitable for reference-based image editing\ntasks, particularly due to the difficulty in constructing image triplets that\ninclude a reference image, we curate a high-quality dataset, FreeBench, using a\nnewly developed twice-repainting scheme. FreeBench comprises the images before\nand after editing, detailed editing instructions, as well as a reference image\nthat maintains the identity of the edited object, encompassing tasks such as\nobject addition, replacement, and deletion. By conducting phased training on\nFreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot\nediting through convenient language instructions. We conduct extensive\nexperiments to evaluate the effectiveness of FreeEdit across multiple task\ntypes, demonstrating its superiority over existing methods. The code will be\navailable at: https://freeedit.github.io/.\n","authors":["Runze He","Kai Ma","Linjiang Huang","Shaofei Huang","Jialin Gao","Xiaoming Wei","Jiao Dai","Jizhong Han","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18071v1.pdf","comment":"14 pages, 14 figures, project website: https://freeedit.github.io/"},{"id":"http://arxiv.org/abs/2310.06114v3","updated":"2024-09-26T17:14:09Z","published":"2023-10-09T19:42:22Z","title":"Learning Interactive Real-World Simulators","summary":" Generative models trained on internet data have revolutionized how text,\nimage, and video content can be created. Perhaps the next milestone for\ngenerative models is to simulate realistic experience in response to actions\ntaken by humans, robots, and other interactive agents. Applications of a\nreal-world simulator range from controllable content creation in games and\nmovies, to training embodied agents purely in simulation that can be directly\ndeployed in the real world. We explore the possibility of learning a universal\nsimulator (UniSim) of real-world interaction through generative modeling. We\nfirst make the important observation that natural datasets available for\nlearning a real-world simulator are often rich along different dimensions\n(e.g., abundant objects in image data, densely sampled actions in robotics\ndata, and diverse movements in navigation data). With careful orchestration of\ndiverse datasets, each providing a different aspect of the overall experience,\nwe can simulate the visual outcome of both high-level instructions such as\n\"open the drawer\" and low-level controls from otherwise static scenes and\nobjects. We use the simulator to train both high-level vision-language policies\nand low-level reinforcement learning policies, each of which can be deployed in\nthe real world in zero shot after training purely in simulation. We also show\nthat other types of intelligence such as video captioning models can benefit\nfrom training with simulated experience, opening up even wider applications.\nVideo demos can be found at https://universal-simulator.github.io.\n","authors":["Sherry Yang","Yilun Du","Kamyar Ghasemipour","Jonathan Tompson","Leslie Kaelbling","Dale Schuurmans","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.06114v3.pdf","comment":"https://universal-simulator.github.io"},{"id":"http://arxiv.org/abs/2409.18055v1","updated":"2024-09-26T16:59:01Z","published":"2024-09-26T16:59:01Z","title":"Visual Data Diagnosis and Debiasing with Concept Graphs","summary":" The widespread success of deep learning models today is owed to the curation\nof extensive datasets significant in size and complexity. However, such models\nfrequently pick up inherent biases in the data during the training process,\nleading to unreliable predictions. Diagnosing and debiasing datasets is thus a\nnecessity to ensure reliable model performance. In this paper, we present\nCONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence\nBiases in visual datasets. CONBIAS represents visual datasets as knowledge\ngraphs of concepts, enabling meticulous analysis of spurious concept\nco-occurrences to uncover concept imbalances across the whole dataset.\nMoreover, we show that by employing a novel clique-based concept balancing\nstrategy, we can mitigate these imbalances, leading to enhanced performance on\ndownstream tasks. Extensive experiments show that data augmentation based on a\nbalanced concept distribution augmented by CONBIAS improves generalization\nperformance across multiple datasets compared to state-of-the-art methods. We\nwill make our code and data publicly available.\n","authors":["Rwiddhi Chakraborty","Yinong Wang","Jialu Gao","Runkai Zheng","Cheng Zhang","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2409.18055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18053v1","updated":"2024-09-26T16:58:04Z","published":"2024-09-26T16:58:04Z","title":"DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving","summary":" We present a novel autonomous driving framework, DualAD, designed to imitate\nhuman reasoning during driving. DualAD comprises two layers: a rule-based\nmotion planner at the bottom layer that handles routine driving tasks requiring\nminimal reasoning, and an upper layer featuring a rule-based text encoder that\nconverts driving scenarios from absolute states into text description. This\ntext is then processed by a large language model (LLM) to make driving\ndecisions. The upper layer intervenes in the bottom layer's decisions when\npotential danger is detected, mimicking human reasoning in critical situations.\nClosed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained\nmodel, significantly outperforms rule-based motion planners that lack reasoning\nabilities. Our experiments also highlight the effectiveness of the text\nencoder, which considerably enhances the model's scenario understanding.\nAdditionally, the integrated DualAD model improves with stronger LLMs,\nindicating the framework's potential for further enhancement. We make code and\nbenchmarks publicly available.\n","authors":["Dingrui Wang","Marc Kaufeld","Johannes Betz"],"pdf_url":"https://arxiv.org/pdf/2409.18053v1.pdf","comment":"Autonomous Driving, Large Language Models (LLMs), Human Reasoning,\n Critical Scenario"},{"id":"http://arxiv.org/abs/2409.18052v1","updated":"2024-09-26T16:55:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems - which account for almost all current\nAI - can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborates on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18047v1","updated":"2024-09-26T16:48:21Z","published":"2024-09-26T16:48:21Z","title":"HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams","summary":" This paper presents a novel approach to multi-robot planning and\ncollaboration. We demonstrate a cognitive strategy for robots in human-robot\nteams that incorporates metacognition, natural language communication, and\nexplainability. The system is embodied using the HARMONIC architecture that\nflexibly integrates cognitive and control capabilities across the team. We\nevaluate our approach through simulation experiments involving a joint search\ntask by a team of heterogeneous robots (a UGV and a drone) and a human. We\ndetail the system's handling of complex, real-world scenarios, effective action\ncoordination between robots with different capabilities, and natural\nhuman-robot communication. This work demonstrates that the robots' ability to\nreason about plans, goals, and attitudes, and to provide explanations for\nactions and decisions are essential prerequisites for realistic human-robot\nteaming.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18047v1.pdf","comment":"Submitted to ICRA 2025 Conference, Atlanta, GA, USA"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18037v1","updated":"2024-09-26T16:42:13Z","published":"2024-09-26T16:42:13Z","title":"HARMONIC: A Framework for Explanatory Cognitive Robots","summary":" We present HARMONIC, a framework for implementing cognitive robots that\ntransforms general-purpose robots into trusted teammates capable of complex\ndecision-making, natural communication and human-level explanation. The\nframework supports interoperability between a strategic (cognitive) layer for\nhigh-level decision-making and a tactical (robot) layer for low-level control\nand execution. We describe the core features of the framework and our initial\nimplementation, in which HARMONIC was deployed on a simulated UGV and drone\ninvolved in a multi-robot search and retrieval task.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.18037v1.pdf","comment":"Accepted for presentation at ICRA@40. 23-26 September 2024,\n Rotterdam, Netherlands"},{"id":"http://arxiv.org/abs/2409.16626v2","updated":"2024-09-26T16:41:27Z","published":"2024-09-25T05:11:58Z","title":"Ascend HiFloat8 Format for Deep Learning","summary":" This preliminary white paper proposes a novel 8-bit floating-point data\nformat HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered\nprecision. For normal value encoding, it provides 7 exponent values with 3-bit\nmantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with\n1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7\nextra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades).\nMeanwhile, HiF8 encodes all the special values except that positive zero and\nnegative zero are represented by only one bit-pattern. Thanks to the better\nbalance between precision and dynamic range, HiF8 can be simultaneously used in\nboth forward and backward passes of AI training. In this paper, we will\ndescribe the definition and rounding methods of HiF8, as well as the tentative\ntraining and inference solutions. To demonstrate the efficacy of HiF8, massive\nsimulation results on various neural networks, including traditional neural\nnetworks and large language models (LLMs), will also be presented.\n","authors":["Yuanyong Luo","Zhongxing Zhang","Richard Wu","Hu Liu","Ying Jin","Kai Zheng","Minmin Wang","Zhanying He","Guipeng Hu","Luyao Chen","Tianchi Hu","Junsong Wang","Minqi Chen","Mikhaylov Dmitry","Korviakov Vladimir","Bobrin Maxim","Yuhao Hu","Guanfu Chen","Zeyi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.16626v2.pdf","comment":"13 Pages, 4 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2409.13731v3","updated":"2024-09-26T16:34:35Z","published":"2024-09-10T02:00:28Z","title":"KAG: Boosting LLMs in Professional Domains via Knowledge Augmented\n Generation","summary":" The recently developed retrieval-augmented generation (RAG) technology has\nenabled the efficient construction of domain-specific applications. However, it\nalso has limitations, including the gap between vector similarity and the\nrelevance of knowledge reasoning, as well as insensitivity to knowledge logic,\nsuch as numerical values, temporal relations, expert rules, and others, which\nhinder the effectiveness of professional knowledge services. In this work, we\nintroduce a professional domain knowledge service framework called Knowledge\nAugmented Generation (KAG). KAG is designed to address the aforementioned\nchallenges with the motivation of making full use of the advantages of\nknowledge graph(KG) and vector retrieval, and to improve generation and\nreasoning performance by bidirectionally enhancing large language models (LLMs)\nand KGs through five key aspects: (1) LLM-friendly knowledge representation,\n(2) mutual-indexing between knowledge graphs and original chunks, (3)\nlogical-form-guided hybrid reasoning engine, (4) knowledge alignment with\nsemantic reasoning, and (5) model capability enhancement for KAG. We compared\nKAG with existing RAG methods in multihop question answering and found that it\nsignificantly outperforms state-of-theart methods, achieving a relative\nimprovement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We\nhave successfully applied KAG to two professional knowledge Q&A tasks of Ant\nGroup, including E-Government Q&A and E-Health Q&A, achieving significant\nimprovement in professionalism compared to RAG methods.\n","authors":["Lei Liang","Mengshu Sun","Zhengke Gui","Zhongshu Zhu","Zhouyu Jiang","Ling Zhong","Yuan Qu","Peilong Zhao","Zhongpu Bo","Jin Yang","Huaidong Xiong","Lin Yuan","Jun Xu","Zaoyang Wang","Zhiqiang Zhang","Wen Zhang","Huajun Chen","Wenguang Chen","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.13731v3.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2409.18028v1","updated":"2024-09-26T16:34:35Z","published":"2024-09-26T16:34:35Z","title":"Compositional Hardness of Code in Large Language Models -- A\n Probabilistic Perspective","summary":" A common practice in large language model (LLM) usage for complex analytical\ntasks such as code generation, is to sample a solution for the entire task\nwithin the model's context window. Previous works have shown that subtask\ndecomposition within the model's context (chain of thought), is beneficial for\nsolving such tasks. In this work, we point a limitation of LLMs' ability to\nperform several sub-tasks within the same context window - an in-context\nhardness of composition, pointing to an advantage for distributing a decomposed\nproblem in a multi-agent system of LLMs. The hardness of composition is\nquantified by a generation complexity metric, i.e., the number of LLM\ngenerations required to sample at least one correct solution. We find a gap\nbetween the generation complexity of solving a compositional problem within the\nsame context relative to distributing it among multiple agents, that increases\nexponentially with the solution's length. We prove our results theoretically\nand demonstrate them empirically.\n","authors":["Yotam Wolf","Binyamin Rothberg","Dorin Shteyman","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2409.18028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18017v1","updated":"2024-09-26T16:25:48Z","published":"2024-09-26T16:25:48Z","title":"Transferring disentangled representations: bridging the gap between\n synthetic and real images","summary":" Developing meaningful and efficient representations that separate the\nfundamental structure of the data generation mechanism is crucial in\nrepresentation learning. However, Disentangled Representation Learning has not\nfully shown its potential on real images, because of correlated generative\nfactors, their resolution and limited access to ground truth labels.\nSpecifically on the latter, we investigate the possibility of leveraging\nsynthetic data to learn general-purpose disentangled representations applicable\nto real data, discussing the effect of fine-tuning and what properties of\ndisentanglement are preserved after the transfer. We provide an extensive\nempirical study to address these issues. In addition, we propose a new\ninterpretable intervention-based metric, to measure the quality of factors\nencoding in the representation. Our results indicate that some level of\ndisentanglement, transferring a representation from synthetic to real data, is\npossible and effective.\n","authors":["Jacopo Dapueto","Nicoletta Noceti","Francesca Odone"],"pdf_url":"https://arxiv.org/pdf/2409.18017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18014v1","updated":"2024-09-26T16:22:59Z","published":"2024-09-26T16:22:59Z","title":"Role-RL: Online Long-Context Processing with Role Reinforcement Learning\n for Distinct LLMs in Their Optimal Roles","summary":" Large language models (LLMs) with long-context processing are still\nchallenging because of their implementation complexity, training efficiency and\ndata sparsity. To address this issue, a new paradigm named Online Long-context\nProcessing (OLP) is proposed when we process a document of unlimited length,\nwhich typically occurs in the information reception and organization of diverse\nstreaming media such as automated news reporting, live e-commerce, and viral\nshort videos. Moreover, a dilemma was often encountered when we tried to select\nthe most suitable LLM from a large number of LLMs amidst explosive growth\naiming for outstanding performance, affordable prices, and short response\ndelays. In view of this, we also develop Role Reinforcement Learning (Role-RL)\nto automatically deploy different LLMs in their respective roles within the OLP\npipeline according to their actual performance. Extensive experiments are\nconducted on our OLP-MINI dataset and it is found that OLP with Role-RL\nframework achieves OLP benchmark with an average recall rate of 93.2% and the\nLLM cost saved by 79.4%. The code and dataset are publicly available at:\nhttps://anonymous.4open.science/r/Role-RL.\n","authors":["Lewei He","Tianyu Shi","Pengran Huang","Bingzhi Chen","Qianglong Chen","Jiahui Pan"],"pdf_url":"https://arxiv.org/pdf/2409.18014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18009v1","updated":"2024-09-26T16:19:37Z","published":"2024-09-26T16:19:37Z","title":"Control Industrial Automation System with Large Language Models","summary":" Traditional industrial automation systems require specialized expertise to\noperate and complex reprogramming to adapt to new processes. Large language\nmodels offer the intelligence to make them more flexible and easier to use.\nHowever, LLMs' application in industrial settings is underexplored. This paper\nintroduces a framework for integrating LLMs to achieve end-to-end control of\nindustrial automation systems. At the core of the framework are an agent system\ndesigned for industrial tasks, a structured prompting method, and an\nevent-driven information modeling mechanism that provides real-time data for\nLLM inference. The framework supplies LLMs with real-time events on different\ncontext semantic levels, allowing them to interpret the information, generate\nproduction plans, and control operations on the automation system. It also\nsupports structured dataset creation for fine-tuning on this downstream\napplication of LLMs. Our contribution includes a formal system design,\nproof-of-concept implementation, and a method for generating task-specific\ndatasets for LLM fine-tuning and testing. This approach enables a more adaptive\nautomation system that can respond to spontaneous events, while allowing easier\noperation and configuration through natural language for more intuitive\nhuman-machine interaction. We provide demo videos and detailed data on GitHub:\nhttps://github.com/YuchenXia/LLM4IAS\n","authors":["Yuchen Xia","Nasser Jazdi","Jize Zhang","Chaitanya Shah","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2409.18009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17995v1","updated":"2024-09-26T16:07:20Z","published":"2024-09-26T16:07:20Z","title":"Joint Localization and Planning using Diffusion","summary":" Diffusion models have been successfully applied to robotics problems such as\nmanipulation and vehicle path planning. In this work, we explore their\napplication to end-to-end navigation -- including both perception and planning\n-- by considering the problem of jointly performing global localization and\npath planning in known but arbitrary 2D environments. In particular, we\nintroduce a diffusion model which produces collision-free paths in a global\nreference frame given an egocentric LIDAR scan, an arbitrary map, and a desired\ngoal position. To this end, we implement diffusion in the space of paths in\nSE(2), and describe how to condition the denoising process on both obstacles\nand sensor observations. In our evaluation, we show that the proposed\nconditioning techniques enable generalization to realistic maps of considerably\ndifferent appearance than the training environment, demonstrate our model's\nability to accurately describe ambiguous solutions, and run extensive\nsimulation experiments showcasing our model's use as a real-time, end-to-end\nlocalization and planning stack.\n","authors":["L. Lao Beyer","S. Karaman"],"pdf_url":"https://arxiv.org/pdf/2409.17995v1.pdf","comment":"7 pages, 9 figures. Submitted to ICRA 2025, under review"},{"id":"http://arxiv.org/abs/2409.17994v1","updated":"2024-09-26T16:06:38Z","published":"2024-09-26T16:06:38Z","title":"CRoP: Context-wise Robust Static Human-Sensing Personalization","summary":" The advancement in deep learning and internet-of-things have led to diverse\nhuman sensing applications. However, distinct patterns in human sensing,\ninfluenced by various factors or contexts, challenge generic neural network\nmodel's performance due to natural distribution shifts. To address this,\npersonalization tailors models to individual users. Yet most personalization\nstudies overlook intra-user heterogeneity across contexts in sensory data,\nlimiting intra-user generalizability. This limitation is especially critical in\nclinical applications, where limited data availability hampers both\ngeneralizability and personalization. Notably, intra-user sensing attributes\nare expected to change due to external factors such as treatment progression,\nfurther complicating the challenges.This work introduces CRoP, a novel static\npersonalization approach using an off-the-shelf pre-trained model and pruning\nto optimize personalization and generalization. CRoP shows superior\npersonalization effectiveness and intra-user robustness across four\nhuman-sensing datasets, including two from real-world health domains,\nhighlighting its practical and social impact. Additionally, to support CRoP's\ngeneralization ability and design choices, we provide empirical justification\nthrough gradient inner product analysis, ablation studies, and comparisons\nagainst state-of-the-art baselines.\n","authors":["Sawinder Kaur","Avery Gump","Jingyu Xin","Yi Xiao","Harshit Sharma","Nina R Benway","Jonathan L Preston","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2409.17994v1.pdf","comment":"31 pages, 10 figues and 13 tables"},{"id":"http://arxiv.org/abs/2409.16427v2","updated":"2024-09-26T15:56:08Z","published":"2024-09-24T19:47:21Z","title":"HAICOSYSTEM: An Ecosystem for Sandboxing Safety Risks in Human-AI\n Interactions","summary":" AI agents are increasingly autonomous in their interactions with human users\nand tools, leading to increased interactional safety risks. We present\nHAICOSYSTEM, a framework examining AI agent safety within diverse and complex\nsocial interactions. HAICOSYSTEM features a modular sandbox environment that\nsimulates multi-turn interactions between human users and AI agents, where the\nAI agents are equipped with a variety of tools (e.g., patient management\nplatforms) to navigate diverse scenarios (e.g., a user attempting to access\nother patients' profiles). To examine the safety of AI agents in these\ninteractions, we develop a comprehensive multi-dimensional evaluation framework\nthat uses metrics covering operational, content-related, societal, and legal\nrisks. Through running 1840 simulations based on 92 scenarios across seven\ndomains (e.g., healthcare, finance, education), we demonstrate that HAICOSYSTEM\ncan emulate realistic user-AI interactions and complex tool use by AI agents.\nOur experiments show that state-of-the-art LLMs, both proprietary and\nopen-sourced, exhibit safety risks in over 50\\% cases, with models generally\nshowing higher risks when interacting with simulated malicious users. Our\nfindings highlight the ongoing challenge of building agents that can safely\nnavigate complex interactions, particularly when faced with malicious users. To\nfoster the AI agent safety ecosystem, we release a code platform that allows\npractitioners to create custom scenarios, simulate interactions, and evaluate\nthe safety and performance of their agents.\n","authors":["Xuhui Zhou","Hyunwoo Kim","Faeze Brahman","Liwei Jiang","Hao Zhu","Ximing Lu","Frank Xu","Bill Yuchen Lin","Yejin Choi","Niloofar Mireshghallah","Ronan Le Bras","Maarten Sap"],"pdf_url":"https://arxiv.org/pdf/2409.16427v2.pdf","comment":"Both the second and third authors contributed equally"},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14950v2","updated":"2024-09-26T15:45:13Z","published":"2023-12-08T15:57:18Z","title":"TypeFly: Flying Drones with Large Language Model","summary":" Recent advancements in robot control using large language models (LLMs) have\ndemonstrated significant potential, primarily due to LLMs' capabilities to\nunderstand natural language commands and generate executable plans in various\nlanguages. However, in real-time and interactive applications involving mobile\nrobots, particularly drones, the sequential token generation process inherent\nto LLMs introduces substantial latency, i.e. response time, in control plan\ngeneration.\n In this paper, we present a system called ChatFly that tackles this problem\nusing a combination of a novel programming language called MiniSpec and its\nruntime to reduce the plan generation time and drone response time. That is,\ninstead of asking an LLM to write a program (robotic plan) in the popular but\nverbose Python, ChatFly gets it to do it in MiniSpec specially designed for\ntoken efficiency and stream interpretation. Using a set of challenging drone\ntasks, we show that design choices made by ChatFly can reduce up to 62%\nresponse time and provide a more consistent user experience, enabling\nresponsive and intelligent LLM-based drone control with efficient completion.\n","authors":["Guojun Chen","Xiaojing Yu","Neiwen Ling","Lin Zhong"],"pdf_url":"https://arxiv.org/pdf/2312.14950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17954v1","updated":"2024-09-26T15:30:54Z","published":"2024-09-26T15:30:54Z","title":"Enhancing elusive clues in knowledge learning by contrasting attention\n of language models","summary":" Causal language models acquire vast amount of knowledge from general text\ncorpus during pretraining, but the efficiency of knowledge learning is known to\nbe unsatisfactory, especially when learning from knowledge-dense and\nsmall-sized corpora. The deficiency can come from long-distance dependencies\nwhich are hard to capture by language models, and overfitting to co-occurrence\npatterns and distracting clues in the training text. To address these issues,\nthe paper proposes a method to enhance knowledge learning during language model\npretraining, by enhancing elusive but important clues in text discovered by the\nlanguage model themselves. We found that larger language models pay more\nattention to non-obvious but important clues, which are often overlooked by\nsmaller language models. Therefore, we can identify these clues by contrasting\nthe attention weights of large and small language models. We use the identified\nclues as a guide to perform token-dropout data augmentation on the training\ntext, and observed a significant boost in both small and large models'\nperformance in fact memorization. This shows that the behavior contrast between\nmore and less-performant language models contains important clues for knowledge\nlearning, and it can be ``amplified\" for a straight-forward improvement in\nknowledge learning efficiency.\n","authors":["Jian Gao","Xiao Zhang","Ji Wu","Miao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17954v1.pdf","comment":"7 pages and 17 figures"},{"id":"http://arxiv.org/abs/2312.14115v4","updated":"2024-09-26T15:30:00Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Visual Question Answering for Autonomous Driving","summary":" We introduce LingoQA, a novel dataset and benchmark for visual question\nanswering in autonomous driving. The dataset contains 28K unique short video\nscenarios, and 419K annotations. Evaluating state-of-the-art vision-language\nmodels on our benchmark shows that their performance is below human\ncapabilities, with GPT-4V responding truthfully to 59.6% of the questions\ncompared to 96.6% for humans. For evaluation, we propose a truthfulness\nclassifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation\ncoefficient to human evaluations, surpassing existing techniques like METEOR,\nBLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run\nextensive ablation studies to understand its performance. We release our\ndataset and benchmark as an evaluation platform for vision-language models in\nautonomous driving.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v4.pdf","comment":"Accepted to ECCV 2024. Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14500v2","updated":"2024-09-26T15:26:43Z","published":"2024-09-22T15:53:19Z","title":"TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with\n Tabular Node Features","summary":" Tabular machine learning is an important field for industry and science. In\nthis field, table rows are usually treated as independent data samples, but\nadditional information about relations between them is sometimes available and\ncan be used to improve predictive performance. Such information can be\nnaturally modeled with a graph, thus tabular machine learning may benefit from\ngraph machine learning methods. However, graph machine learning models are\ntypically evaluated on datasets with homogeneous node features, which have\nlittle in common with heterogeneous mixtures of numerical and categorical\nfeatures present in tabular datasets. Thus, there is a critical difference\nbetween the data used in tabular and graph machine learning studies, which does\nnot allow one to understand how successfully graph models can be transferred to\ntabular data. To bridge this gap, we propose a new benchmark of diverse graphs\nwith heterogeneous tabular node features and realistic prediction tasks. We use\nthis benchmark to evaluate a vast set of models, including simple methods\npreviously overlooked in the literature. Our experiments show that graph neural\nnetworks (GNNs) can indeed often bring gains in predictive performance for\ntabular data, but standard tabular models also can be adapted to work with\ngraph data by using simple feature preprocessing, which sometimes enables them\nto compete with and even outperform GNNs. Based on our empirical study, we\nprovide insights for researchers and practitioners in both tabular and graph\nmachine learning fields.\n","authors":["Gleb Bazhenov","Oleg Platonov","Liudmila Prokhorenkova"],"pdf_url":"https://arxiv.org/pdf/2409.14500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17946v1","updated":"2024-09-26T15:20:37Z","published":"2024-09-26T15:20:37Z","title":"Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge\n Distillation","summary":" Despite being widely applied due to their exceptional capabilities, Large\nLanguage Models (LLMs) have been proven to be vulnerable to backdoor attacks.\nThese attacks introduce targeted vulnerabilities into LLMs by poisoning\ntraining samples and full-parameter fine-tuning. However, this kind of backdoor\nattack is limited since they require significant computational resources,\nespecially as the size of LLMs increases. Besides, parameter-efficient\nfine-tuning (PEFT) offers an alternative but the restricted parameter updating\nmay impede the alignment of triggers with target labels. In this study, we\nfirst verify that backdoor attacks with PEFT may encounter challenges in\nachieving feasible performance. To address these issues and improve the\neffectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack\nalgorithm from weak to strong based on contrastive knowledge distillation\n(W2SAttack). Specifically, we poison small-scale language models through\nfull-parameter fine-tuning to serve as the teacher model. The teacher model\nthen covertly transfers the backdoor to the large-scale student model through\ncontrastive knowledge distillation, which employs PEFT. Theoretical analysis\nreveals that W2SAttack has the potential to augment the effectiveness of\nbackdoor attacks. We demonstrate the superior performance of W2SAttack on\nclassification tasks across four language models, four backdoor attack\nalgorithms, and two different architectures of teacher models. Experimental\nresults indicate success rates close to 100% for backdoor attacks targeting\nPEFT.\n","authors":["Shuai Zhao","Leilei Gan","Zhongliang Guo","Xiaobao Wu","Luwei Xiao","Xiaoyu Xu","Cong-Duy Nguyen","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2409.17946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17931v1","updated":"2024-09-26T15:08:38Z","published":"2024-09-26T15:08:38Z","title":"Intelligent Energy Management: Remaining Useful Life Prediction and\n Charging Automation System Comprised of Deep Learning and the Internet of\n Things","summary":" Remaining Useful Life (RUL) of battery is an important parameter to know the\nbattery's remaining life and need for recharge. The goal of this research\nproject is to develop machine learning-based models for the battery RUL\ndataset. Different ML models are developed to classify the RUL of the vehicle,\nand the IoT (Internet of Things) concept is simulated for automating the\ncharging system and managing any faults aligning. The graphs plotted depict the\nrelationship between various vehicle parameters using the Blynk IoT platform.\nResults show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent\nUnit (GRU), and hybrid model developed could classify RUL into three classes\nwith 99% more accuracy. The data is fed using the tkinter GUI for simulating\nartificial intelligence (AI)-based charging, and with a pyserial backend, data\ncan be entered into the Esp-32 microcontroller for making charge discharge\npossible with the model's predictions. Also, with an IoT system, the charging\ncan be disconnected, monitored, and analyzed for automation. The results show\nthat an accuracy of 99% can be obtained on models MLP, catboost model and\nsimilar accuracy on GRU model can be obtained, and finally relay-based\ntriggering can be made by prediction through the model used for automating the\ncharging and energy-saving mechanism. By showcasing an exemplary Blynk\nplatform-based monitoring and automation phenomenon, we further present\ninnovative ways of monitoring parameters and automating the system.\n","authors":["Biplov Paneru","Bishwash Paneru","DP Sharma Mainali"],"pdf_url":"https://arxiv.org/pdf/2409.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17928v1","updated":"2024-09-26T15:07:30Z","published":"2024-09-26T15:07:30Z","title":"Pioneering Reliable Assessment in Text-to-Image Knowledge Editing:\n Leveraging a Fine-Grained Dataset and an Innovative Criterion","summary":" During pre-training, the Text-to-Image (T2I) diffusion models encode factual\nknowledge into their parameters. These parameterized facts enable realistic\nimage generation, but they may become obsolete over time, thereby\nmisrepresenting the current state of the world. Knowledge editing techniques\naim to update model knowledge in a targeted way. However, facing the dual\nchallenges posed by inadequate editing datasets and unreliable evaluation\ncriterion, the development of T2I knowledge editing encounter difficulties in\neffectively generalizing injected knowledge. In this work, we design a T2I\nknowledge editing framework by comprehensively spanning on three phases: First,\nwe curate a dataset \\textbf{CAKE}, comprising paraphrase and multi-object test,\nto enable more fine-grained assessment on knowledge generalization. Second, we\npropose a novel criterion, \\textbf{adaptive CLIP threshold}, to effectively\nfilter out false successful images under the current criterion and achieve\nreliable editing evaluation. Finally, we introduce \\textbf{MPE}, a simple but\neffective approach for T2I knowledge editing. Instead of tuning parameters, MPE\nprecisely recognizes and edits the outdated part of the conditioning\ntext-prompt to accommodate the up-to-date knowledge. A straightforward\nimplementation of MPE (Based on in-context learning) exhibits better overall\nperformance than previous model editors. We hope these efforts can further\npromote faithful evaluation of T2I knowledge editing methods.\n","authors":["Hengrui Gu","Kaixiong Zhou","Yili Wang","Ruobing Wang","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17928v1.pdf","comment":"EMNLP24 Findings"},{"id":"http://arxiv.org/abs/2409.17922v1","updated":"2024-09-26T15:05:15Z","published":"2024-09-26T15:05:15Z","title":"Navigation in a simplified Urban Flow through Deep Reinforcement\n Learning","summary":" The increasing number of unmanned aerial vehicles (UAVs) in urban\nenvironments requires a strategy to minimize their environmental impact, both\nin terms of energy efficiency and noise reduction. In order to reduce these\nconcerns, novel strategies for developing prediction models and optimization of\nflight planning, for instance through deep reinforcement learning (DRL), are\nneeded. Our goal is to develop DRL algorithms capable of enabling the\nautonomous navigation of UAVs in urban environments, taking into account the\npresence of buildings and other UAVs, optimizing the trajectories in order to\nreduce both energetic consumption and noise. This is achieved using fluid-flow\nsimulations which represent the environment in which UAVs navigate and training\nthe UAV as an agent interacting with an urban environment. In this work, we\nconsider a domain domain represented by a two-dimensional flow field with\nobstacles, ideally representing buildings, extracted from a three-dimensional\nhigh-fidelity numerical simulation. The presented methodology, using PPO+LSTM\ncells, was validated by reproducing a simple but fundamental problem in\nnavigation, namely the Zermelo's problem, which deals with a vessel navigating\nin a turbulent flow, travelling from a starting point to a target location,\noptimizing the trajectory. The current method shows a significant improvement\nwith respect to both a simple PPO and a TD3 algorithm, with a success rate (SR)\nof the PPO+LSTM trained policy of 98.7%, and a crash rate (CR) of 0.1%,\noutperforming both PPO (SR = 75.6%, CR=18.6%) and TD3 (SR=77.4% and CR=14.5%).\nThis is the first step towards DRL strategies which will guide UAVs in a\nthree-dimensional flow field using real-time signals, making the navigation\nefficient in terms of flight time and avoiding damages to the vehicle.\n","authors":["Federica Tonti","Jean Rabault","Ricardo Vinuesa"],"pdf_url":"https://arxiv.org/pdf/2409.17922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15228v3","updated":"2024-09-26T14:57:52Z","published":"2024-09-23T17:22:09Z","title":"A Comprehensive Framework for Evaluating API-oriented Code Generation in\n Large Language Models","summary":" Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as\npowerful tools for code generation, significantly enhancing productivity and\naccelerating software development. However, existing benchmarks primarily focus\non general code generation without considering API-oriented code generation,\ni.e., generating code that invokes APIs from specific libraries. Given the\ngrowing demand for API-oriented code generation, there is a pressing need for a\nsystematic and automated approach to evaluate LLM on API-oriented code\ngeneration. To address this gap, we propose AutoAPIEval, a lightweight and\nautomated framework designed to evaluate the capabilities of LLMs in\nAPI-oriented code generation. Our framework works with any library that\nprovides API documentation and focuses on two unit tasks: API recommendation\nand code example generation, along with four metrics to evaluate the generated\nAPIs and code examples, such as the proportion of incorrect API recommendations\nfor Task 1, and the proportion of code examples where no specific API is\ninvoked and uncompilable/unexecutable code examples for Task 2. In addition, we\nconducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder)\nand Java Runtime Environment 8 to demonstrate the framework's effectiveness.\nOur findings reveal substantial variability in LLM performance across tasks,\nwith ChatGPT adhering better to instructions, while sharing similar\neffectiveness in code example generation with its counterparts (i.e., MagiCoder\nand DeekSeek Coder). We also identify key factors associated with code quality,\nsuch as API popularity and model confidence, and build classifiers that achieve\nhigh accuracy in detecting incorrect API recommendations and erroneous code\nexamples. Retrieval-augmented generation enhances the quality of code generated\nby LLMs, though its effectiveness varies across different LLMs.\n","authors":["Yixi Wu","Pengfei He","Zehao Wang","Shaowei Wang","Yuan Tian","Tse-Hsun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.15228v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17907v1","updated":"2024-09-26T14:52:51Z","published":"2024-09-26T14:52:51Z","title":"PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR","summary":" LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous\ndriving, offering precise 3D spatial information. Previous signal attacks\nagainst LiDAR systems mainly exploit laser signals. In this paper, we\ninvestigate the possibility of cross-modality signal injection attacks, i.e.,\ninjecting intentional electromagnetic interference (IEMI) to manipulate LiDAR\noutput. Our insight is that the internal modules of a LiDAR, i.e., the laser\nreceiving circuit, the monitoring sensors, and the beam-steering modules, even\nwith strict electromagnetic compatibility (EMC) testing, can still couple with\nthe IEMI attack signals and result in the malfunction of LiDAR systems. Based\non the above attack surfaces, we propose the PhantomLiDAR attack, which\nmanipulates LiDAR output in terms of Points Interference, Points Injection,\nPoints Removal, and even LiDAR Power-Off. We evaluate and demonstrate the\neffectiveness of PhantomLiDAR with both simulated and real-world experiments on\nfive COTS LiDAR systems. We also conduct feasibility experiments in real-world\nmoving scenarios. We provide potential defense measures that can be implemented\nat both the sensor level and the vehicle system level to mitigate the risks\nassociated with IEMI attacks. Video demonstrations can be viewed at\nhttps://sites.google.com/view/phantomlidar.\n","authors":["Zizhi Jin","Qinhong Jiang","Xuancun Lu","Chen Yan","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17904v1","updated":"2024-09-26T14:51:40Z","published":"2024-09-26T14:51:40Z","title":"Learning to Love Edge Cases in Formative Math Assessment: Using the\n AMMORE Dataset and Chain-of-Thought Prompting to Improve Grading Accuracy","summary":" This paper introduces AMMORE, a new dataset of 53,000 math open-response\nquestion-answer pairs from Rori, a learning platform used by students in\nseveral African countries and conducts two experiments to evaluate the use of\nlarge language models (LLM) for grading particularly challenging student\nanswers. The AMMORE dataset enables various potential analyses and provides an\nimportant resource for researching student math acquisition in understudied,\nreal-world, educational contexts. In experiment 1 we use a variety of\nLLM-driven approaches, including zero-shot, few-shot, and chain-of-thought\nprompting, to grade the 1% of student answers that a rule-based classifier\nfails to grade accurately. We find that the best-performing approach --\nchain-of-thought prompting -- accurately scored 92% of these edge cases,\neffectively boosting the overall accuracy of the grading from 98.7% to 99.9%.\nIn experiment 2, we aim to better understand the consequential validity of the\nimproved grading accuracy, by passing grades generated by the best-performing\nLLM-based approach to a Bayesian Knowledge Tracing (BKT) model, which estimated\nstudent mastery of specific lessons. We find that relatively modest\nimprovements in model accuracy at the individual question level can lead to\nsignificant changes in the estimation of student mastery. Where the rules-based\nclassifier currently used to grade student, answers misclassified the mastery\nstatus of 6.9% of students across their completed lessons, using the LLM\nchain-of-thought approach this misclassification rate was reduced to 2.6% of\nstudents. Taken together, these findings suggest that LLMs could be a valuable\ntool for grading open-response questions in K-12 mathematics education,\npotentially enabling encouraging wider adoption of open-ended questions in\nformative assessment.\n","authors":["Owen Henkel","Hannah Horne-Robinson","Maria Dyshel","Nabil Ch","Baptiste Moreau-Pernet","Ralph Abood"],"pdf_url":"https://arxiv.org/pdf/2409.17904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09577v2","updated":"2024-09-26T14:34:53Z","published":"2024-04-15T08:38:43Z","title":"Transformers, Contextualism, and Polysemy","summary":" The transformer architecture, introduced by Vaswani et al. (2017), is at the\nheart of the remarkable recent progress in the development of language models,\nincluding widely-used chatbots such as Chat-GPT and Claude. In this paper, I\nargue that we can extract from the way the transformer architecture works a\ntheory of the relationship between context and meaning. I call this the\ntransformer theory, and I argue that it is novel with regard to two related\nphilosophical debates: the contextualism debate regarding the extent of\ncontext-sensitivity across natural language, and the polysemy debate regarding\nhow polysemy should be captured within an account of word meaning.\n","authors":["Jumbly Grindrod"],"pdf_url":"https://arxiv.org/pdf/2404.09577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10588v5","updated":"2024-09-26T14:34:06Z","published":"2024-09-16T14:56:27Z","title":"Opponent Shaping for Antibody Development","summary":" Anti-viral therapies are typically designed to target the current strains of\na virus. Game theoretically, this corresponds to a short-sighted, or myopic,\nresponse. However, therapy-induced selective pressures act on viral antigens to\ndrive the emergence of mutated strains, against which initial therapies have\nreduced efficacy. Building on a computational model of binding between\nantibodies and viral antigens (the Absolut! framework), we design and implement\na genetic simulation of such viral evolutionary escape. Crucially, this allows\nour antibody optimisation algorithm to consider and influence the entire escape\ncurve of the virus, i.e. to guide (or ''shape'') the viral evolution. This is\ninspired by opponent shaping which, in general-sum learning, accounts for the\nadaptation of the co-player rather than playing a myopic best response. Hence\nwe call the optimised antibodies shapers. Within our simulations, we\ndemonstrate that our shapers target both current and simulated future viral\nvariants, outperforming the antibodies chosen in a myopic way. Furthermore, we\nshow that shapers exert specific evolutionary pressure on the virus compared to\nmyopic antibodies. Altogether, shapers modify the evolutionary trajectories of\nviral strains and minimise the viral escape compared to their myopic\ncounterparts. While this is a simplified model, we hope that our proposed\nparadigm will enable the discovery of better long-lived vaccines and antibody\ntherapies in the future, enabled by rapid advancements in the capabilities of\nsimulation tools. Our code is available at\nhttps://github.com/olakalisz/antibody-shapers.\n","authors":["Sebastian Towers","Aleksandra Kalisz","Philippe A. Robert","Alicia Higueruelo","Francesca Vianello","Ming-Han Chloe Tsai","Harrison Steel","Jakob N. Foerster"],"pdf_url":"https://arxiv.org/pdf/2409.10588v5.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.17876v1","updated":"2024-09-26T14:23:44Z","published":"2024-09-26T14:23:44Z","title":"Why Companies \"Democratise\" Artificial Intelligence: The Case of Open\n Source Software Donations","summary":" Companies claim to \"democratise\" artificial intelligence (AI) when they\ndonate AI open source software (OSS) to non-profit foundations or release AI\nmodels, among others, but what does this term mean and why do they do it? As\nthe impact of AI on society and the economy grows, understanding the commercial\nincentives behind AI democratisation efforts is crucial for ensuring these\nefforts serve broader interests beyond commercial agendas. Towards this end,\nthis study employs a mixed-methods approach to investigate commercial\nincentives for 43 AI OSS donations to the Linux Foundation. It makes\ncontributions to both research and practice. It contributes a taxonomy of both\nindividual and organisational social, economic, and technological incentives\nfor AI democratisation. In particular, it highlights the role of democratising\nthe governance and control rights of an OSS project (i.e., from one company to\nopen governance) as a structural enabler for downstream goals, such as\nattracting external contributors, reducing development costs, and influencing\nindustry standards, among others. Furthermore, OSS donations are often\nchampioned by individual developers within companies, highlighting the\nimportance of the bottom-up incentives for AI democratisation. The taxonomy\nprovides a framework and toolkit for discerning incentives for other AI\ndemocratisation efforts, such as the release of AI models. The paper concludes\nwith a discussion of future research directions.\n","authors":["Cailean Osborne"],"pdf_url":"https://arxiv.org/pdf/2409.17876v1.pdf","comment":"30 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2310.01807v2","updated":"2024-09-26T14:21:10Z","published":"2023-10-03T05:40:56Z","title":"Discrete, compositional, and symbolic representations through attractor\n dynamics","summary":" Symbolic systems are powerful frameworks for modeling cognitive processes as\nthey encapsulate the rules and relationships fundamental to many aspects of\nhuman reasoning and behavior. Central to these models are systematicity,\ncompositionality, and productivity, making them invaluable in both cognitive\nscience and artificial intelligence. However, certain limitations remain. For\ninstance, the integration of structured symbolic processes and latent\nsub-symbolic processes has been implemented at the computational level through\nfiat methods such as quantization or softmax sampling, which assume, rather\nthan derive, the operations underpinning discretization and symbolicization. In\nthis work, we introduce a novel neural stochastic dynamical systems model that\nintegrates attractor dynamics with symbolic representations to model cognitive\nprocesses akin to the probabilistic language of thought (PLoT). Our model\nsegments the continuous representational space into discrete basins, with\nattractor states corresponding to symbolic sequences, that reflect the\nsemanticity and compositionality characteristic of symbolic systems through\nunsupervised learning, rather than relying on pre-defined primitives. Moreover,\nlike PLoT, our model learns to sample a diverse distribution of attractor\nstates that reflect the mutual information between the input data and the\nsymbolic encodings. This approach establishes a unified framework that\nintegrates both symbolic and sub-symbolic processing through neural dynamics, a\nneuro-plausible substrate with proven expressivity in AI, offering a more\ncomprehensive model that mirrors the complex duality of cognitive operations.\n","authors":["Andrew Nam","Eric Elmoznino","Nikolay Malkin","James McClelland","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2310.01807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17874v1","updated":"2024-09-26T14:20:14Z","published":"2024-09-26T14:20:14Z","title":"DarkSAM: Fooling Segment Anything Model to Segment Nothing","summary":" Segment Anything Model (SAM) has recently gained much attention for its\noutstanding generalization to unseen data and tasks. Despite its promising\nprospect, the vulnerabilities of SAM, especially to universal adversarial\nperturbation (UAP) have not been thoroughly investigated yet. In this paper, we\npropose DarkSAM, the first prompt-free universal attack framework against SAM,\nincluding a semantic decoupling-based spatial attack and a texture\ndistortion-based frequency attack. We first divide the output of SAM into\nforeground and background. Then, we design a shadow target strategy to obtain\nthe semantic blueprint of the image as the attack target. DarkSAM is dedicated\nto fooling SAM by extracting and destroying crucial object features from images\nin both spatial and frequency domains. In the spatial domain, we disrupt the\nsemantics of both the foreground and background in the image to confuse SAM. In\nthe frequency domain, we further enhance the attack effectiveness by distorting\nthe high-frequency components (i.e., texture information) of the image.\nConsequently, with a single UAP, DarkSAM renders SAM incapable of segmenting\nobjects across diverse images with varying prompts. Experimental results on\nfour datasets for SAM and its two variant models demonstrate the powerful\nattack capability and transferability of DarkSAM.\n","authors":["Ziqi Zhou","Yufei Song","Minghui Li","Shengshan Hu","Xianlong Wang","Leo Yu Zhang","Dezhong Yao","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2409.17874v1.pdf","comment":"This paper has been accepted by the 38th Annual Conference on Neural\n Information Processing Systems (NeurIPS'24)"},{"id":"http://arxiv.org/abs/2409.17870v1","updated":"2024-09-26T14:17:58Z","published":"2024-09-26T14:17:58Z","title":"Efficient Arbitrary Precision Acceleration for Large Language Models on\n GPU Tensor Cores","summary":" Large language models (LLMs) have been widely applied but face challenges in\nefficient inference. While quantization methods reduce computational demands,\nultra-low bit quantization with arbitrary precision is hindered by limited GPU\nTensor Core support and inefficient memory management, leading to suboptimal\nacceleration. To address these challenges, we propose a comprehensive\nacceleration scheme for arbitrary precision LLMs. At its core, we introduce a\nnovel bipolar-INT data format that facilitates parallel computing and supports\nsymmetric quantization, effectively reducing data redundancy. Building on this,\nwe implement an arbitrary precision matrix multiplication scheme that\ndecomposes and recovers matrices at the bit level, enabling flexible precision\nwhile maximizing GPU Tensor Core utilization. Furthermore, we develop an\nefficient matrix preprocessing method that optimizes data layout for subsequent\ncomputations. Finally, we design a data recovery-oriented memory management\nsystem that strategically utilizes fast shared memory, significantly enhancing\nkernel execution speed and minimizing memory access latency. Experimental\nresults demonstrate our approach's effectiveness, with up to 13\\times speedup\nin matrix multiplication compared to NVIDIA's CUTLASS. When integrated into\nLLMs, we achieve up to 6.7\\times inference acceleration. These improvements\nsignificantly enhance LLM inference efficiency, enabling broader and more\nresponsive applications of LLMs.\n","authors":["Shaobo Ma","Chao Fang","Haikuo Shao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05208v3","updated":"2024-09-26T14:16:01Z","published":"2023-10-08T15:49:36Z","title":"ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot\n Coordination","summary":" Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement\nlearning (MARL) challenge that aims to train an ego agent to work with diverse,\nunseen partners during deployment. The significant difference between the\ndeployment-time partners' distribution and the training partners' distribution\ndetermined by the training algorithm makes ZSC a unique out-of-distribution\n(OOD) generalization challenge. The potential distribution gap between\nevaluation and deployment-time partners leads to inadequate evaluation, which\nis exacerbated by the lack of appropriate evaluation metrics. In this paper, we\npresent ZSC-Eval, the first evaluation toolkit and benchmark for ZSC\nalgorithms. ZSC-Eval consists of: 1) Generation of evaluation partner\ncandidates through behavior-preferring rewards to approximate deployment-time\npartners' distribution; 2) Selection of evaluation partners by Best-Response\nDiversity (BR-Div); 3) Measurement of generalization performance with various\nevaluation partners via the Best-Response Proximity (BR-Prox) metric. We use\nZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football\nenvironments and get novel empirical findings. We also conduct a human\nexperiment of current ZSC algorithms to verify the ZSC-Eval's consistency with\nhuman evaluation. ZSC-Eval is now available at\nhttps://github.com/sjtu-marl/ZSC-Eval.\n","authors":["Xihuai Wang","Shao Zhang","Wenhao Zhang","Wentao Dong","Jingxiao Chen","Ying Wen","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.05208v3.pdf","comment":"Accepted in NeurIPS 2024 Dataset and Benchmark Track"},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17841v1","updated":"2024-09-26T13:45:36Z","published":"2024-09-26T13:45:36Z","title":"Machine Learning-based vs Deep Learning-based Anomaly Detection in\n Multivariate Time Series for Spacecraft Attitude Sensors","summary":" In the framework of Failure Detection, Isolation and Recovery (FDIR) on\nspacecraft, new AI-based approaches are emerging in the state of the art to\novercome the limitations commonly imposed by traditional threshold checking.\n The present research aims at characterizing two different approaches to the\nproblem of stuck values detection in multivariate time series coming from\nspacecraft attitude sensors. The analysis reveals the performance differences\nin the two approaches, while commenting on their interpretability and\ngeneralization to different scenarios.\n","authors":["R. Gallon","F. Schiemenz","A. Krstova","A. Menicucci","E. Gill"],"pdf_url":"https://arxiv.org/pdf/2409.17841v1.pdf","comment":"Accepted for the ESA SPAICE Conference 2024"},{"id":"http://arxiv.org/abs/2409.17840v1","updated":"2024-09-26T13:44:22Z","published":"2024-09-26T13:44:22Z","title":"Detecting and Measuring Confounding Using Causal Mechanism Shifts","summary":" Detecting and measuring confounding effects from data is a key challenge in\ncausal inference. Existing methods frequently assume causal sufficiency,\ndisregarding the presence of unobserved confounding variables. Causal\nsufficiency is both unrealistic and empirically untestable. Additionally,\nexisting methods make strong parametric assumptions about the underlying causal\ngenerative process to guarantee the identifiability of confounding variables.\nRelaxing the causal sufficiency and parametric assumptions and leveraging\nrecent advancements in causal discovery and confounding analysis with\nnon-i.i.d. data, we propose a comprehensive approach for detecting and\nmeasuring confounding. We consider various definitions of confounding and\nintroduce tailored methodologies to achieve three objectives: (i) detecting and\nmeasuring confounding among a set of variables, (ii) separating observed and\nunobserved confounding effects, and (iii) understanding the relative strengths\nof confounding bias between different sets of variables. We present useful\nproperties of a confounding measure and present measures that satisfy those\nproperties. Empirical results support the theoretical analysis.\n","authors":["Abbavaram Gowtham Reddy","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2409.17840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17836v1","updated":"2024-09-26T13:38:33Z","published":"2024-09-26T13:38:33Z","title":"Language Models as Zero-shot Lossless Gradient Compressors: Towards\n General Neural Parameter Prior Models","summary":" Despite the widespread use of statistical prior models in various fields,\nsuch models for neural network gradients have long been overlooked. The\ninherent challenge stems from their high-dimensional structures and complex\ninterdependencies, which complicate effective modeling. In this work, we\ndemonstrate the potential of large language models (LLMs) to act as gradient\npriors in a zero-shot setting. We examine the property by considering lossless\ngradient compression -- a critical application in distributed learning -- that\ndepends heavily on precise probability modeling. To achieve this, we introduce\nLM-GC, a novel method that integrates LLMs with arithmetic coding. Our\ntechnique converts plain gradients into text-like formats, enhancing token\nefficiency by up to 38 times compared to their plain representations. We ensure\nthat this data conversion maintains a close alignment with the structure of\nplain gradients and the symbols commonly recognized by LLMs. Our experiments\nindicate that LM-GC surpasses existing state-of-the-art lossless compression\nmethods, improving compression rates by 10\\% up to 17.2\\% across various\ndatasets and architectures. Additionally, our approach shows promising\ncompatibility with lossy compression techniques such as quantization and\nsparsification. These findings highlight the significant potential of LLMs as a\nmodel for effectively handling gradients. We will release the source code upon\npublication.\n","authors":["Hui-Po Wang","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2409.17836v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2305.01899v2","updated":"2024-09-26T13:34:35Z","published":"2023-05-03T05:16:54Z","title":"Empowering Agrifood System with Artificial Intelligence: A Survey of the\n Progress, Challenges and Opportunities","summary":" With the world population rapidly increasing, transforming our agrifood\nsystems to be more productive, efficient, safe, and sustainable is crucial to\nmitigate potential food shortages. Recently, artificial intelligence (AI)\ntechniques such as deep learning (DL) have demonstrated their strong abilities\nin various areas, including language, vision, remote sensing (RS), and agrifood\nsystems applications. However, the overall impact of AI on agrifood systems\nremains unclear. In this paper, we thoroughly review how AI techniques can\ntransform agrifood systems and contribute to the modern agrifood industry.\nFirstly, we summarize the data acquisition methods in agrifood systems,\nincluding acquisition, storage, and processing techniques. Secondly, we present\na progress review of AI methods in agrifood systems, specifically in\nagriculture, animal husbandry, and fishery, covering topics such as agrifood\nclassification, growth monitoring, yield prediction, and quality assessment.\nFurthermore, we highlight potential challenges and promising research\nopportunities for transforming modern agrifood systems with AI. We hope this\nsurvey could offer an overall picture to newcomers in the field and serve as a\nstarting point for their further research. The project website is\nhttps://github.com/Frenkie14/Agrifood-Survey.\n","authors":["Tao Chen","Liang Lv","Di Wang","Jing Zhang","Yue Yang","Zeyang Zhao","Chen Wang","Xiaowei Guo","Hao Chen","Qingye Wang","Yufei Xu","Qiming Zhang","Bo Du","Liangpei Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2305.01899v2.pdf","comment":"Accepted by ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2409.16934v2","updated":"2024-09-26T13:22:37Z","published":"2024-09-25T13:45:23Z","title":"Investigating OCR-Sensitive Neurons to Improve Entity Recognition in\n Historical Documents","summary":" This paper investigates the presence of OCR-sensitive neurons within the\nTransformer architecture and their influence on named entity recognition (NER)\nperformance on historical documents. By analysing neuron activation patterns in\nresponse to clean and noisy text inputs, we identify and then neutralise\nOCR-sensitive neurons to improve model performance. Based on two open access\nlarge language models (Llama2 and Mistral), experiments demonstrate the\nexistence of OCR-sensitive regions and show improvements in NER performance on\nhistorical newspapers and classical commentaries, highlighting the potential of\ntargeted neuron modulation to improve models' performance on noisy text.\n","authors":["Emanuela Boros","Maud Ehrmann"],"pdf_url":"https://arxiv.org/pdf/2409.16934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17819v1","updated":"2024-09-26T13:15:18Z","published":"2024-09-26T13:15:18Z","title":"Inference-Time Language Model Alignment via Integrated Value Guidance","summary":" Large language models are typically fine-tuned to align with human\npreferences, but tuning large models is computationally intensive and complex.\nIn this work, we introduce $\\textit{Integrated Value Guidance}$ (IVG), a method\nthat uses implicit and explicit value functions to guide language model\ndecoding at token and chunk-level respectively, efficiently aligning large\nlanguage models purely at inference time. This approach circumvents the\ncomplexities of direct fine-tuning and outperforms traditional methods.\nEmpirically, we demonstrate the versatility of IVG across various tasks. In\ncontrolled sentiment generation and summarization tasks, our method\nsignificantly improves the alignment of large models using inference-time\nguidance from $\\texttt{gpt2}$-based value functions. Moreover, in a more\nchallenging instruction-following benchmark AlpacaEval 2.0, we show that both\nspecifically tuned and off-the-shelf value functions greatly improve the\nlength-controlled win rates of large models against $\\texttt{gpt-4-turbo}$\n(e.g., $19.51\\% \\rightarrow 26.51\\%$ for $\\texttt{Mistral-7B-Instruct-v0.2}$\nand $25.58\\% \\rightarrow 33.75\\%$ for $\\texttt{Mixtral-8x7B-Instruct-v0.1}$\nwith Tulu guidance).\n","authors":["Zhixuan Liu","Zhanhui Zhou","Yuanfu Wang","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2409.17819v1.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17815v1","updated":"2024-09-26T13:12:13Z","published":"2024-09-26T13:12:13Z","title":"DREAMS: A python framework to train deep learning models with model card\n reporting for medical and health applications","summary":" Electroencephalography (EEG) data provides a non-invasive method for\nresearchers and clinicians to observe brain activity in real time. The\nintegration of deep learning techniques with EEG data has significantly\nimproved the ability to identify meaningful patterns, leading to valuable\ninsights for both clinical and research purposes. However, most of the\nframeworks so far, designed for EEG data analysis, are either too focused on\npre-processing or in deep learning methods per, making their use for both\nclinician and developer communities problematic. Moreover, critical issues such\nas ethical considerations, biases, uncertainties, and the limitations inherent\nin AI models for EEG data analysis are frequently overlooked, posing challenges\nto the responsible implementation of these technologies. In this paper, we\nintroduce a comprehensive deep learning framework tailored for EEG data\nprocessing, model training and report generation. While constructed in way to\nbe adapted and developed further by AI developers, it enables to report,\nthrough model cards, the outcome and specific information of use for both\ndevelopers and clinicians. In this way, we discuss how this framework can, in\nthe future, provide clinical researchers and developers with the tools needed\nto create transparent and accountable AI models for EEG data analysis and\ndiagnosis.\n","authors":["Rabindra Khadka","Pedro G Lind","Anis Yazidi","Asma Belhadi"],"pdf_url":"https://arxiv.org/pdf/2409.17815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16106v2","updated":"2024-09-26T13:05:36Z","published":"2024-09-24T14:07:47Z","title":"Scenario of Use Scheme: Threat Model Specification for Speaker Privacy\n Protection in the Medical Domain","summary":" Speech recordings are being more frequently used to detect and monitor\ndisease, leading to privacy concerns. Beyond cryptography, protection of speech\ncan be addressed by approaches, such as perturbation, disentanglement, and\nre-synthesis, that eliminate sensitive information of the speaker, leaving the\ninformation necessary for medical analysis purposes. In order for such privacy\nprotective approaches to be developed, clear and systematic specifications of\nassumptions concerning medical settings and the needs of medical professionals\nare necessary. In this paper, we propose a Scenario of Use Scheme that\nincorporates an Attacker Model, which characterizes the adversary against whom\nthe speaker's privacy must be defended, and a Protector Model, which specifies\nthe defense. We discuss the connection of the scheme with previous work on\nspeech privacy. Finally, we present a concrete example of a specified Scenario\nof Use and a set of experiments about protecting speaker data against gender\ninference attacks while maintaining utility for Parkinson's detection.\n","authors":["Mehtab Ur Rahman","Martha Larson","Louis ten Bosch","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2409.16106v2.pdf","comment":"Accepted and published at SPSC Symposium 2024 4th Symposium on\n Security and Privacy in Speech Communication. Interspeech 2024"},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17791v1","updated":"2024-09-26T12:37:26Z","published":"2024-09-26T12:37:26Z","title":"Self-supervised Preference Optimization: Enhance Your Language Model\n with Preference Degree Awareness","summary":" Recently, there has been significant interest in replacing the reward model\nin Reinforcement Learning with Human Feedback (RLHF) methods for Large Language\nModels (LLMs), such as Direct Preference Optimization (DPO) and its variants.\nThese approaches commonly use a binary cross-entropy mechanism on pairwise\nsamples, i.e., minimizing and maximizing the loss based on preferred or\ndis-preferred responses, respectively. However, while this training strategy\nomits the reward model, it also overlooks the varying preference degrees within\ndifferent responses. We hypothesize that this is a key factor hindering LLMs\nfrom sufficiently understanding human preferences. To address this problem, we\npropose a novel Self-supervised Preference Optimization (SPO) framework, which\nconstructs a self-supervised preference degree loss combined with the alignment\nloss, thereby helping LLMs improve their ability to understand the degree of\npreference. Extensive experiments are conducted on two widely used datasets of\ndifferent tasks. The results demonstrate that SPO can be seamlessly integrated\nwith existing preference optimization methods and significantly boost their\nperformance to achieve state-of-the-art performance. We also conduct detailed\nanalyses to offer comprehensive insights into SPO, which verifies its\neffectiveness. The code is available at https://github.com/lijian16/SPO.\n","authors":["Jian Li","Haojing Huang","Yujia Zhang","Pengfei Xu","Xi Chen","Rui Song","Lida Shi","Jingwen Wang","Hao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17791v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17788v1","updated":"2024-09-26T12:33:34Z","published":"2024-09-26T12:33:34Z","title":"Ophthalmic Biomarker Detection with Parallel Prediction of Transformer\n and Convolutional Architecture","summary":" Ophthalmic diseases represent a significant global health issue,\nnecessitating the use of advanced precise diagnostic tools. Optical Coherence\nTomography (OCT) imagery which offers high-resolution cross-sectional images of\nthe retina has become a pivotal imaging modality in ophthalmology.\nTraditionally physicians have manually detected various diseases and biomarkers\nfrom such diagnostic imagery. In recent times, deep learning techniques have\nbeen extensively used for medical diagnostic tasks enabling fast and precise\ndiagnosis. This paper presents a novel approach for ophthalmic biomarker\ndetection using an ensemble of Convolutional Neural Network (CNN) and Vision\nTransformer. While CNNs are good for feature extraction within the local\ncontext of the image, transformers are known for their ability to extract\nfeatures from the global context of the image. Using an ensemble of both\ntechniques allows us to harness the best of both worlds. Our method has been\nimplemented on the OLIVES dataset to detect 6 major biomarkers from the OCT\nimages and shows significant improvement of the macro averaged F1 score on the\ndataset.\n","authors":["Md. Touhidul Islam","Md. Abtahi Majeed Chowdhury","Mahmudul Hasan","Asif Quadir","Lutfa Aktar"],"pdf_url":"https://arxiv.org/pdf/2409.17788v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2409.14590v2","updated":"2024-09-26T12:29:45Z","published":"2024-09-22T20:47:04Z","title":"Explainable AI needs formal notions of explanation correctness","summary":" The use of machine learning (ML) in critical domains such as medicine poses\nrisks and requires regulation. One requirement is that decisions of ML systems\nin high-risk applications should be human-understandable. The field of\n\"explainable artificial intelligence\" (XAI) seemingly addresses this need.\nHowever, in its current form, XAI is unfit to provide quality control for ML;\nit itself needs scrutiny. Popular XAI methods cannot reliably answer important\nquestions about ML models, their training data, or a given test input. We\nrecapitulate results demonstrating that popular XAI methods systematically\nattribute importance to input features that are independent of the prediction\ntarget. This limits their utility for purposes such as model and data\n(in)validation, model improvement, and scientific discovery. We argue that the\nfundamental reason for this limitation is that current XAI methods do not\naddress well-defined problems and are not evaluated against objective criteria\nof explanation correctness. Researchers should formally define the problems\nthey intend to solve first and then design methods accordingly. This will lead\nto notions of explanation correctness that can be theoretically verified and\nobjective metrics of explanation performance that can be assessed using\nground-truth data.\n","authors":["Stefan Haufe","Rick Wilming","Benedict Clark","Rustam Zhumagambetov","Danny Panknin","Ahcène Boubekki"],"pdf_url":"https://arxiv.org/pdf/2409.14590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17777v1","updated":"2024-09-26T12:15:13Z","published":"2024-09-26T12:15:13Z","title":"Harnessing Shared Relations via Multimodal Mixup Contrastive Learning\n for Multimodal Classification","summary":" Deep multimodal learning has shown remarkable success by leveraging\ncontrastive learning to capture explicit one-to-one relations across\nmodalities. However, real-world data often exhibits shared relations beyond\nsimple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive\nLearning approach to capture nuanced shared relations inherent in multimodal\ndata. Our key contribution is a Mixup-based contrastive loss that learns robust\nrepresentations by aligning mixed samples from one modality with their\ncorresponding samples from other modalities thereby capturing shared relations\nbetween them. For multimodal classification tasks, we introduce a framework\nthat integrates a fusion module with unimodal prediction modules for auxiliary\nsupervision during training, complemented by our proposed Mixup-based\ncontrastive loss. Through extensive experiments on diverse datasets (N24News,\nROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures\nshared multimodal relations and generalizes across domains. It outperforms\nstate-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving\ncomparable performance on Food-101. Our work highlights the significance of\nlearning shared relations for robust multimodal learning, opening up promising\navenues for future research.\n","authors":["Raja Kumar","Raghav Singhal","Pranamya Kulkarni","Deval Mehta","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2409.17777v1.pdf","comment":"RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9\n Tables"},{"id":"http://arxiv.org/abs/2409.17774v1","updated":"2024-09-26T12:11:28Z","published":"2024-09-26T12:11:28Z","title":"Faithfulness and the Notion of Adversarial Sensitivity in NLP\n Explanations","summary":" Faithfulness is arguably the most critical metric to assess the reliability\nof explainable AI. In NLP, current methods for faithfulness evaluation are\nfraught with discrepancies and biases, often failing to capture the true\nreasoning of models. We introduce Adversarial Sensitivity as a novel approach\nto faithfulness evaluation, focusing on the explainer's response when the model\nis under adversarial attack. Our method accounts for the faithfulness of\nexplainers by capturing sensitivity to adversarial input changes. This work\naddresses significant limitations in existing evaluation techniques, and\nfurthermore, quantifies faithfulness from a crucial yet underexplored paradigm.\n","authors":["Supriya Manna","Niladri Sett"],"pdf_url":"https://arxiv.org/pdf/2409.17774v1.pdf","comment":"Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP"},{"id":"http://arxiv.org/abs/2309.16928v3","updated":"2024-09-26T12:09:22Z","published":"2023-09-29T02:04:24Z","title":"Learning to Receive Help: Intervention-Aware Concept Embedding Models","summary":" Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures\nby constructing and explaining their predictions using a set of high-level\nconcepts. A special property of these models is that they permit concept\ninterventions, wherein users can correct mispredicted concepts and thus improve\nthe model's performance. Recent work, however, has shown that intervention\nefficacy can be highly dependent on the order in which concepts are intervened\non and on the model's architecture and training hyperparameters. We argue that\nthis is rooted in a CBM's lack of train-time incentives for the model to be\nappropriately receptive to concept interventions. To address this, we propose\nIntervention-aware Concept Embedding models (IntCEMs), a novel CBM-based\narchitecture and training paradigm that improves a model's receptiveness to\ntest-time interventions. Our model learns a concept intervention policy in an\nend-to-end fashion from where it can sample meaningful intervention\ntrajectories at train-time. This conditions IntCEMs to effectively select and\nreceive concept interventions when deployed at test-time. Our experiments show\nthat IntCEMs significantly outperform state-of-the-art concept-interpretable\nmodels when provided with test-time concept interventions, demonstrating the\neffectiveness of our approach.\n","authors":["Mateo Espinosa Zarlenga","Katherine M. Collins","Krishnamurthy Dvijotham","Adrian Weller","Zohreh Shams","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2309.16928v3.pdf","comment":"Accepted as a spotlight at the Thirty-seventh Conference on Neural\n Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2409.17767v1","updated":"2024-09-26T12:02:36Z","published":"2024-09-26T12:02:36Z","title":"Federated Learning under Attack: Improving Gradient Inversion for Batch\n of Images","summary":" Federated Learning (FL) has emerged as a machine learning approach able to\npreserve the privacy of user's data. Applying FL, clients train machine\nlearning models on a local dataset and a central server aggregates the learned\nparameters coming from the clients, training a global machine learning model\nwithout sharing user's data. However, the state-of-the-art shows several\napproaches to promote attacks on FL systems. For instance, inverting or leaking\ngradient attacks can find, with high precision, the local dataset used during\nthe training phase of the FL. This paper presents an approach, called Deep\nLeakage from Gradients with Feedback Blending (DLG-FB), which is able to\nimprove the inverting gradient attack, considering the spatial correlation that\ntypically exists in batches of images. The performed evaluation shows an\nimprovement of 19.18% and 48,82% in terms of attack success rate and the number\nof iterations per attacked image, respectively.\n","authors":["Luiz Leite","Yuri Santo","Bruno L. Dalmazo","André Riker"],"pdf_url":"https://arxiv.org/pdf/2409.17767v1.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17757v1","updated":"2024-09-26T11:46:58Z","published":"2024-09-26T11:46:58Z","title":"Integrating Hierarchical Semantic into Iterative Generation Model for\n Entailment Tree Explanation","summary":" Manifestly and logically displaying the line of reasoning from evidence to\nanswer is significant to explainable question answering (QA). The entailment\ntree exhibits the lines structurally, which is different from the\nself-explanation principle in large-scale language models. Existing methods\nrarely consider the semantic association of sentences between and within\nhierarchies within the tree structure, which is prone to apparent mistakes in\ncombinations. In this work, we propose an architecture of integrating the\nHierarchical Semantics of sentences under the framework of Controller-Generator\n(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between\nhypotheses and facts, discriminates the facts involved in tree constructions,\nand optimizes single-step entailments. To the best of our knowledge, We are the\nfirst to notice hierarchical semantics of sentences between the same layer and\nadjacent layers to yield improvements. The proposed method achieves comparable\nperformance on all three settings of the EntailmentBank dataset. The\ngeneralization results on two out-of-domain datasets also demonstrate the\neffectiveness of our method.\n","authors":["Qin Wang","Jianzhou Feng","Yiming Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04259v2","updated":"2024-09-26T11:42:35Z","published":"2024-08-08T06:57:49Z","title":"EfficientRAG: Efficient Retriever for Multi-Hop Question Answering","summary":" Retrieval-augmented generation (RAG) methods encounter difficulties when\naddressing complex questions like multi-hop queries. While iterative retrieval\nmethods improve performance by gathering additional information, current\napproaches often rely on multiple calls of large language models (LLMs). In\nthis paper, we introduce EfficientRAG, an efficient retriever for multi-hop\nquestion answering. EfficientRAG iteratively generates new queries without the\nneed for LLM calls at each iteration and filters out irrelevant information.\nExperimental results demonstrate that EfficientRAG surpasses existing RAG\nmethods on three open-domain multi-hop question-answering datasets.\n","authors":["Ziyuan Zhuang","Zhiyang Zhang","Sitao Cheng","Fangkai Yang","Jia Liu","Shujian Huang","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.04259v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2001.07495v5","updated":"2024-09-26T11:42:25Z","published":"2020-01-21T13:05:31Z","title":"Unsupervisedly Learned Representations: Should the Quest be Over?","summary":" After four decades of research there still exists a Classification accuracy\ngap of about 20% between our best Unsupervisedly Learned Representations\nmethods and the accuracy rates achieved by intelligent animals. It thus may\nwell be that we are looking in the wrong direction. A possible solution to this\npuzzle is presented. We demonstrate that Reinforcement Learning can learn\nrepresentations which achieve the same accuracy as that of animals. Our main\nmodest contribution lies in the observations that: a. when applied to a real\nworld environment Reinforcement Learning does not require labels, and thus may\nbe legitimately considered as Unsupervised Learning, and b. in contrast, when\nReinforcement Learning is applied in a simulated environment it does inherently\nrequire labels and should thus be generally be considered as Supervised\nLearning. The corollary of these observations is that further search for\nUnsupervised Learning competitive paradigms which may be trained in simulated\nenvironments may be futile.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2001.07495v5.pdf","comment":"To be published at The 6th International Conference on Machine\n Learning, Optimization and Data Science - LOD 2020"},{"id":"http://arxiv.org/abs/2409.17755v1","updated":"2024-09-26T11:40:07Z","published":"2024-09-26T11:40:07Z","title":"SECURE: Semantics-aware Embodied Conversation under Unawareness for\n Lifelong Robot Learning","summary":" This paper addresses a challenging interactive task learning scenario we call\nrearrangement under unawareness: to manipulate a rigid-body environment in a\ncontext where the robot is unaware of a concept that's key to solving the\ninstructed task. We propose SECURE, an interactive task learning framework\ndesigned to solve such problems by fixing a deficient domain model using\nembodied conversation. Through dialogue, the robot discovers and then learns to\nexploit unforeseen possibilities. Using SECURE, the robot not only learns from\nthe user's corrective feedback when it makes a mistake, but it also learns to\nmake strategic dialogue decisions for revealing useful evidence about novel\nconcepts for solving the instructed task. Together, these abilities allow the\nrobot to generalise to subsequent tasks using newly acquired knowledge. We\ndemonstrate that a robot that is semantics-aware -- that is, it exploits the\nlogical consequences of both sentence and discourse semantics in the learning\nand inference process -- learns to solve rearrangement under unawareness more\neffectively than a robot that lacks such capabilities.\n","authors":["Rimvydas Rubavicius","Peter David Fagan","Alex Lascarides","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.17755v1.pdf","comment":"10 pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.17754v1","updated":"2024-09-26T11:36:08Z","published":"2024-09-26T11:36:08Z","title":"Byzantine-Robust Aggregation for Securing Decentralized Federated\n Learning","summary":" Federated Learning (FL) emerges as a distributed machine learning approach\nthat addresses privacy concerns by training AI models locally on devices.\nDecentralized Federated Learning (DFL) extends the FL paradigm by eliminating\nthe central server, thereby enhancing scalability and robustness through the\navoidance of a single point of failure. However, DFL faces significant\nchallenges in optimizing security, as most Byzantine-robust algorithms proposed\nin the literature are designed for centralized scenarios. In this paper, we\npresent a novel Byzantine-robust aggregation algorithm to enhance the security\nof Decentralized Federated Learning environments, coined WFAgg. This proposal\nhandles the adverse conditions and strength robustness of dynamic decentralized\ntopologies at the same time by employing multiple filters to identify and\nmitigate Byzantine attacks. Experimental results demonstrate the effectiveness\nof the proposed algorithm in maintaining model accuracy and convergence in the\npresence of various Byzantine attack scenarios, outperforming state-of-the-art\ncentralized Byzantine-robust aggregation schemes (such as Multi-Krum or\nClustering). These algorithms are evaluated on an IID image classification\nproblem in both centralized and decentralized scenarios.\n","authors":["Diego Cajaraville-Aboy","Ana Fernández-Vilas","Rebeca P. Díaz-Redondo","Manuel Fernández-Veiga"],"pdf_url":"https://arxiv.org/pdf/2409.17754v1.pdf","comment":"18 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.01008v3","updated":"2024-09-26T11:35:22Z","published":"2023-12-13T17:05:37Z","title":"Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models","summary":" Text-to-image diffusion models have demonstrated unprecedented capabilities\nfor flexible and realistic image synthesis. Nevertheless, these models rely on\na time-consuming sampling procedure, which has motivated attempts to reduce\ntheir latency. When improving efficiency, researchers often use the original\ndiffusion model to train an additional network designed specifically for fast\nimage generation. In contrast, our approach seeks to reduce latency directly,\nwithout any retraining, fine-tuning, or knowledge distillation. In particular,\nwe find the repeated calculation of attention maps to be costly yet redundant,\nand instead suggest reusing them during sampling. Our specific reuse strategies\nare based on ODE theory, which implies that the later a map is reused, the\nsmaller the distortion in the final image. We empirically compare these reuse\nstrategies with few-step sampling procedures of comparable latency, finding\nthat reuse generates images that are closer to those produced by the original\nhigh-latency diffusion model.\n","authors":["Rosco Hunter","Łukasz Dudziak","Mohamed S. Abdelfattah","Abhinav Mehrotra","Sourav Bhattacharya","Hongkai Wen"],"pdf_url":"https://arxiv.org/pdf/2401.01008v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10712v3","updated":"2024-09-26T11:15:14Z","published":"2024-02-16T14:15:15Z","title":"An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient\n Language Model Inference","summary":" The development of state-of-the-art generative large language models (LLMs)\ndisproportionately relies on English-centric tokenizers, vocabulary and\npre-training data. Despite the fact that some LLMs have multilingual\ncapabilities, recent studies have shown that their inference efficiency\ndeteriorates when generating text in languages other than English. This results\nin increased inference time and costs. Cross-lingual vocabulary adaptation\n(CVA) methods have been proposed for adapting models to a target language\naiming to improve downstream performance. However, the effectiveness of these\nmethods on increasing inference efficiency of generative LLMs has yet to be\nexplored. In this paper, we perform an empirical study of five CVA methods on\nfour generative LLMs (including monolingual and multilingual models) across\nfour typologically-diverse languages and four natural language understanding\ntasks. We find that CVA substantially contributes to LLM inference speedups of\nup to 271.5\\%. We also show that adapting LLMs that have been pre-trained on\nmore balanced multilingual data results in downstream performance comparable to\nthe original models.\n","authors":["Atsuki Yamaguchi","Aline Villavicencio","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2402.10712v3.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2406.12442v2","updated":"2024-09-26T11:15:14Z","published":"2024-06-18T09:46:44Z","title":"Abstraction-of-Thought Makes Language Models Better Reasoners","summary":" Abstract reasoning, the ability to reason from the abstract essence of a\nproblem, serves as a key to generalization in human reasoning. However,\neliciting language models to perform reasoning with abstraction remains\nunexplored. This paper seeks to bridge this gap by introducing a novel\nstructured reasoning format called Abstraction-of-Thought (AoT). The uniqueness\nof AoT lies in its explicit requirement for varying levels of abstraction\nwithin the reasoning process. This approach could elicit language models to\nfirst contemplate on the abstract level before incorporating concrete details,\nwhich is overlooked by the prevailing step-by-step Chain-of-Thought (CoT)\nmethod. To align models with the AoT format, we present AoT Collection, a\ngeneric finetuning dataset consisting of 348k high-quality samples with AoT\nreasoning processes, collected via an automated and scalable pipeline. We\nfinetune a wide range of language models with AoT Collection and conduct\nextensive evaluations on 23 unseen tasks from the challenging benchmark\nBig-Bench Hard. Experimental results indicate that models aligned to AoT\nreasoning format substantially outperform those aligned to CoT in many\nreasoning tasks.\n","authors":["Ruixin Hong","Hongming Zhang","Xiaoman Pan","Dong Yu","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12442v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17728v1","updated":"2024-09-26T10:57:02Z","published":"2024-09-26T10:57:02Z","title":"AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with\n Alternative Modality Masking","summary":" Camera-LiDAR fusion models significantly enhance perception performance in\nautonomous driving. The fusion mechanism leverages the strengths of each\nmodality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR\nfusion models utilize pre-trained backbones for efficient training. However, we\nargue that directly loading single-modal pre-trained camera and LiDAR backbones\ninto camera-LiDAR fusion models introduces similar feature redundancy across\nmodalities due to the nature of the fusion mechanism. Unfortunately, existing\npruning methods are developed explicitly for single-modal models, and thus,\nthey struggle to effectively identify these specific redundant parameters in\ncamera-LiDAR fusion models. In this paper, to address the issue above on\ncamera-LiDAR fusion models, we propose a novelty pruning framework Alternative\nModality Masking Pruning (AlterMOMA), which employs alternative masking on each\nmodality and identifies the redundant parameters. Specifically, when one\nmodality parameters are masked (deactivated), the absence of features from the\nmasked backbone compels the model to reactivate previous redundant features of\nthe other modality backbone. Therefore, these redundant features and relevant\nredundant parameters can be identified via the reactivation process. The\nredundant parameters can be pruned by our proposed importance score evaluation\nfunction, Alternative Evaluation (AlterEva), which is based on the observation\nof the loss changes when certain modality parameters are activated and\ndeactivated. Extensive experiments on the nuScene and KITTI datasets\nencompassing diverse tasks, baseline models, and pruning algorithms showcase\nthat AlterMOMA outperforms existing pruning methods, attaining state-of-the-art\nperformance.\n","authors":["Shiqi Sun","Yantao Lu","Ning Liu","Bo Jiang","JinChao Chen","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17728v1.pdf","comment":"17 pages, 3 figures, Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.08160v2","updated":"2024-09-26T10:54:32Z","published":"2024-08-15T13:49:14Z","title":"General-purpose Clothes Manipulation with Semantic Keypoints","summary":" Clothes manipulation is a critical skill for household robots. Recent\nadvancements have been made in task-specific clothes manipulation, such as\nfolding, flattening, and hanging. However, due to clothes' complex geometries\nand deformability, creating a general-purpose robot system that can manipulate\na diverse range of clothes in many ways remains challenging. Since clothes are\ntypically designed with specific structures, we propose identifying these\nspecific features like ``left sleeve'' as semantic keypoints. Semantic\nkeypoints can provide semantic cues for task planning and geometric cues for\nlow-level action generation. With this insight, we develop a hierarchical\nlearning framework using the large language model (LLM) for general-purpose\nCLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation\nexperiments show that CLASP outperforms baseline methods on both seen and\nunseen tasks across various clothes manipulation tasks. Real-world experiments\nshow that CLASP can be directly deployed in the real world and applied to a\nwide variety of clothes.\n","authors":["Yuhong Deng","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.08160v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1904.04579v6","updated":"2024-09-26T10:53:16Z","published":"2019-04-09T10:30:23Z","title":"A Concept-Value Network as a Brain Model","summary":" This paper suggests a statistical framework for describing the relations\nbetween the physical and conceptual entities of a brain-like model. Features\nand concept instances are put into context, where the paper suggests that\nfeatures may be the electrical wiring, although chemical connections are also\npossible. With this idea, the actual length of the connection is important,\nbecause it is related to firing rates and neuron synchronization, but the\nsignal type is less important. The paper then suggests that concepts are neuron\ngroups that link feature sets and concept instances are determined by chemical\nsignals from those groups. Therefore, features become the static horizontal\nframework of the neural system and concepts are vertically interconnected\ncombinations of these. With regards to functionality, the neuron is then\nconsidered to be functional and the more horizontal memory structures can even\nbe glial. This would also suggest that features can be distributed entities and\nnot concentrated to a single area. Another aspect could be signal 'breaks' that\ncompartmentalise a pattern and may help with neural binding.\n","authors":["Kieran Greer"],"pdf_url":"https://arxiv.org/pdf/1904.04579v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19454v2","updated":"2024-09-26T10:34:40Z","published":"2024-04-30T11:10:34Z","title":"Augmented neural forms with parametric boundary-matching operators for\n solving ordinary differential equations","summary":" Approximating solutions of ordinary and partial differential equations\nconstitutes a significant challenge. Based on functional expressions that\ninherently depend on neural networks, neural forms are specifically designed to\nprecisely satisfy the prescribed initial or boundary conditions of the problem,\nwhile providing the approximate solutions in closed form. Departing from the\nimportant class of ordinary differential equations, the present work aims to\nrefine and validate the neural forms methodology, paving the ground for further\ndevelopments in more challenging fields. The main contributions are as follows.\nFirst, it introduces a formalism for systematically crafting proper neural\nforms with adaptable boundary matches that are amenable to optimization.\nSecond, it describes a novel technique for converting problems with Neumann or\nRobin conditions into equivalent problems with parametric Dirichlet conditions.\nThird, it outlines a method for determining an upper bound on the absolute\ndeviation from the exact solution. The proposed augmented neural forms approach\nwas tested on a set of diverse problems, encompassing first- and second-order\nordinary differential equations, as well as first-order systems. Stiff\ndifferential equations have been considered as well. The resulting solutions\nwere subjected to assessment against existing exact solutions, solutions\nderived through the common penalized neural method, and solutions obtained via\ncontemporary numerical analysis methods. The reported results demonstrate that\nthe augmented neural forms not only satisfy the boundary and initial conditions\nexactly, but also provide closed-form solutions that facilitate high-quality\ninterpolation and controllable overall precision. These attributes are\nessential for expanding the application field of neural forms to more\nchallenging problems that are described by partial differential equations.\n","authors":["Adam D. Kypriadis","Isaac E. Lagaris","Aristidis Likas","Konstantinos E. Parsopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.19454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17896v2","updated":"2024-09-26T10:33:53Z","published":"2024-07-25T09:36:37Z","title":"SR-CurvANN: Advancing 3D Surface Reconstruction through Curvature-Aware\n Neural Networks","summary":" Incomplete or missing data in three-dimensional (3D) models can lead to\nerroneous or flawed renderings, limiting their usefulness in applications such\nas visualization, geometric computation, and 3D printing. Conventional\nsurface-repair techniques often fail to infer complex geometric details in\nmissing areas. Neural networks successfully address hole-filling tasks in 2D\nimages using inpainting techniques. The combination of surface reconstruction\nalgorithms, guided by the model's curvature properties and the creativity of\nneural networks in the inpainting processes should provide realistic results in\nthe hole completion task. In this paper, we propose a novel method entitled\nSR-CurvANN (Surface Reconstruction Based on Curvature-Aware Neural Networks)\nthat incorporates neural network-based 2D inpainting to effectively reconstruct\n3D surfaces. We train the neural networks with images that represent planar\nrepresentations of the curvature at vertices of hundreds of 3D models. Once the\nmissing areas have been inferred, a coarse-to-fine surface deformation process\nensures that the surface fits the reconstructed curvature image. Our proposal\nmakes it possible to learn and generalize patterns from a wide variety of\ntraining 3D models, generating comprehensive inpainted curvature images and\nsurfaces. Experiments conducted on 959 models with several holes have\ndemonstrated that SR-CurvANN excels in the shape completion process, filling\nholes with a remarkable level of realism and precision.\n","authors":["Marina Hernández-Bautista","Francisco J. Melero"],"pdf_url":"https://arxiv.org/pdf/2407.17896v2.pdf","comment":"Major changes in title, paper structure, text and figures. Improved\n results. 23 pages, 14 figures. Decision about submission not taken yet"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17702v1","updated":"2024-09-26T10:16:08Z","published":"2024-09-26T10:16:08Z","title":"Episodic Memory Verbalization using Hierarchical Representations of\n Life-Long Robot Experience","summary":" Verbalization of robot experience, i.e., summarization of and question\nanswering about a robot's past, is a crucial ability for improving human-robot\ninteraction. Previous works applied rule-based systems or fine-tuned deep\nmodels to verbalize short (several-minute-long) streams of episodic data,\nlimiting generalization and transferability. In our work, we apply large\npretrained models to tackle this task with zero or few examples, and\nspecifically focus on verbalizing life-long experiences. For this, we derive a\ntree-like data structure from episodic memory (EM), with lower levels\nrepresenting raw perception and proprioception data, and higher levels\nabstracting events to natural language concepts. Given such a hierarchical\nrepresentation built from the experience stream, we apply a large language\nmodel as an agent to interactively search the EM given a user's query,\ndynamically expanding (initially collapsed) tree nodes to find the relevant\ninformation. The approach keeps computational costs low even when scaling to\nmonths of robot experience data. We evaluate our method on simulated household\nrobot data, human egocentric videos, and real-world robot recordings,\ndemonstrating its flexibility and scalability.\n","authors":["Leonard Bärmann","Chad DeChant","Joana Plewnia","Fabian Peller-Konrad","Daniel Bauer","Tamim Asfour","Alex Waibel"],"pdf_url":"https://arxiv.org/pdf/2409.17702v1.pdf","comment":"Code, data and demo videos at https://hierarchical-emv.github.io"},{"id":"http://arxiv.org/abs/2409.17699v1","updated":"2024-09-26T10:12:19Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17698v1","updated":"2024-09-26T10:09:10Z","published":"2024-09-26T10:09:10Z","title":"The application of GPT-4 in grading design university students'\n assignment and providing feedback: An exploratory study","summary":" This study aims to investigate whether GPT-4 can effectively grade\nassignments for design university students and provide useful feedback. In\ndesign education, assignments do not have a single correct answer and often\ninvolve solving an open-ended design problem. This subjective nature of design\nprojects often leads to grading problems,as grades can vary between different\nraters,for instance instructor from engineering background or architecture\nbackground. This study employs an iterative research approach in developing a\nCustom GPT with the aim of achieving more reliable results and testing whether\nit can provide design students with constructive feedback. The findings\ninclude: First,through several rounds of iterations the inter-reliability\nbetween GPT and human raters reached a level that is generally accepted by\neducators. This indicates that by providing accurate prompts to GPT,and\ncontinuously iterating to build a Custom GPT, it can be used to effectively\ngrade students' design assignments, serving as a reliable complement to human\nraters. Second, the intra-reliability of GPT's scoring at different times is\nbetween 0.65 and 0.78. This indicates that, with adequate instructions, a\nCustom GPT gives consistent results which is a precondition for grading\nstudents. As consistency and comparability are the two main rules to ensure the\nreliability of educational assessment, this study has looked at whether a\nCustom GPT can be developed that adheres to these two rules. We finish the\npaper by testing whether Custom GPT can provide students with useful feedback\nand reflecting on how educators can develop and iterate a Custom GPT to serve\nas a complementary rater.\n","authors":["Qian Huang","Thijs Willems","King Wang Poon"],"pdf_url":"https://arxiv.org/pdf/2409.17698v1.pdf","comment":"25 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2409.15867v3","updated":"2024-09-26T09:56:58Z","published":"2024-09-24T08:41:01Z","title":"In-Context Ensemble Improves Video-Language Models for Low-Level\n Workflow Understanding from Human Demonstrations","summary":" A Standard Operating Procedure (SOP) defines a low-level, step-by-step\nwritten guide for a business software workflow based on a video demonstration.\nSOPs are a crucial step toward automating end-to-end software workflows.\nManually creating SOPs can be time-consuming. Recent advancements in large\nvideo-language models offer the potential for automating SOP generation by\nanalyzing recordings of human demonstrations. However, current large\nvideo-language models face challenges with zero-shot SOP generation. We explore\nin-context learning with video-language models for SOP generation. We report\nthat in-context learning sometimes helps video-language models at SOP\ngeneration. We then propose an in-context ensemble learning to further enhance\nthe capabilities of the models in SOP generation.\n","authors":["Moucheng Xu","Evangelos Chatzaroulas","Luc McCutcheon","Abdul Ahad","Hamzah Azeem","Janusz Marecki","Ammar Anwar"],"pdf_url":"https://arxiv.org/pdf/2409.15867v3.pdf","comment":"multimodal in-context ensemble learning, video-language models, SOP\n generation, pseudo-labels, in-context learning, prompt engineering"},{"id":"http://arxiv.org/abs/2409.17691v1","updated":"2024-09-26T09:56:13Z","published":"2024-09-26T09:56:13Z","title":"Efficient Bias Mitigation Without Privileged Information","summary":" Deep neural networks trained via empirical risk minimisation often exhibit\nsignificant performance disparities across groups, particularly when group and\ntask labels are spuriously correlated (e.g., \"grassy background\" and \"cows\").\nExisting bias mitigation methods that aim to address this issue often either\nrely on group labels for training or validation, or require an extensive\nhyperparameter search. Such data and computational requirements hinder the\npractical deployment of these methods, especially when datasets are too large\nto be group-annotated, computational resources are limited, and models are\ntrained through already complex pipelines. In this paper, we propose Targeted\nAugmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework\nthat leverages the entire training history of a helper model to identify\nspurious samples, and generate a group-balanced training set from which a\nrobust model can be trained. We show that TAB improves worst-group performance\nwithout any group information or model selection, outperforming existing\nmethods while maintaining overall accuracy.\n","authors":["Mateo Espinosa Zarlenga","Swami Sankaranarayanan","Jerone T. A. Andrews","Zohreh Shams","Mateja Jamnik","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.17691v1.pdf","comment":"Accepted at the 18th European Conference on Computer Vision (ECCV\n 2024) as an Oral presentation"},{"id":"http://arxiv.org/abs/2405.06802v2","updated":"2024-09-26T09:52:20Z","published":"2024-05-10T20:29:25Z","title":"Leveraging summary of radiology reports with transformers","summary":" Two fundamental problems in health-care stem from patient handoff and triage.\nDoctors are often required to perform complex findings summarization to\nfacilitate efficient communication with specialists and decision making on the\nurgency of each case. To address these challenges, we present a state of the\nart radiology report summarization model utilizing adjusted bidirectional\nencoder representation from transformers BERTtoBERT encoder and decoder\narchitecture. We also provide a data processing pipeline for future models\ndeveloped on the the MIMIC CXR dataset. Our approach includes a novel method\nfor augmenting medical data and a comprehensive performance analysis. Our best\nperforming model achieved a recall oriented understudy for gisting evaluation L\nF1 score of 58.75/100, outperforming specialized checkpoints with more\nsophisticated attention mechanisms. We also provide a data processing pipeline\nfor future models developed on the MIMIC chest X-ray dataset. The model\nintroduced in this paper demonstrates significantly improved capacity in\nradiology report summarization, highlighting the potential for ensuring better\nclinical workflows and enhanced patient care.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.05181v3","updated":"2024-09-26T09:52:13Z","published":"2023-12-08T17:08:03Z","title":"Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable\n Tensor Collections","summary":" Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining\ndata, model, and pipeline parallelism, to use large GPU clusters efficiently.\nLong-running jobs may experience changes to their GPU allocation: (i) resource\nelasticity during training adds or removes GPUs; (ii) hardware maintenance may\nrequire redeployment on different GPUs; and (iii) GPU failures force jobs to\nrun with fewer devices. Current DL frameworks tie jobs to a set of GPUs and\nthus lack support for these scenarios. In particular, they cannot change the\nmulti-dimensional parallelism of an already-running job in an efficient and\nmodel-independent way.\n We describe Scalai, a state management library for DL systems that enables\njobs to change their parallelism dynamically after the GPU allocation is\nupdated at runtime. Scalai achieves this through a new abstraction, a\nparallelizable tensor collection (PTC), that externalizes the job state during\ntraining. After a GPU change, Scalai uses the PTC to transform the job state:\nthe PTC repartitions the dataset state under data parallelism and exposes it to\nDL workers through a virtual file system; and the PTC obtains the model state\nas partitioned checkpoints and transforms them to reflect the new\nparallelization configuration. For efficiency, Scalai executes PTC\ntransformations in parallel with minimum data movement between workers. Our\nexperiments show that Scalai enables DL jobs to support dynamic parallelization\nwith low overhead.\n","authors":["Marcel Wagenländer","Guo Li","Bo Zhao","Luo Mai","Peter Pietzuch"],"pdf_url":"https://arxiv.org/pdf/2312.05181v3.pdf","comment":"The 30th Symposium on Operating Systems Principles (SOSP24)"},{"id":"http://arxiv.org/abs/2409.17687v1","updated":"2024-09-26T09:51:29Z","published":"2024-09-26T09:51:29Z","title":"Graph Edit Distance with General Costs Using Neural Set Divergence","summary":" Graph Edit Distance (GED) measures the (dis-)similarity between two given\ngraphs, in terms of the minimum-cost edit sequence that transforms one graph to\nthe other. However, the exact computation of GED is NP-Hard, which has recently\nmotivated the design of neural methods for GED estimation. However, they do not\nexplicitly account for edit operations with different costs. In response, we\npropose GRAPHEDX, a neural GED estimator that can work with general costs\nspecified for the four edit operations, viz., edge deletion, edge addition,\nnode deletion and node addition. We first present GED as a quadratic assignment\nproblem (QAP) that incorporates these four costs. Then, we represent each graph\nas a set of node and edge embeddings and use them to design a family of neural\nset divergence surrogates. We replace the QAP terms corresponding to each\noperation with their surrogates. Computing such neural set divergence require\naligning nodes and edges of the two graphs. We learn these alignments using a\nGumbel-Sinkhorn permutation generator, additionally ensuring that the node and\nedge alignments are consistent with each other. Moreover, these alignments are\ncognizant of both the presence and absence of edges between node-pairs.\nExperiments on several datasets, under a variety of edit cost settings, show\nthat GRAPHEDX consistently outperforms state-of-the-art methods and heuristics\nin terms of prediction error.\n","authors":["Eeshaan Jain","Indradyumna Roy","Saswat Meher","Soumen Chakrabarti","Abir De"],"pdf_url":"https://arxiv.org/pdf/2409.17687v1.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17685v1","updated":"2024-09-26T09:51:08Z","published":"2024-09-26T09:51:08Z","title":"Artificial Data Point Generation in Clustered Latent Space for Small\n Medical Datasets","summary":" One of the growing trends in machine learning is the use of data generation\ntechniques, since the performance of machine learning models is dependent on\nthe quantity of the training dataset. However, in many medical applications,\ncollecting large datasets is challenging due to resource constraints, which\nleads to overfitting and poor generalization. This paper introduces a novel\nmethod, Artificial Data Point Generation in Clustered Latent Space (AGCL),\ndesigned to enhance classification performance on small medical datasets\nthrough synthetic data generation. The AGCL framework involves feature\nextraction, K-means clustering, cluster evaluation based on a class separation\nmetric, and the generation of synthetic data points from clusters with distinct\nclass representations. This method was applied to Parkinson's disease\nscreening, utilizing facial expression data, and evaluated across multiple\nmachine learning classifiers. Experimental results demonstrate that AGCL\nsignificantly improves classification accuracy compared to baseline, GN and\nkNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and\ncross-validation accuracy of 90.90% in majority voting over different emotions,\nconfirming its effectiveness in augmenting small datasets.\n","authors":["Yasaman Haghbin","Hadi Moradi","Reshad Hosseini"],"pdf_url":"https://arxiv.org/pdf/2409.17685v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17684v1","updated":"2024-09-26T09:51:07Z","published":"2024-09-26T09:51:07Z","title":"Preserving logical and functional dependencies in synthetic tabular data","summary":" Dependencies among attributes are a common aspect of tabular data. However,\nwhether existing tabular data generation algorithms preserve these dependencies\nwhile generating synthetic data is yet to be explored. In addition to the\nexisting notion of functional dependencies, we introduce the notion of logical\ndependencies among the attributes in this article. Moreover, we provide a\nmeasure to quantify logical dependencies among attributes in tabular data.\nUtilizing this measure, we compare several state-of-the-art synthetic data\ngeneration algorithms and test their capability to preserve logical and\nfunctional dependencies on several publicly available datasets. We demonstrate\nthat currently available synthetic tabular data generation algorithms do not\nfully preserve functional dependencies when they generate synthetic datasets.\nIn addition, we also showed that some tabular synthetic data generation models\ncan preserve inter-attribute logical dependencies. Our review and comparison of\nthe state-of-the-art reveal research needs and opportunities to develop\ntask-specific synthetic tabular data generation models.\n","authors":["Chaithra Umesh","Kristian Schultz","Manjunath Mahendra","Saparshi Bej","Olaf Wolkenhauer"],"pdf_url":"https://arxiv.org/pdf/2409.17684v1.pdf","comment":"Submitted to Pattern Recognition Journal"},{"id":"http://arxiv.org/abs/2409.17683v1","updated":"2024-09-26T09:49:27Z","published":"2024-09-26T09:49:27Z","title":"Zero- and Few-shot Named Entity Recognition and Text Expansion in\n Medication Prescriptions using ChatGPT","summary":" Introduction: Medication prescriptions are often in free text and include a\nmix of two languages, local brand names, and a wide range of idiosyncratic\nformats and abbreviations. Large language models (LLMs) have shown promising\nability to generate text in response to input prompts. We use ChatGPT 3.5 to\nautomatically structure and expand medication statements in discharge summaries\nand thus make them easier to interpret for people and machines. Methods:\nNamed-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and\nfew-shot setting with different prompt strategies. 100 medication statements\nwere manually annotated and curated. NER performance was measured by using\nstrict and partial matching. For the task EX, two experts interpreted the\nresults by assessing semantic equivalence between original and expanded\nstatements. The model performance was measured by precision, recall, and F1\nscore. Results: For NER, the best-performing prompt reached an average F1 score\nof 0.94 in the test set. For EX, the few-shot prompt showed superior\nperformance among other prompts, with an average F1 score of 0.87. Conclusion:\nOur study demonstrates good performance for NER and EX tasks in free-text\nmedication statements using ChatGPT. Compared to a zero-shot baseline, a\nfew-shot approach prevented the system from hallucinating, which would be\nunacceptable when processing safety-relevant medication data.\n","authors":["Natthanaphop Isaradech","Andrea Riedel","Wachiranun Sirikul","Markus Kreuzthaler","Stefan Schulz"],"pdf_url":"https://arxiv.org/pdf/2409.17683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13197v2","updated":"2024-09-26T09:40:31Z","published":"2022-08-28T10:47:32Z","title":"IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided\n Feature Extraction","summary":" Disruption prediction has made rapid progress in recent years, especially in\nmachine learning (ML)-based methods. Understanding why a predictor makes a\ncertain prediction can be as crucial as the prediction's accuracy for future\ntokamak disruption predictors. The purpose of most disruption predictors is\naccuracy or cross-machine capability. However, if a disruption prediction model\ncan be interpreted, it can tell why certain samples are classified as\ndisruption precursors. This allows us to tell the types of incoming disruption\nand gives us insight into the mechanism of disruption. This paper designs a\ndisruption predictor called Interpretable Disruption Predictor based On\nPhysics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction\nperformance of the model is effectively improved by extracting physics-guided\nfeatures. A high-performance model is required to ensure the validity of the\ninterpretation results. The interpretability study of IDP-PGFE provides an\nunderstanding of J-TEXT disruption and is generally consistent with existing\ncomprehension of disruption. IDP-PGFE has been applied to the disruption due to\ncontinuously increasing density towards density limit experiments on J-TEXT.\nThe time evolution of the PGFE features contribution demonstrates that the\napplication of ECRH triggers radiation-caused disruption, which lowers the\ndensity at disruption. While the application of RMP indeed raises the density\nlimit in J-TEXT. The interpretability study guides intuition on the physical\nmechanisms of density limit disruption that RMPs affect not only the MHD\ninstabilities but also the radiation profile, which delays density limit\ndisruption.\n","authors":["Chengshuo Shen","Wei Zheng","Yonghua Ding","Xinkun Ai","Fengming Xue","Yu Zhong","Nengchao Wang","Li Gao","Zhipeng Chen","Zhoujun Yang","Zhongyong Chen","Yuan Pan","J-TEXT team"],"pdf_url":"https://arxiv.org/pdf/2208.13197v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.10433v4","updated":"2024-09-26T09:33:29Z","published":"2024-03-15T16:11:15Z","title":"AI-enhanced Collective Intelligence","summary":" Current societal challenges exceed the capacity of humans operating either\nalone or collectively. As AI evolves, its role within human collectives will\nvary from an assistive tool to a participatory member. Humans and AI possess\ncomplementary capabilities that, together, can surpass the collective\nintelligence of either humans or AI in isolation. However, the interactions in\nhuman-AI systems are inherently complex, involving intricate processes and\ninterdependencies. This review incorporates perspectives from complex network\nscience to conceptualize a multilayer representation of human-AI collective\nintelligence, comprising cognition, physical, and information layers. Within\nthis multilayer network, humans and AI agents exhibit varying characteristics;\nhumans differ in diversity from surface-level to deep-level attributes, while\nAI agents range in degrees of functionality and anthropomorphism. We explore\nhow agents' diversity and interactions influence the system's collective\nintelligence and analyze real-world instances of AI-enhanced collective\nintelligence. We conclude by considering potential challenges and future\ndevelopments in this field.\n","authors":["Hao Cui","Taha Yasseri"],"pdf_url":"https://arxiv.org/pdf/2403.10433v4.pdf","comment":"43 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.13503v2","updated":"2024-09-26T09:26:05Z","published":"2024-09-20T13:44:00Z","title":"SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous\n Federated Learning Framework","summary":" Traditional federated learning (FL) frameworks rely heavily on terrestrial\nnetworks, where coverage limitations and increasing bandwidth congestion\nsignificantly hinder model convergence. Fortunately, the advancement of\nlow-Earth orbit (LEO) satellite networks offers promising new communication\navenues to augment traditional terrestrial FL. Despite this potential, the\nlimited satellite-ground communication bandwidth and the heterogeneous\noperating environments of ground devices-including variations in data,\nbandwidth, and computing power-pose substantial challenges for effective and\nrobust satellite-assisted FL. To address these challenges, we propose SatFed, a\nresource-efficient satellite-assisted heterogeneous FL framework. SatFed\nimplements freshness-based model prioritization queues to optimize the use of\nhighly constrained satellite-ground bandwidth, ensuring the transmission of the\nmost critical models. Additionally, a multigraph is constructed to capture\nreal-time heterogeneous relationships between devices, including data\ndistribution, terrestrial bandwidth, and computing capability. This multigraph\nenables SatFed to aggregate satellite-transmitted models into peer guidance,\nenhancing local training in heterogeneous environments. Extensive experiments\nwith real-world LEO satellite networks demonstrate that SatFed achieves\nsuperior performance and robustness compared to state-of-the-art benchmarks.\n","authors":["Yuxin Zhang","Zheng Lin","Zhe Chen","Zihan Fang","Wenjun Zhu","Xianhao Chen","Jin Zhao","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2409.13503v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17661v1","updated":"2024-09-26T09:20:12Z","published":"2024-09-26T09:20:12Z","title":"A Fuzzy-based Approach to Predict Human Interaction by Functional\n Near-Infrared Spectroscopy","summary":" The paper introduces a Fuzzy-based Attention (Fuzzy Attention Layer)\nmechanism, a novel computational approach to enhance the interpretability and\nefficacy of neural models in psychological research. The proposed Fuzzy\nAttention Layer mechanism is integrated as a neural network layer within the\nTransformer Encoder model to facilitate the analysis of complex psychological\nphenomena through neural signals, such as those captured by functional\nNear-Infrared Spectroscopy (fNIRS). By leveraging fuzzy logic, the Fuzzy\nAttention Layer is capable of learning and identifying interpretable patterns\nof neural activity. This capability addresses a significant challenge when\nusing Transformer: the lack of transparency in determining which specific brain\nactivities most contribute to particular predictions. Our experimental results\ndemonstrated on fNIRS data from subjects engaged in social interactions\ninvolving handholding reveal that the Fuzzy Attention Layer not only learns\ninterpretable patterns of neural activity but also enhances model performance.\nAdditionally, the learned patterns provide deeper insights into the neural\ncorrelates of interpersonal touch and emotional exchange. The application of\nour model shows promising potential in deciphering the subtle complexities of\nhuman social behaviors, thereby contributing significantly to the fields of\nsocial neuroscience and psychological AI.\n","authors":["Xiaowei Jiang","Liang Ou","Yanan Chen","Na Ao","Yu-Cheng Chang","Thomas Do","Chin-Teng Lin"],"pdf_url":"https://arxiv.org/pdf/2409.17661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12753v2","updated":"2024-09-26T09:17:10Z","published":"2024-04-19T09:59:44Z","title":"AutoScraper: A Progressive Understanding Web Agent for Web Scraper\n Generation","summary":" Web scraping is a powerful technique that extracts data from websites,\nenabling automated data collection, enhancing data analysis capabilities, and\nminimizing manual data entry efforts. Existing methods, wrappers-based methods\nsuffer from limited adaptability and scalability when faced with a new website,\nwhile language agents, empowered by large language models (LLMs), exhibit poor\nreusability in diverse web environments. In this work, we introduce the\nparadigm of generating web scrapers with LLMs and propose AutoScraper, a\ntwo-stage framework that can handle diverse and changing web environments more\nefficiently. AutoScraper leverages the hierarchical structure of HTML and\nsimilarity across different web pages for generating web scrapers. Besides, we\npropose a new executability metric for better measuring the performance of web\nscraper generation tasks. We conduct comprehensive experiments with multiple\nLLMs and demonstrate the effectiveness of our framework. Resources of this\npaper can be found at \\url{https://github.com/EZ-hwh/AutoScraper}\n","authors":["Wenhao Huang","Zhouhong Gu","Chenghao Peng","Zhixu Li","Jiaqing Liang","Yanghua Xiao","Liqian Wen","Zulong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12753v2.pdf","comment":"19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17659v1","updated":"2024-09-26T09:14:16Z","published":"2024-09-26T09:14:16Z","title":"Hierarchical End-to-End Autonomous Driving: Integrating BEV Perception\n with Deep Reinforcement Learning","summary":" End-to-end autonomous driving offers a streamlined alternative to the\ntraditional modular pipeline, integrating perception, prediction, and planning\nwithin a single framework. While Deep Reinforcement Learning (DRL) has recently\ngained traction in this domain, existing approaches often overlook the critical\nconnection between feature extraction of DRL and perception. In this paper, we\nbridge this gap by mapping the DRL feature extraction network directly to the\nperception phase, enabling clearer interpretation through semantic\nsegmentation. By leveraging Bird's-Eye-View (BEV) representations, we propose a\nnovel DRL-based end-to-end driving framework that utilizes multi-sensor inputs\nto construct a unified three-dimensional understanding of the environment. This\nBEV-based system extracts and translates critical environmental features into\nhigh-level abstract states for DRL, facilitating more informed control.\nExtensive experimental evaluations demonstrate that our approach not only\nenhances interpretability but also significantly outperforms state-of-the-art\nmethods in autonomous driving control tasks, reducing the collision rate by\n20%.\n","authors":["Siyi Lu","Lei He","Shengbo Eben Li","Yugong Luo","Jianqiang Wang","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14816v2","updated":"2024-09-26T09:11:28Z","published":"2024-09-23T08:46:15Z","title":"VARADE: a Variational-based AutoRegressive model for Anomaly Detection\n on the Edge","summary":" Detecting complex anomalies on massive amounts of data is a crucial task in\nIndustry 4.0, best addressed by deep learning. However, available solutions are\ncomputationally demanding, requiring cloud architectures prone to latency and\nbandwidth issues. This work presents VARADE, a novel solution implementing a\nlight autoregressive framework based on variational inference, which is best\nsuited for real-time execution on the edge. The proposed approach was validated\non a robotic arm, part of a pilot production line, and compared with several\nstate-of-the-art algorithms, obtaining the best trade-off between anomaly\ndetection accuracy, power consumption and inference frequency on two different\nedge platforms.\n","authors":["Alessio Mascolini","Sebastiano Gaiardelli","Francesco Ponzio","Nicola Dall'Ora","Enrico Macii","Sara Vinco","Santa Di Cataldo","Franco Fummi"],"pdf_url":"https://arxiv.org/pdf/2409.14816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17656v1","updated":"2024-09-26T09:07:20Z","published":"2024-09-26T09:07:20Z","title":"Prototype based Masked Audio Model for Self-Supervised Learning of Sound\n Event Detection","summary":" A significant challenge in sound event detection (SED) is the effective\nutilization of unlabeled data, given the limited availability of labeled data\ndue to high annotation costs. Semi-supervised algorithms rely on labeled data\nto learn from unlabeled data, and the performance is constrained by the quality\nand size of the former. In this paper, we introduce the Prototype based Masked\nAudio Model~(PMAM) algorithm for self-supervised representation learning in\nSED, to better exploit unlabeled data. Specifically, semantically rich\nframe-level pseudo labels are constructed from a Gaussian mixture model (GMM)\nbased prototypical distribution modeling. These pseudo labels supervise the\nlearning of a Transformer-based masked audio model, in which binary\ncross-entropy loss is employed instead of the widely used InfoNCE loss, to\nprovide independent loss contributions from different prototypes, which is\nimportant in real scenarios in which multiple labels may apply to unsupervised\ndata frames. A final stage of fine-tuning with just a small amount of labeled\ndata yields a very high performing SED model. On like-for-like tests using the\nDESED task, our method achieves a PSDS1 score of 62.5\\%, surpassing current\nstate-of-the-art models and demonstrating the superiority of the proposed\ntechnique.\n","authors":["Pengfei Cai","Yan Song","Nan Jiang","Qing Gu","Ian McLoughlin"],"pdf_url":"https://arxiv.org/pdf/2409.17656v1.pdf","comment":"Submitted to ICASSP2025; The code for this paper will be available at\n https://github.com/cai525/Transformer4SED after the paper is accepted"},{"id":"http://arxiv.org/abs/2409.17655v1","updated":"2024-09-26T09:06:56Z","published":"2024-09-26T09:06:56Z","title":"AssistantX: An LLM-Powered Proactive Assistant in Collaborative\n Human-Populated Environment","summary":" The increasing demand for intelligent assistants in human-populated\nenvironments has motivated significant research in autonomous robotic systems.\nTraditional service robots and virtual assistants, however, struggle with\nreal-world task execution due to their limited capacity for dynamic reasoning\nand interaction, particularly when human collaboration is required. Recent\ndevelopments in Large Language Models have opened new avenues for improving\nthese systems, enabling more sophisticated reasoning and natural interaction\ncapabilities. In this paper, we introduce AssistantX, an LLM-powered proactive\nassistant designed to operate autonomously in a physical office environment.\nUnlike conventional service robots, AssistantX leverages a novel multi-agent\narchitecture, PPDR4X, which provides advanced inference capabilities and\ncomprehensive collaboration awareness. By effectively bridging the gap between\nvirtual operations and physical interactions, AssistantX demonstrates robust\nperformance in managing complex real-world scenarios. Our evaluation highlights\nthe architecture's effectiveness, showing that AssistantX can respond to clear\ninstructions, actively retrieve supplementary information from memory, and\nproactively seek collaboration from team members to ensure successful task\ncompletion. More details and videos can be found at\nhttps://assistantx-agent.github.io/AssistantX/.\n","authors":["Nan Sun","Bo Mao","Yongchang Li","Lumeng Ma","Di Guo","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17655v1.pdf","comment":"6 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.17652v1","updated":"2024-09-26T09:00:30Z","published":"2024-09-26T09:00:30Z","title":"FactorSim: Generative Simulation via Factorized Representation","summary":" Generating simulations to train intelligent agents in game-playing and\nrobotics from natural language input, from user input or task documentation,\nremains an open-ended challenge. Existing approaches focus on parts of this\nchallenge, such as generating reward functions or task hyperparameters. Unlike\nprevious work, we introduce FACTORSIM that generates full simulations in code\nfrom language input that can be used to train agents. Exploiting the structural\nmodularity specific to coded simulations, we propose to use a factored\npartially observable Markov decision process representation that allows us to\nreduce context dependence during each step of the generation. For evaluation,\nwe introduce a generative simulation benchmark that assesses the generated\nsimulation code's accuracy and effectiveness in facilitating zero-shot\ntransfers in reinforcement learning settings. We show that FACTORSIM\noutperforms existing methods in generating simulations regarding prompt\nalignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation.\nWe also demonstrate its effectiveness in generating robotic tasks.\n","authors":["Fan-Yun Sun","S. I. Harini","Angela Yi","Yihan Zhou","Alex Zook","Jonathan Tremblay","Logan Cross","Jiajun Wu","Nick Haber"],"pdf_url":"https://arxiv.org/pdf/2409.17652v1.pdf","comment":"neurips 2024, project website:\n https://cs.stanford.edu/~sunfanyun/factorsim/"},{"id":"http://arxiv.org/abs/2311.18576v5","updated":"2024-09-26T08:57:49Z","published":"2023-11-30T14:15:39Z","title":"Fixed-length Dense Descriptor for Efficient Fingerprint Matching","summary":" In fingerprint matching, fixed-length descriptors generally offer greater\nefficiency compared to minutiae set, but the recognition accuracy is not as\ngood as that of the latter. Although much progress has been made in deep\nlearning based fixed-length descriptors recently, they often fall short when\ndealing with incomplete or partial fingerprints, diverse fingerprint poses, and\nsignificant background noise. In this paper, we propose a three-dimensional\nrepresentation called Fixed-length Dense Descriptor (FDD) for efficient\nfingerprint matching. FDD features great spatial properties, enabling it to\ncapture the spatial relationships of the original fingerprints, thereby\nenhancing interpretability and robustness. Our experiments on various\nfingerprint datasets reveal that FDD outperforms other fixed-length\ndescriptors, especially in matching fingerprints of different areas,\ncross-modal fingerprint matching, and fingerprint matching with background\nnoise.\n","authors":["Zhiyu Pan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v5.pdf","comment":"Accepted by WIFS 2024"},{"id":"http://arxiv.org/abs/2409.17650v1","updated":"2024-09-26T08:56:54Z","published":"2024-09-26T08:56:54Z","title":"Digital Twin Ecosystem for Oncology Clinical Operations","summary":" Artificial Intelligence (AI) and Large Language Models (LLMs) hold\nsignificant promise in revolutionizing healthcare, especially in clinical\napplications. Simultaneously, Digital Twin technology, which models and\nsimulates complex systems, has gained traction in enhancing patient care.\nHowever, despite the advances in experimental clinical settings, the potential\nof AI and digital twins to streamline clinical operations remains largely\nuntapped. This paper introduces a novel digital twin framework specifically\ndesigned to enhance oncology clinical operations. We propose the integration of\nmultiple specialized digital twins, such as the Medical Necessity Twin, Care\nNavigator Twin, and Clinical History Twin, to enhance workflow efficiency and\npersonalize care for each patient based on their unique data. Furthermore, by\nsynthesizing multiple data sources and aligning them with the National\nComprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care\nPath, a continuously evolving knowledge base that enables these digital twins\nto provide precise, tailored clinical recommendations.\n","authors":["Himanshu Pandey","Akhil Amod"," Shivang","Kshitij Jaggi","Ruchi Garg","Abheet Jain","Vinayak Tantia"],"pdf_url":"https://arxiv.org/pdf/2409.17650v1.pdf","comment":"Pre Print"},{"id":"http://arxiv.org/abs/2409.17642v1","updated":"2024-09-26T08:45:15Z","published":"2024-09-26T08:45:15Z","title":"AI Delegates with a Dual Focus: Ensuring Privacy and Strategic\n Self-Disclosure","summary":" Large language model (LLM)-based AI delegates are increasingly utilized to\nact on behalf of users, assisting them with a wide range of tasks through\nconversational interfaces. Despite their advantages, concerns arise regarding\nthe potential risk of privacy leaks, particularly in scenarios involving social\ninteractions. While existing research has focused on protecting privacy by\nlimiting the access of AI delegates to sensitive user information, many social\nscenarios require disclosing private details to achieve desired outcomes,\nnecessitating a balance between privacy protection and disclosure. To address\nthis challenge, we conduct a pilot study to investigate user preferences for AI\ndelegates across various social relations and task scenarios, and then propose\na novel AI delegate system that enables privacy-conscious self-disclosure. Our\nuser study demonstrates that the proposed AI delegate strategically protects\nprivacy, pioneering its use in diverse and dynamic social interactions.\n","authors":["Xi Chen","Zhiyang Zhang","Fangkai Yang","Xiaoting Qin","Chao Du","Xi Cheng","Hangxin Liu","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17640v1","updated":"2024-09-26T08:44:38Z","published":"2024-09-26T08:44:38Z","title":"T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training\n on an Assistant Task for a Target Task","summary":" Long text summarization, gradually being essential for efficiently processing\nlarge volumes of information, stays challenging for Large Language Models\n(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced\ntraining datasets and the high requirement of contextual details dealing. To\naddress the issue, we design a novel zero-shot transfer learning framework,\nabbreviated as T3, to iteratively training a baseline LLM on an assistant task\nfor the target task, where the former should own richer data resources and\nshare structural or semantic similarity with the latter. In practice, T3 is\napproached to deal with the long text summarization task by utilizing question\nanswering as the assistant task, and further validated its effectiveness on the\nBBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14%\nimprovement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore\ncompared to three baseline LLMs, demonstrating its potential for more\nassistant-target task combinations.\n","authors":["Xindi Tong","Yujin Zhu","Shijian Fan","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17634v1","updated":"2024-09-26T08:31:27Z","published":"2024-09-26T08:31:27Z","title":"P4Q: Learning to Prompt for Quantization in Visual-language Models","summary":" Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence\nin various visual and multimodal tasks, yet the deployment of VLMs on\ndownstream application platforms remains challenging due to their prohibitive\nrequirements of training samples and computing resources. Fine-tuning and\nquantization of VLMs can substantially reduce the sample and computation costs,\nwhich are in urgent need. There are two prevailing paradigms in quantization,\nQuantization-Aware Training (QAT) can effectively quantize large-scale VLMs but\nincur a huge training cost, while low-bit Post-Training Quantization (PTQ)\nsuffers from a notable performance drop. We propose a method that balances\nfine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which\nwe design a lightweight architecture to leverage contrastive loss supervision\nto enhance the recognition performance of a PTQ model. Our method can\neffectively reduce the gap between image features and text features caused by\nlow-bit quantization, based on learnable prompts to reorganize textual\nrepresentations and a low-bit adapter to realign the distributions of image and\ntext features. We also introduce a distillation loss based on cosine similarity\npredictions to distill the quantized model using a full-precision teacher.\nExtensive experimental results demonstrate that our P4Q method outperforms\nprior arts, even achieving comparable results to its full-precision\ncounterparts. For instance, our 8-bit P4Q can theoretically compress the\nCLIP-ViT/B-32 by 4 $\\times$ while achieving 66.94\\% Top-1 accuracy,\noutperforming the learnable prompt fine-tuned full-precision model by 2.24\\%\nwith negligible additional parameters on the ImageNet dataset.\n","authors":["Huixin Sun","Runqi Wang","Yanjing Li","Xianbin Cao","Xiaolong Jiang","Yao Hu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14545v2","updated":"2024-09-26T08:29:00Z","published":"2023-06-26T09:35:56Z","title":"Time and State Dependent Neural Delay Differential Equations","summary":" Discontinuities and delayed terms are encountered in the governing equations\nof a large class of problems ranging from physics and engineering to medicine\nand economics. These systems cannot be properly modelled and simulated with\nstandard Ordinary Differential Equations (ODE), or data-driven approximations\nsuch as Neural Ordinary Differential Equations (NODE). To circumvent this\nissue, latent variables are typically introduced to solve the dynamics of the\nsystem in a higher dimensional space and obtain the solution as a projection to\nthe original space. However, this solution lacks physical interpretability. In\ncontrast, Delay Differential Equations (DDEs), and their data-driven\napproximated counterparts, naturally appear as good candidates to characterize\nsuch systems. In this work we revisit the recently proposed Neural DDE by\nintroducing Neural State-Dependent DDE (SDDDE), a general and flexible\nframework that can model multiple and state- and time-dependent delays. We show\nthat our method is competitive and outperforms other continuous-class models on\na wide variety of delayed dynamical systems. Code is available at the\nrepository\n\\href{https://github.com/thibmonsel/Time-and-State-Dependent-Neural-Delay-Differential-Equations}{here}.\n","authors":["Thibault Monsel","Onofrio Semeraro","Lionel Mathelin","Guillaume Charpiat"],"pdf_url":"https://arxiv.org/pdf/2306.14545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17629v1","updated":"2024-09-26T08:23:04Z","published":"2024-09-26T08:23:04Z","title":"Hand-object reconstruction via interaction-aware graph attention\n mechanism","summary":" Estimating the poses of both a hand and an object has become an important\narea of research due to the growing need for advanced vision computing. The\nprimary challenge involves understanding and reconstructing how hands and\nobjects interact, such as contact and physical plausibility. Existing\napproaches often adopt a graph neural network to incorporate spatial\ninformation of hand and object meshes. However, these approaches have not fully\nexploited the potential of graphs without modification of edges within and\nbetween hand- and object-graphs. We propose a graph-based refinement method\nthat incorporates an interaction-aware graph-attention mechanism to account for\nhand-object interactions. Using edges, we establish connections among closely\ncorrelated nodes, both within individual graphs and across different graphs.\nExperiments demonstrate the effectiveness of our proposed method with notable\nimprovements in the realm of physical plausibility.\n","authors":["Taeyun Woo","Tae-Kyun Kim","Jinah Park"],"pdf_url":"https://arxiv.org/pdf/2409.17629v1.pdf","comment":"7 pages, Accepted by ICIP 2024"},{"id":"http://arxiv.org/abs/2409.17622v1","updated":"2024-09-26T08:16:59Z","published":"2024-09-26T08:16:59Z","title":"Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric\n GNNs","summary":" Geometric graph neural networks (GNNs) have emerged as powerful tools for\nmodeling molecular geometry. However, they encounter limitations in effectively\ncapturing long-range interactions in large molecular systems. To address this\nchallenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs\nto expand the scope of their capabilities by incorporating mesh points\nalongside atoms and reimaging traditional mathematical operations in a\ntrainable manner. Neural P$^3$M exhibits flexibility across a wide range of\nmolecular systems and demonstrates remarkable accuracy in predicting energies\nand forces, outperforming on benchmarks such as the MD22 dataset. It also\nachieves an average improvement of 22% on the OE62 dataset while integrating\nwith various architectures.\n","authors":["Yusong Wang","Chaoran Cheng","Shaoning Li","Yuxuan Ren","Bin Shao","Ge Liu","Pheng-Ann Heng","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.17622v1.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2311.09802v2","updated":"2024-09-26T08:15:50Z","published":"2023-11-16T11:26:21Z","title":"Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs","summary":" Two lines of approaches are adopted for complex reasoning with LLMs. One line\nof work prompts LLMs with various reasoning structures, while the structural\noutputs can be naturally regarded as intermediate reasoning steps. Another line\nof work adopt LLM-free declarative solvers to do the reasoning task, rendering\nhigher reasoning accuracy but lacking interpretability due to the black-box\nnature of the solvers. Aiming to resolve the trade-off between answer accuracy\nand interpretability, we present a simple extension to the latter line of work.\nSpecifically, we showcase that the intermediate search logs generated by Prolog\ninterpreters can be accessed and interpreted into human-readable reasoning\nproofs. As long as LLMs correctly translate problem descriptions into Prolog\nrepresentations, the corresponding reasoning proofs are ensured to be causal\nand reliable. On two logical reasoning and one arithmetic reasoning datasets,\nour framework obtains significant improvements in terms of both answer accuracy\nand reasoning proof accuracy. Our code is released at\nhttps://github.com/DAMO-NLP-SG/CaRing\n","authors":["Sen Yang","Xin Li","Leyang Cui","Lidong Bing","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2311.09802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14208v2","updated":"2024-09-26T08:12:59Z","published":"2024-06-20T11:26:06Z","title":"SeCoKD: Aligning Large Language Models for In-Context Learning with\n Fewer Shots","summary":" Previous studies have shown that demonstrations can significantly help Large\nLanguage Models (LLMs ) perform better on the given tasks. However, this\nso-called In-Context Learning ( ICL ) ability is very sensitive to the\npresenting context, and often dozens of demonstrations are needed. In this\nwork, we investigate if we can reduce the shot number while still maintaining a\ncompetitive performance. We present SeCoKD, a self-Knowledge Distillation ( KD\n) training framework that aligns the student model with a heavily prompted\nvariation, thereby increasing the utilization of a single demonstration. We\nexperiment with the SeCoKD across three LLMs and six benchmarks focusing mainly\non reasoning tasks. Results show that our method outperforms the base model and\nSupervised Fine-tuning ( SFT ), especially in zero-shot and one-shot settings\nby 30% and 10%, respectively. Moreover, SeCoKD brings little negative artifacts\nwhen evaluated on new tasks, which is more robust than Supervised Fine-tuning.\n","authors":["Weixing Wang","Haojin Yang","Christoph Meinel"],"pdf_url":"https://arxiv.org/pdf/2406.14208v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04428v2","updated":"2024-09-26T07:53:04Z","published":"2024-09-06T17:48:44Z","title":"Hybrid Spiking Neural Networks for Low-Power Intra-Cortical\n Brain-Machine Interfaces","summary":" Intra-cortical brain-machine interfaces (iBMIs) have the potential to\ndramatically improve the lives of people with paraplegia by restoring their\nability to perform daily activities. However, current iBMIs suffer from\nscalability and mobility limitations due to bulky hardware and wiring. Wireless\niBMIs offer a solution but are constrained by a limited data rate. To overcome\nthis challenge, we are investigating hybrid spiking neural networks for\nembedded neural decoding in wireless iBMIs. The networks consist of a temporal\nconvolution-based compression followed by recurrent processing and a final\ninterpolation back to the original sequence length. As recurrent units, we\nexplore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons,\nand a combination of both - spiking GRUs (sGRUs) and analyze their differences\nin terms of accuracy, footprint, and activation sparsity. To that end, we train\ndecoders on the \"Nonhuman Primate Reaching with Multichannel Sensorimotor\nCortex Electrophysiology\" dataset and evaluate it using the NeuroBench\nframework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural\nDecoding. Our approach achieves high accuracy in predicting velocities of\nprimate reaching movements from multichannel primary motor cortex recordings\nwhile maintaining a low number of synaptic operations, surpassing the current\nbaseline models in the NeuroBench framework. This work highlights the potential\nof hybrid neural networks to facilitate wireless iBMIs with high decoding\nprecision and a substantial increase in the number of monitored neurons, paving\nthe way toward more advanced neuroprosthetic technologies.\n","authors":["Alexandru Vasilache","Jann Krausse","Klaus Knobloch","Juergen Becker"],"pdf_url":"https://arxiv.org/pdf/2409.04428v2.pdf","comment":"This work has been accepted at the 2024 IEEE Biomedical Circuits and\n Systems Conference"},{"id":"http://arxiv.org/abs/2409.17607v1","updated":"2024-09-26T07:47:50Z","published":"2024-09-26T07:47:50Z","title":"Dirichlet-Based Coarse-to-Fine Example Selection For Open-Set Annotation","summary":" Active learning (AL) has achieved great success by selecting the most\nvaluable examples from unlabeled data. However, they usually deteriorate in\nreal scenarios where open-set noise gets involved, which is studied as open-set\nannotation (OSA). In this paper, we owe the deterioration to the unreliable\npredictions arising from softmax-based translation invariance and propose a\nDirichlet-based Coarse-to-Fine Example Selection (DCFS) strategy accordingly.\nOur method introduces simplex-based evidential deep learning (EDL) to break\ntranslation invariance and distinguish known and unknown classes by considering\nevidence-based data and distribution uncertainty simultaneously. Furthermore,\nhard known-class examples are identified by model discrepancy generated from\ntwo classifier heads, where we amplify and alleviate the model discrepancy\nrespectively for unknown and known classes. Finally, we combine the discrepancy\nwith uncertainties to form a two-stage strategy, selecting the most informative\nexamples from known classes. Extensive experiments on various openness ratio\ndatasets demonstrate that DCFS achieves state-of-art performance.\n","authors":["Ye-Wen Wang","Chen-Chen Zong","Ming-Kun Xie","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17602v1","updated":"2024-09-26T07:36:49Z","published":"2024-09-26T07:36:49Z","title":"Open Digital Rights Enforcement Framework (ODRE): from descriptive to\n enforceable policies","summary":" From centralised platforms to decentralised ecosystems, like Data Spaces,\nsharing data has become a paramount challenge. For this reason, the definition\nof data usage policies has become crucial in these domains, highlighting the\nnecessity of effective policy enforcement mechanisms. The Open Digital Rights\nLanguage (ODRL) is a W3C standard ontology designed to describe data usage\npolicies, however, it lacks built-in enforcement capabilities, limiting its\npractical application. This paper introduces the Open Digital Rights\nEnforcement (ODRE) framework, whose goal is to provide ODRL with enforcement\ncapabilities. The ODRE framework proposes a novel approach to express ODRL\npolicies that integrates the descriptive ontology terms of ODRL with other\nlanguages that allow behaviour specification, such as dynamic data handling or\nfunction evaluation. The framework includes an enforcement algorithm for ODRL\npolicies and two open-source implementations in Python and Java. The ODRE\nframework is also designed to support future extensions of ODRL to specific\ndomain scenarios. In addition, current limitations of ODRE, ODRL, and current\nchallenges are reported. Finally, to demonstrate the enforcement capabilities\nof the implementations, their performance, and their extensibility features,\nseveral experiments have been carried out with positive results.\n","authors":["Andrea Cimmino","Juan Cano-Benito","Raúl García-Castro"],"pdf_url":"https://arxiv.org/pdf/2409.17602v1.pdf","comment":"20 pages, 3 Figures, Submitted to Computers & Security journal"},{"id":"http://arxiv.org/abs/2409.17601v1","updated":"2024-09-26T07:35:23Z","published":"2024-09-26T07:35:23Z","title":"TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for\n Multimodal Contrastive Learning","summary":" Pre-trained large models for multimodal contrastive learning, such as CLIP,\nhave been widely recognized in the industry as highly susceptible to\ndata-poisoned backdoor attacks. This poses significant risks to downstream\nmodel training. In response to such potential threats, finetuning offers a\nsimpler and more efficient defense choice compared to retraining large models\nwith augmented data. In the supervised learning domain, fine-tuning defense\nstrategies can achieve excellent defense performance. However, in the\nunsupervised and semi-supervised domain, we find that when CLIP faces some\ncomplex attack techniques, the existing fine-tuning defense strategy,\nCleanCLIP, has some limitations on defense performance. The synonym\nsubstitution of its text-augmentation is insufficient to enhance the text\nfeature space. To compensate for this weakness, we improve it by proposing a\nfine-grained \\textbf{T}ext \\textbf{A}lignment \\textbf{C}leaner (TA-Cleaner) to\ncut off feature connections of backdoor triggers. We randomly select a few\nsamples for positive and negative subtext generation at each epoch of\nCleanCLIP, and align the subtexts to the images to strengthen the text\nself-supervision. We evaluate the effectiveness of our TA-Cleaner against six\nattack algorithms and conduct comprehensive zero-shot classification tests on\nImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves\nstate-of-the-art defensiveness among finetuning-based defense techniques. Even\nwhen faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms\nCleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\\% and 63.88\\%,\nrespectively.\n","authors":["Yuan Xun","Siyuan Liang","Xiaojun Jia","Xinwei Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17596v1","updated":"2024-09-26T07:22:38Z","published":"2024-09-26T07:22:38Z","title":"Subjective and Objective Quality-of-Experience Evaluation Study for Live\n Video Streaming","summary":" In recent years, live video streaming has gained widespread popularity across\nvarious social media platforms. Quality of experience (QoE), which reflects\nend-users' satisfaction and overall experience, plays a critical role for media\nservice providers to optimize large-scale live compression and transmission\nstrategies to achieve perceptually optimal rate-distortion trade-off. Although\nmany QoE metrics for video-on-demand (VoD) have been proposed, there remain\nsignificant challenges in developing QoE metrics for live video streaming. To\nbridge this gap, we conduct a comprehensive study of subjective and objective\nQoE evaluations for live video streaming. For the subjective QoE study, we\nintroduce the first live video streaming QoE dataset, TaoLive QoE, which\nconsists of $42$ source videos collected from real live broadcasts and $1,155$\ncorresponding distorted ones degraded due to a variety of streaming\ndistortions, including conventional streaming distortions such as compression,\nstalling, as well as live streaming-specific distortions like frame skipping,\nvariable frame rate, etc. Subsequently, a human study was conducted to derive\nsubjective QoE scores of videos in the TaoLive QoE dataset. For the objective\nQoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well\nas publicly available QoE datasets for VoD scenarios, highlighting that current\nmodels struggle to accurately assess video QoE, particularly for live content.\nHence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates\nmulti-scale semantic features and optical flow-based motion features to\npredicting a retrospective QoE score, eliminating reliance on statistical\nquality of service (QoS) features.\n","authors":["Zehao Zhu","Wei Sun","Jun Jia","Wei Wu","Sibin Deng","Kai Li","Ying Chen","Xiongkuo Min","Jia Wang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.17596v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17592v1","updated":"2024-09-26T07:19:12Z","published":"2024-09-26T07:19:12Z","title":"Deep Manifold Part 1: Anatomy of Neural Network Manifold","summary":" Based on the numerical manifold method principle, we developed a mathematical\nframework of a neural network manifold: Deep Manifold and discovered that\nneural networks: 1) is numerical computation combining forward and inverse; 2)\nhave near infinite degrees of freedom; 3) exponential learning capacity with\ndepth; 4) have self-progressing boundary conditions; 5) has training hidden\nbottleneck. We also define two concepts: neural network learning space and deep\nmanifold space and introduce two concepts: neural network intrinsic pathway and\nfixed point. We raise three fundamental questions: 1). What is the training\ncompletion definition; 2). where is the deep learning convergence point (neural\nnetwork fixed point); 3). How important is token timestamp in training data\ngiven negative time is critical in inverse problem.\n","authors":["Max Y. Ma","Gen-Hua Shi"],"pdf_url":"https://arxiv.org/pdf/2409.17592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17589v1","updated":"2024-09-26T07:12:04Z","published":"2024-09-26T07:12:04Z","title":"Improving Fast Adversarial Training via Self-Knowledge Guidance","summary":" Adversarial training has achieved remarkable advancements in defending\nagainst adversarial attacks. Among them, fast adversarial training (FAT) is\ngaining attention for its ability to achieve competitive robustness with fewer\ncomputing resources. Existing FAT methods typically employ a uniform strategy\nthat optimizes all training data equally without considering the influence of\ndifferent examples, which leads to an imbalanced optimization. However, this\nimbalance remains unexplored in the field of FAT. In this paper, we conduct a\ncomprehensive study of the imbalance issue in FAT and observe an obvious class\ndisparity regarding their performances. This disparity could be embodied from a\nperspective of alignment between clean and robust accuracy. Based on the\nanalysis, we mainly attribute the observed misalignment and disparity to the\nimbalanced optimization in FAT, which motivates us to optimize different\ntraining data adaptively to enhance robustness. Specifically, we take disparity\nand misalignment into consideration. First, we introduce self-knowledge guided\nregularization, which assigns differentiated regularization weights to each\nclass based on its training state, alleviating class disparity. Additionally,\nwe propose self-knowledge guided label relaxation, which adjusts label\nrelaxation according to the training accuracy, alleviating the misalignment and\nimproving robustness. By combining these methods, we formulate the\nSelf-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge\nduring training to enhance the adversarial robustness without compromising\ntraining efficiency. Extensive experiments on four standard datasets\ndemonstrate that the SKG-FAT improves the robustness and preserves competitive\nclean accuracy, outperforming the state-of-the-art methods.\n","authors":["Chengze Jiang","Junkai Wang","Minjing Dong","Jie Gui","Xinli Shi","Yuan Cao","Yuan Yan Tang","James Tin-Yau Kwok"],"pdf_url":"https://arxiv.org/pdf/2409.17589v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2409.17587v1","updated":"2024-09-26T07:07:08Z","published":"2024-09-26T07:07:08Z","title":"Multimodal Banking Dataset: Understanding Client Needs through Event\n Sequences","summary":" Financial organizations collect a huge amount of data about clients that\ntypically has a temporal (sequential) structure and is collected from various\nsources (modalities). Due to privacy issues, there are no large-scale\nopen-source multimodal datasets of event sequences, which significantly limits\nthe research in this area. In this paper, we present the industrial-scale\npublicly available multimodal banking dataset, MBD, that contains more than\n1.5M corporate clients with several modalities: 950M bank transactions, 1B geo\nposition events, 5M embeddings of dialogues with technical support and monthly\naggregated purchases of four bank's products. All entries are properly\nanonymized from real proprietary bank data. Using this dataset, we introduce a\nnovel benchmark with two business tasks: campaigning (purchase prediction in\nthe next month) and matching of clients. We provide numerical results that\ndemonstrate the superiority of our multi-modal baselines over single-modal\ntechniques for each task. As a result, the proposed dataset can open new\nperspectives and facilitate the future development of practically important\nlarge-scale multimodal algorithms for event sequences.\n HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD\n Github Link: https://github.com/Dzhambo/MBD\n","authors":["Mollaev Dzhambulat","Alexander Kostin","Postnova Maria","Ivan Karpukhin","Ivan A Kireev","Gleb Gusev","Andrey Savchenko"],"pdf_url":"https://arxiv.org/pdf/2409.17587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2406.10267v2","updated":"2024-09-26T06:57:27Z","published":"2024-06-11T09:24:18Z","title":"Unused information in token probability distribution of generative LLM:\n improving LLM reading comprehension through calculation of expected values","summary":" LLM text decoding is key component for perceived LLM quality. We demonstrate\ntwo experiments showing that decoding methods could be improved by manipulation\nof token probabilities. First, we test few LLM on SummEval summary scoring\ndataset, to measure reading comprehension. We compare scores from greedy\ndecoding to expected values over the next token distribution. We scale logits\nby large temperature to increase the entropy of scores. This allows strong\nimprovement of performance on SummEval (in terms of correlations to human\njudgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from\n20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part\nof the gain seems related to positional bias. Secondly, we use\nprobability-based tree sampling algorithm, to examine all most probable\ngenerations for given prompt.\n","authors":["Krystian Zawistowski"],"pdf_url":"https://arxiv.org/pdf/2406.10267v2.pdf","comment":"7 pages, 1 figure, presented at FEDCSIS 2024 conference,"},{"id":"http://arxiv.org/abs/2409.17581v1","updated":"2024-09-26T06:57:22Z","published":"2024-09-26T06:57:22Z","title":"A Scalable Data-Driven Framework for Systematic Analysis of SEC 10-K\n Filings Using Large Language Models","summary":" The number of companies listed on the NYSE has been growing exponentially,\ncreating a significant challenge for market analysts, traders, and stockholders\nwho must monitor and assess the performance and strategic shifts of a large\nnumber of companies regularly. There is an increasing need for a fast,\ncost-effective, and comprehensive method to evaluate the performance and detect\nand compare many companies' strategy changes efficiently. We propose a novel\ndata-driven approach that leverages large language models (LLMs) to\nsystematically analyze and rate the performance of companies based on their SEC\n10-K filings. These filings, which provide detailed annual reports on a\ncompany's financial performance and strategic direction, serve as a rich source\nof data for evaluating various aspects of corporate health, including\nconfidence, environmental sustainability, innovation, and workforce management.\nWe also introduce an automated system for extracting and preprocessing 10-K\nfilings. This system accurately identifies and segments the required sections\nas outlined by the SEC, while also isolating key textual content that contains\ncritical information about the company. This curated data is then fed into\nCohere's Command-R+ LLM to generate quantitative ratings across various\nperformance metrics. These ratings are subsequently processed and visualized to\nprovide actionable insights. The proposed scheme is then implemented on an\ninteractive GUI as a no-code solution for running the data pipeline and\ncreating the visualizations. The application showcases the rating results and\nprovides year-on-year comparisons of company performance.\n","authors":["Syed Affan Daimi","Asma Iqbal"],"pdf_url":"https://arxiv.org/pdf/2409.17581v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17580v1","updated":"2024-09-26T06:53:29Z","published":"2024-09-26T06:53:29Z","title":"Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case\n Study","summary":" Extracting meaningful insights from large and complex datasets poses\nsignificant challenges, particularly in ensuring the accuracy and relevance of\nretrieved information. Traditional data retrieval methods such as sequential\nsearch and index-based retrieval often fail when handling intricate and\ninterconnected data structures, resulting in incomplete or misleading outputs.\nTo overcome these limitations, we introduce Structured-GraphRAG, a versatile\nframework designed to enhance information retrieval across structured datasets\nin natural language queries. Structured-GraphRAG utilizes multiple knowledge\ngraphs, which represent data in a structured format and capture complex\nrelationships between entities, enabling a more nuanced and comprehensive\nretrieval of information. This graph-based approach reduces the risk of errors\nin language model outputs by grounding responses in a structured format,\nthereby enhancing the reliability of results. We demonstrate the effectiveness\nof Structured-GraphRAG by comparing its performance with that of a recently\npublished method using traditional retrieval-augmented generation. Our findings\nshow that Structured-GraphRAG significantly improves query processing\nefficiency and reduces response times. While our case study focuses on soccer\ndata, the framework's design is broadly applicable, offering a powerful tool\nfor data analysis and enhancing language model applications across various\nstructured domains.\n","authors":["Zahra Sepasdar","Sushant Gautam","Cise Midoglu","Michael A. Riegler","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2409.17580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17572v1","updated":"2024-09-26T06:40:45Z","published":"2024-09-26T06:40:45Z","title":"Dr. GPT in Campus Counseling: Understanding Higher Education Students'\n Opinions on LLM-assisted Mental Health Services","summary":" In response to the increasing mental health challenges faced by college\nstudents, we sought to understand their perspectives on how AI applications,\nparticularly Large Language Models (LLMs), can be leveraged to enhance their\nmental well-being. Through pilot interviews with ten diverse students, we\nexplored their opinions on the use of LLMs across five fictional scenarios:\nGeneral Information Inquiry, Initial Screening, Reshaping Patient-Expert\nDynamics, Long-term Care, and Follow-up Care. Our findings revealed that\nstudents' acceptance of LLMs varied by scenario, with participants highlighting\nboth potential benefits, such as proactive engagement and personalized\nfollow-up care, and concerns, including limitations in training data and\nemotional support. These insights inform how AI technology should be designed\nand implemented to effectively support and enhance students' mental well-being,\nparticularly in scenarios where LLMs can complement traditional methods, while\nmaintaining empathy and respecting individual preferences.\n","authors":["Owen Xingjian Zhang","Shuyao Zhou","Jiayi Geng","Yuhan Liu","Sunny Xun Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17572v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2409.17568v1","updated":"2024-09-26T06:31:31Z","published":"2024-09-26T06:31:31Z","title":"Showing Many Labels in Multi-label Classification Models: An Empirical\n Study of Adversarial Examples","summary":" With the rapid development of Deep Neural Networks (DNNs), they have been\napplied in numerous fields. However, research indicates that DNNs are\nsusceptible to adversarial examples, and this is equally true in the\nmulti-label domain. To further investigate multi-label adversarial examples, we\nintroduce a novel type of attacks, termed \"Showing Many Labels\". The objective\nof this attack is to maximize the number of labels included in the classifier's\nprediction results. In our experiments, we select nine attack algorithms and\nevaluate their performance under \"Showing Many Labels\". Eight of the attack\nalgorithms were adapted from the multi-class environment to the multi-label\nenvironment, while the remaining one was specifically designed for the\nmulti-label environment. We choose ML-LIW and ML-GCN as target models and train\nthem on four popular multi-label datasets: VOC2007, VOC2012, NUS-WIDE, and\nCOCO. We record the success rate of each algorithm when it shows the expected\nnumber of labels in eight different scenarios. Experimental results indicate\nthat under the \"Showing Many Labels\", iterative attacks perform significantly\nbetter than one-step attacks. Moreover, it is possible to show all labels in\nthe dataset.\n","authors":["Yujiang Liu","Wenjian Luo","Zhijian Chen","Muhammad Luqman Naseem"],"pdf_url":"https://arxiv.org/pdf/2409.17568v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.12598v2","updated":"2024-09-26T06:31:25Z","published":"2024-08-22T17:59:01Z","title":"ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor\n Reconstruction","summary":" Neural implicit reconstruction via volume rendering has demonstrated its\neffectiveness in recovering dense 3D surfaces. However, it is non-trivial to\nsimultaneously recover meticulous geometry and preserve smoothness across\nregions with differing characteristics. To address this issue, previous methods\ntypically employ geometric priors, which are often constrained by the\nperformance of the prior models. In this paper, we propose ND-SDF, which learns\na Normal Deflection field to represent the angular deviation between the scene\nnormal and the prior normal. Unlike previous methods that uniformly apply\ngeometric priors on all samples, introducing significant bias in accuracy, our\nproposed normal deflection field dynamically learns and adapts the utilization\nof samples based on their specific characteristics, thereby improving both the\naccuracy and effectiveness of the model. Our method not only obtains smooth\nweakly textured regions such as walls and floors but also preserves the\ngeometric details of complex structures. In addition, we introduce a novel ray\nsampling strategy based on the deflection angle to facilitate the unbiased\nrendering process, which significantly improves the quality and accuracy of\nintricate surfaces, especially on thin structures. Consistent improvements on\nvarious challenging datasets demonstrate the superiority of our method.\n","authors":["Ziyu Tang","Weicai Ye","Yifan Wang","Di Huang","Hujun Bao","Tong He","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16997v2","updated":"2024-09-26T06:13:04Z","published":"2024-09-25T15:02:25Z","title":"INT-FlashAttention: Enabling Flash Attention for INT8 Quantization","summary":" As the foundation of large language models (LLMs), self-attention module\nfaces the challenge of quadratic time and memory complexity with respect to\nsequence length. FlashAttention accelerates attention computation and reduces\nits memory usage by leveraging the GPU memory hierarchy. A promising research\ndirection is to integrate FlashAttention with quantization methods. This paper\nintroduces INT-FlashAttention, the first INT8 quantization architecture\ncompatible with the forward workflow of FlashAttention, which significantly\nimproves the inference speed of FlashAttention on Ampere GPUs. We implement our\nINT-FlashAttention prototype with fully INT8 activations and general\nmatrix-multiplication (GEMM) kernels, making it the first attention operator\nwith fully INT8 input. As a general token-level post-training quantization\nframework, INT-FlashAttention is also compatible with other data formats like\nINT4, etc. Experimental results show INT-FlashAttention achieves 72% faster\ninference speed and 82% smaller quantization error compared to standard\nFlashAttention with FP16 and FP8 data format.\n","authors":["Shimao Chen","Zirui Liu","Zhiying Wu","Ce Zheng","Peizhuang Cong","Zihan Jiang","Yuhan Wu","Lei Su","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2409.16997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02569v2","updated":"2024-09-26T05:57:37Z","published":"2024-04-03T08:42:36Z","title":"SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing","summary":" Cooking robots can enhance the home experience by reducing the burden of\ndaily chores. However, these robots must perform their tasks dexterously and\nsafely in shared human environments, especially when handling dangerous tools\nsuch as kitchen knives. This study focuses on enabling a robot to autonomously\nand safely learn food-cutting tasks. More specifically, our goal is to enable a\ncollaborative robot or industrial robot arm to perform food-slicing tasks by\nadapting to varying material properties using compliance control. Our approach\ninvolves using Reinforcement Learning (RL) to train a robot to compliantly\nmanipulate a knife, by reducing the contact forces exerted by the food items\nand by the cutting board. However, training the robot in the real world can be\ninefficient, and dangerous, and result in a lot of food waste. Therefore, we\nproposed SliceIt!, a framework for safely and efficiently learning robot\nfood-slicing tasks in simulation. Following a real2sim2real approach, our\nframework consists of collecting a few real food slicing data, calibrating our\ndual simulation environment (a high-fidelity cutting simulator and a robotic\nsimulator), learning compliant control policies on the calibrated simulation\nenvironment, and finally, deploying the policies on the real robot.\n","authors":["Cristian C. Beltran-Hernandez","Nicolas Erbetti","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2404.02569v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2406.14990v2","updated":"2024-09-26T05:51:20Z","published":"2024-06-21T09:03:37Z","title":"Learning Variable Compliance Control From a Few Demonstrations for\n Bimanual Robot with Haptic Feedback Teleoperation System","summary":" Automating dexterous, contact-rich manipulation tasks using rigid robots is a\nsignificant challenge in robotics. Rigid robots, defined by their actuation\nthrough position commands, face issues of excessive contact forces due to their\ninability to adapt to contact with the environment, potentially causing damage.\nWhile compliance control schemes have been introduced to mitigate these issues\nby controlling forces via external sensors, they are hampered by the need for\nfine-tuning task-specific controller parameters. Learning from Demonstrations\n(LfD) offers an intuitive alternative, allowing robots to learn manipulations\nthrough observed actions. In this work, we introduce a novel system to enhance\nthe teaching of dexterous, contact-rich manipulations to rigid robots. Our\nsystem is twofold: firstly, it incorporates a teleoperation interface utilizing\nVirtual Reality (VR) controllers, designed to provide an intuitive and\ncost-effective method for task demonstration with haptic feedback. Secondly, we\npresent Comp-ACT (Compliance Control via Action Chunking with Transformers), a\nmethod that leverages the demonstrations to learn variable compliance control\nfrom a few demonstrations. Our methods have been validated across various\ncomplex contact-rich manipulation tasks using single-arm and bimanual robot\nsetups in simulated and real-world environments, demonstrating the\neffectiveness of our system in teaching robots dexterous manipulations with\nenhanced adaptability and safety. Code available at:\nhttps://github.com/omron-sinicx/CompACT\n","authors":["Tatsuya Kamijo","Cristian C. Beltran-Hernandez","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2406.14990v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2406.06911v3","updated":"2024-09-26T05:47:36Z","published":"2024-06-11T03:09:37Z","title":"AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising","summary":" Diffusion models have garnered significant interest from the community for\ntheir great generative ability across various applications. However, their\ntypical multi-step sequential-denoising nature gives rise to high cumulative\nlatency, thereby precluding the possibilities of parallel computation. To\naddress this, we introduce AsyncDiff, a universal and plug-and-play\nacceleration scheme that enables model parallelism across multiple devices. Our\napproach divides the cumbersome noise prediction model into multiple\ncomponents, assigning each to a different device. To break the dependency chain\nbetween these components, it transforms the conventional sequential denoising\ninto an asynchronous process by exploiting the high similarity between hidden\nstates in consecutive diffusion steps. Consequently, each component is\nfacilitated to compute in parallel on separate devices. The proposed strategy\nsignificantly reduces inference latency while minimally impacting the\ngenerative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff\nachieves a 2.7x speedup with negligible degradation and a 4.0x speedup with\nonly a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our\nexperiments also demonstrate that AsyncDiff can be readily applied to video\ndiffusion models with encouraging performances. The code is available at\nhttps://github.com/czg1225/AsyncDiff.\n","authors":["Zigeng Chen","Xinyin Ma","Gongfan Fang","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06911v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.15763v2","updated":"2024-09-26T05:43:08Z","published":"2024-09-24T05:39:53Z","title":"IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through\n Semantic Comprehension in Retrieval-Augmented Generation Scenarios","summary":" In Retrieval-Augmented Generation (RAG) tasks using Large Language Models\n(LLMs), the quality of retrieved information is critical to the final output.\nThis paper introduces the IRSC benchmark for evaluating the performance of\nembedding models in multilingual RAG tasks. The benchmark encompasses five\nretrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval,\nkeyword retrieval, and summary retrieval. Our research addresses the current\nlack of comprehensive testing and effective comparison methods for embedding\nmodels in RAG scenarios. We introduced new metrics: the Similarity of Semantic\nComprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI),\nand evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our\ncontributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and\n3) insights into the cross-lingual limitations of embedding models. The IRSC\nbenchmark aims to enhance the understanding and development of accurate\nretrieval systems in RAG tasks. All code and datasets are available at:\nhttps://github.com/Jasaxion/IRSC_Benchmark\n","authors":["Hai Lin","Shaoxiong Zhan","Junyou Su","Haitao Zheng","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17547v1","updated":"2024-09-26T05:33:30Z","published":"2024-09-26T05:33:30Z","title":"Triple Point Masking","summary":" Existing 3D mask learning methods encounter performance bottlenecks under\nlimited data, and our objective is to overcome this limitation. In this paper,\nwe introduce a triple point masking scheme, named TPM, which serves as a\nscalable framework for pre-training of masked autoencoders to achieve\nmulti-mask learning for 3D point clouds. Specifically, we augment the baselines\nwith two additional mask choices (i.e., medium mask and low mask) as our core\ninsight is that the recovery process of an object can manifest in diverse ways.\nPrevious high-masking schemes focus on capturing the global representation but\nlack the fine-grained recovery capability, so that the generated pre-trained\nweights tend to play a limited role in the fine-tuning process. With the\nsupport of the proposed TPM, available methods can exhibit more flexible and\naccurate completion capabilities, enabling the potential autoencoder in the\npre-training stage to consider multiple representations of a single 3D object.\nIn addition, an SVM-guided weight selection module is proposed to fill the\nencoder parameters for downstream networks with the optimal weight during the\nfine-tuning stage, maximizing linear accuracy and facilitating the acquisition\nof intricate representations for new objects. Extensive experiments show that\nthe four baselines equipped with the proposed TPM achieve comprehensive\nperformance improvements on various downstream tasks.\n","authors":["Jiaming Liu","Linghe Kong","Yue Wu","Maoguo Gong","Hao Li","Qiguang Miao","Wenping Ma","Can Qin"],"pdf_url":"https://arxiv.org/pdf/2409.17547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2402.11645v2","updated":"2024-09-26T05:15:00Z","published":"2024-02-18T16:55:54Z","title":"Image Denoising with Machine Learning: A Novel Approach to Improve\n Quantum Image Processing Quality and Reliability","summary":" Quantum Image Processing (QIP) is a field that aims to utilize the benefits\nof quantum computing for manipulating and analyzing images. However, QIP faces\ntwo challenges: the limitation of qubits and the presence of noise in a quantum\nmachine. In this research, we propose a novel approach to address the issue of\nnoise in QIP. By training and employing a machine learning model that\nidentifies and corrects the noise in quantum-processed images, we can\ncompensate for the noisiness caused by the machine and retrieve a processing\nresult similar to that performed by a classical computer with higher\nefficiency. The model is trained by learning a dataset consisting of both\nexisting processed images and quantum-processed images from open-access\ndatasets. This model will be capable of providing us with the confidence level\nfor each pixel and its potential original value. To assess the model's accuracy\nin compensating for loss and decoherence in QIP, we evaluate it using three\nmetrics: Peak Signal to Noise Ratio (PSNR), Structural Similarity Index (SSIM),\nand Mean Opinion Score (MOS). Additionally, we discuss the applicability of our\nmodel across domains well as its cost effectiveness compared to alternative\nmethods.\n","authors":["Yifan Zhou","Yan Shing Liang"],"pdf_url":"https://arxiv.org/pdf/2402.11645v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17534v1","updated":"2024-09-26T04:41:08Z","published":"2024-09-26T04:41:08Z","title":"Just say what you want: only-prompting self-rewarding online preference\n optimization","summary":" We address the challenge of online Reinforcement Learning from Human Feedback\n(RLHF) with a focus on self-rewarding alignment methods. In online RLHF,\nobtaining feedback requires interaction with the environment, which can be\ncostly when using additional reward models or the GPT-4 API. Current\nself-rewarding approaches rely heavily on the discriminator's judgment\ncapabilities, which are effective for large-scale models but challenging to\ntransfer to smaller ones. To address these limitations, we propose a novel,\nonly-prompting self-rewarding online algorithm that generates preference\ndatasets without relying on judgment capabilities. Additionally, we employ\nfine-grained arithmetic control over the optimality gap between positive and\nnegative examples, generating more hard negatives in the later stages of\ntraining to help the model better capture subtle human preferences. Finally, we\nconduct extensive experiments on two base models, Mistral-7B and\nMistral-Instruct-7B, which significantly bootstrap the performance of the\nreference model, achieving 34.5% in the Length-controlled Win Rates of\nAlpacaEval 2.0.\n","authors":["Ruijie Xu","Zhihan Liu","Yongfei Liu","Shipeng Yan","Zhaoran Wang","Zhi Zhang","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2409.17534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17531v1","updated":"2024-09-26T04:36:19Z","published":"2024-09-26T04:36:19Z","title":"SimVG: A Simple Framework for Visual Grounding with Decoupled\n Multi-modal Fusion","summary":" Visual grounding is a common vision task that involves grounding descriptive\nsentences to the corresponding regions of an image. Most existing methods use\nindependent image-text encoding and apply complex hand-crafted modules or\nencoder-decoder architectures for modal interaction and query reasoning.\nHowever, their performance significantly drops when dealing with complex\ntextual expressions. This is because the former paradigm only utilizes limited\ndownstream data to fit the multi-modal feature fusion. Therefore, it is only\neffective when the textual expressions are relatively simple. In contrast,\ngiven the wide diversity of textual expressions and the uniqueness of\ndownstream training data, the existing fusion module, which extracts multimodal\ncontent from a visual-linguistic context, has not been fully investigated. In\nthis paper, we present a simple yet robust transformer-based framework, SimVG,\nfor visual grounding. Specifically, we decouple visual-linguistic feature\nfusion from downstream tasks by leveraging existing multimodal pre-trained\nmodels and incorporating additional object tokens to facilitate deep\nintegration of downstream and pre-training tasks. Furthermore, we design a\ndynamic weight-balance distillation method in the multi-branch synchronous\nlearning process to enhance the representation capability of the simpler\nbranch. This branch only consists of a lightweight MLP, which simplifies the\nstructure and improves reasoning speed. Experiments on six widely used VG\ndatasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the\nsuperiority of SimVG. Finally, the proposed method not only achieves\nimprovements in efficiency and convergence speed but also attains new\nstate-of-the-art performance on these benchmarks. Codes and models will be\navailable at \\url{https://github.com/Dmmm1997/SimVG}.\n","authors":["Ming Dai","Lingfeng Yang","Yihao Xu","Zhenhua Feng","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17531v1.pdf","comment":"21pages, 11figures, NeurIPS2024"},{"id":"http://arxiv.org/abs/2310.04696v3","updated":"2024-09-26T04:33:35Z","published":"2023-10-07T06:01:35Z","title":"Serving Deep Learning Model in Relational Databases","summary":" Serving deep learning (DL) models on relational data has become a critical\nrequirement across diverse commercial and scientific domains, sparking growing\ninterest recently. In this visionary paper, we embark on a comprehensive\nexploration of representative architectures to address the requirement. We\nhighlight three pivotal paradigms: The state-of-the-art DL-centric architecture\noffloads DL computations to dedicated DL frameworks. The potential UDF-centric\narchitecture encapsulates one or more tensor computations into User Defined\nFunctions (UDFs) within the relational database management system (RDBMS). The\npotential relation-centric architecture aims to represent a large-scale tensor\ncomputation through relational operators. While each of these architectures\ndemonstrates promise in specific use scenarios, we identify urgent requirements\nfor seamless integration of these architectures and the middle ground\nin-between these architectures. We delve into the gaps that impede the\nintegration and explore innovative strategies to close them. We present a\npathway to establish a novel RDBMS for enabling a broad class of data-intensive\nDL inference applications.\n","authors":["Lixi Zhou","Qi Lin","Kanchan Chowdhury","Saif Masood","Alexandre Eichenberger","Hong Min","Alexander Sim","Jie Wang","Yida Wang","Kesheng Wu","Binhang Yuan","Jia Zou"],"pdf_url":"https://arxiv.org/pdf/2310.04696v3.pdf","comment":"* Authors are ordered alphabetically; Jia Zou is the corresponding\n author"},{"id":"http://arxiv.org/abs/2409.17526v1","updated":"2024-09-26T04:27:44Z","published":"2024-09-26T04:27:44Z","title":"Drone Stereo Vision for Radiata Pine Branch Detection and Distance\n Measurement: Integrating SGBM and Segmentation Models","summary":" Manual pruning of radiata pine trees presents significant safety risks due to\ntheir substantial height and the challenging terrains in which they thrive. To\naddress these risks, this research proposes the development of a drone-based\npruning system equipped with specialized pruning tools and a stereo vision\ncamera, enabling precise detection and trimming of branches. Deep learning\nalgorithms, including YOLO and Mask R-CNN, are employed to ensure accurate\nbranch detection, while the Semi-Global Matching algorithm is integrated to\nprovide reliable distance estimation. The synergy between these techniques\nfacilitates the precise identification of branch locations and enables\nefficient, targeted pruning. Experimental results demonstrate that the combined\nimplementation of YOLO and SGBM enables the drone to accurately detect branches\nand measure their distances from the drone. This research not only improves the\nsafety and efficiency of pruning operations but also makes a significant\ncontribution to the advancement of drone technology in the automation of\nagricultural and forestry practices, laying a foundational framework for\nfurther innovations in environmental management.\n","authors":["Yida Lin","Bing Xue","Mengjie Zhang","Sam Schofield","Richard Green"],"pdf_url":"https://arxiv.org/pdf/2409.17526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08044v2","updated":"2024-09-26T23:47:03Z","published":"2024-07-10T20:52:18Z","title":"RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective\n Weight-Activation Quantization","summary":" Low-Rank Adaptation (LoRA), as a representative Parameter-Efficient\nFine-Tuning (PEFT)method, significantly enhances the training efficiency by\nupdating only a small portion of the weights in Large Language Models (LLMs).\nRecently, weight-only quantization techniques have also been applied to LoRA\nmethods to reduce the memory footprint of fine-tuning. However, applying\nweight-activation quantization to the LoRA pipeline is under-explored, and we\nobserve substantial performance degradation primarily due to the presence of\nactivation outliers. In this work, we propose RoLoRA, the first LoRA-based\nscheme for effective weight-activation quantization. RoLoRA utilizes rotation\nfor outlier elimination and proposes rotation-aware fine-tuning to preserve the\noutlier-free characteristics in rotated LLMs. Experimental results show RoLoRA\nconsistently improves low-bit LoRA convergence and post-training quantization\nrobustness in weight-activation settings. We evaluate RoLoRA across\nLLaMA2-7B/13B, LLaMA3-8B models, achieving up to 29.5% absolute accuracy gain\nof 4-bit weight-activation quantized LLaMA2- 13B on commonsense reasoning tasks\ncompared to LoRA baseline. We further demonstrate its effectiveness on Large\nMultimodal Models (LLaVA-1.5-7B). Codes are available at\nhttps://github.com/HuangOwen/RoLoRA\n","authors":["Xijie Huang","Zechun Liu","Shih-Yang Liu","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.08044v2.pdf","comment":"EMNLP 2024 Findings, Codes: https://github.com/HuangOwen/RoLoRA,\n Models:\n https://huggingface.co/collections/ScarletAce/rolora-66f5f228a90681c7c4512b28"},{"id":"http://arxiv.org/abs/2409.18345v1","updated":"2024-09-26T23:46:15Z","published":"2024-09-26T23:46:15Z","title":"A Generalized LLM-Augmented BIM Framework: Application to a\n Speech-to-BIM system","summary":" Performing building information modeling (BIM) tasks is a complex process\nthat imposes a steep learning curve and a heavy cognitive load due to the\nnecessity of remembering sequences of numerous commands. With the rapid\nadvancement of large language models (LLMs), it is foreseeable that BIM tasks,\nincluding querying and managing BIM data, 4D and 5D BIM, design compliance\nchecking, or authoring a design, using written or spoken natural language\n(i.e., text-to-BIM or speech-to-BIM), will soon supplant traditional graphical\nuser interfaces. This paper proposes a generalized LLM-augmented BIM framework\nto expedite the development of LLM-enhanced BIM applications by providing a\nstep-by-step development process. The proposed framework consists of six steps:\ninterpret-fill-match-structure-execute-check. The paper demonstrates the\napplicability of the proposed framework through implementing a speech-to-BIM\napplication, NADIA-S (Natural-language-based Architectural Detailing through\nInteraction with Artificial Intelligence via Speech), using exterior wall\ndetailing as an example.\n","authors":["Ghang Lee","Suhyung Jang","Seokho Hyun"],"pdf_url":"https://arxiv.org/pdf/2409.18345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18343v1","updated":"2024-09-26T23:40:33Z","published":"2024-09-26T23:40:33Z","title":"Improving Agent Behaviors with RL Fine-tuning for Autonomous Driving","summary":" A major challenge in autonomous vehicle research is modeling agent behaviors,\nwhich has critical applications including constructing realistic and reliable\nsimulations for off-board evaluation and forecasting traffic agents motion for\nonboard planning. While supervised learning has shown success in modeling\nagents across various domains, these models can suffer from distribution shift\nwhen deployed at test-time. In this work, we improve the reliability of agent\nbehaviors by closed-loop fine-tuning of behavior models with reinforcement\nlearning. Our method demonstrates improved overall performance, as well as\nimproved targeted metrics such as collision rate, on the Waymo Open Sim Agents\nchallenge. Additionally, we present a novel policy evaluation benchmark to\ndirectly assess the ability of simulated agents to measure the quality of\nautonomous vehicle planners and demonstrate the effectiveness of our approach\non this new benchmark.\n","authors":["Zhenghao Peng","Wenjie Luo","Yiren Lu","Tianyi Shen","Cole Gulino","Ari Seff","Justin Fu"],"pdf_url":"https://arxiv.org/pdf/2409.18343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18340v1","updated":"2024-09-26T23:30:40Z","published":"2024-09-26T23:30:40Z","title":"DRL-STNet: Unsupervised Domain Adaptation for Cross-modality Medical\n Image Segmentation via Disentangled Representation Learning","summary":" Unsupervised domain adaptation (UDA) is essential for medical image\nsegmentation, especially in cross-modality data scenarios. UDA aims to transfer\nknowledge from a labeled source domain to an unlabeled target domain, thereby\nreducing the dependency on extensive manual annotations. This paper presents\nDRL-STNet, a novel framework for cross-modality medical image segmentation that\nleverages generative adversarial networks (GANs), disentangled representation\nlearning (DRL), and self-training (ST). Our method leverages DRL within a GAN\nto translate images from the source to the target modality. Then, the\nsegmentation model is initially trained with these translated images and\ncorresponding source labels and then fine-tuned iteratively using a combination\nof synthetic and real images with pseudo-labels and real labels. The proposed\nframework exhibits superior performance in abdominal organ segmentation on the\nFLARE challenge dataset, surpassing state-of-the-art methods by 11.4% in the\nDice similarity coefficient and by 13.1% in the Normalized Surface Dice metric,\nachieving scores of 74.21% and 80.69%, respectively. The average running time\nis 41 seconds, and the area under the GPU memory-time curve is 11,292 MB. These\nresults indicate the potential of DRL-STNet for enhancing cross-modality\nmedical image segmentation tasks.\n","authors":["Hui Lin","Florian Schiffers","Santiago López-Tapia","Neda Tavakoli","Daniel Kim","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2409.18340v1.pdf","comment":"MICCAI 2024 Challenge, FLARE Challenge, Unsupervised domain\n adaptation, Organ segmentation, Feature disentanglement, Self-training"},{"id":"http://arxiv.org/abs/2409.18339v1","updated":"2024-09-26T23:25:21Z","published":"2024-09-26T23:25:21Z","title":"AER-LLM: Ambiguity-aware Emotion Recognition Leveraging Large Language\n Models","summary":" Recent advancements in Large Language Models (LLMs) have demonstrated great\nsuccess in many Natural Language Processing (NLP) tasks. In addition to their\ncognitive intelligence, exploring their capabilities in emotional intelligence\nis also crucial, as it enables more natural and empathetic conversational AI.\nRecent studies have shown LLMs' capability in recognizing emotions, but they\noften focus on single emotion labels and overlook the complex and ambiguous\nnature of human emotions. This study is the first to address this gap by\nexploring the potential of LLMs in recognizing ambiguous emotions, leveraging\ntheir strong generalization capabilities and in-context learning. We design\nzero-shot and few-shot prompting and incorporate past dialogue as context\ninformation for ambiguous emotion recognition. Experiments conducted using\nthree datasets indicate significant potential for LLMs in recognizing ambiguous\nemotions, and highlight the substantial benefits of including context\ninformation. Furthermore, our findings indicate that LLMs demonstrate a high\ndegree of effectiveness in recognizing less ambiguous emotions and exhibit\npotential for identifying more ambiguous emotions, paralleling human perceptual\ncapabilities.\n","authors":["Xin Hong","Yuan Gong","Vidhyasaharan Sethu","Ting Dang"],"pdf_url":"https://arxiv.org/pdf/2409.18339v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18335v1","updated":"2024-09-26T23:16:47Z","published":"2024-09-26T23:16:47Z","title":"A Fairness-Driven Method for Learning Human-Compatible Negotiation\n Strategies","summary":" Despite recent advancements in AI and NLP, negotiation remains a difficult\ndomain for AI agents. Traditional game theoretic approaches that have worked\nwell for two-player zero-sum games struggle in the context of negotiation due\nto their inability to learn human-compatible strategies. On the other hand,\napproaches that only use human data tend to be domain-specific and lack the\ntheoretical guarantees provided by strategies grounded in game theory.\nMotivated by the notion of fairness as a criterion for optimality in general\nsum games, we propose a negotiation framework called FDHC which incorporates\nfairness into both the reward design and search to learn human-compatible\nnegotiation strategies. Our method includes a novel, RL+search technique called\nLGM-Zero which leverages a pre-trained language model to retrieve\nhuman-compatible offers from large action spaces. Our results show that our\nmethod is able to achieve more egalitarian negotiation outcomes and improve\nnegotiation quality.\n","authors":["Ryan Shea","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18335v1.pdf","comment":"EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2312.00094v3","updated":"2024-09-26T23:14:27Z","published":"2023-11-30T13:07:19Z","title":"Fast ODE-based Sampling for Diffusion Models in Around 5 Steps","summary":" Sampling from diffusion models can be treated as solving the corresponding\nordinary differential equations (ODEs), with the aim of obtaining an accurate\nsolution with as few number of function evaluations (NFE) as possible.\nRecently, various fast samplers utilizing higher-order ODE solvers have emerged\nand achieved better performance than the initial first-order one. However,\nthese numerical methods inherently result in certain approximation errors,\nwhich significantly degrades sample quality with extremely small NFE (e.g.,\naround 5). In contrast, based on the geometric observation that each sampling\ntrajectory almost lies in a two-dimensional subspace embedded in the ambient\nspace, we propose Approximate MEan-Direction Solver (AMED-Solver) that\neliminates truncation errors by directly learning the mean direction for fast\ndiffusion sampling. Besides, our method can be easily used as a plugin to\nfurther improve existing ODE-based samplers. Extensive experiments on image\nsynthesis with the resolution ranging from 32 to 512 demonstrate the\neffectiveness of our method. With only 5 NFE, we achieve 6.61 FID on CIFAR-10,\n10.74 FID on ImageNet 64$\\times$64, and 13.20 FID on LSUN Bedroom. Our code is\navailable at https://github.com/zju-pi/diff-sampler.\n","authors":["Zhenyu Zhou","Defang Chen","Can Wang","Chun Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00094v3.pdf","comment":"Accepted by CVPR 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2406.05375v2","updated":"2024-09-26T22:42:49Z","published":"2024-06-08T07:00:31Z","title":"LEMMA-RCA: A Large Multi-modal Multi-domain Dataset for Root Cause\n Analysis","summary":" Root cause analysis (RCA) is crucial for enhancing the reliability and\nperformance of complex systems. However, progress in this field has been\nhindered by the lack of large-scale, open-source datasets tailored for RCA. To\nbridge this gap, we introduce LEMMA-RCA, a large dataset designed for diverse\nRCA tasks across multiple domains and modalities. LEMMA-RCA features various\nreal-world fault scenarios from IT and OT operation systems, encompassing\nmicroservices, water distribution, and water treatment systems, with hundreds\nof system entities involved. We evaluate the quality of LEMMA-RCA by testing\nthe performance of eight baseline methods on this dataset under various\nsettings, including offline and online modes as well as single and multiple\nmodalities. Our experimental results demonstrate the high quality of LEMMA-RCA.\nThe dataset is publicly available at https://lemma-rca.github.io/.\n","authors":["Lecheng Zheng","Zhengzhang Chen","Dongjie Wang","Chengyuan Deng","Reon Matsuoka","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.05375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05892v4","updated":"2024-09-26T22:39:08Z","published":"2024-04-08T22:20:59Z","title":"Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence","summary":" We present Eagle (RWKV-5) and Finch (RWKV-6), sequence models improving upon\nthe RWKV (RWKV-4) architecture. Our architectural design advancements include\nmulti-headed matrix-valued states and a dynamic recurrence mechanism that\nimprove expressivity while maintaining the inference efficiency characteristics\nof RNNs. We introduce a new multilingual corpus with 1.12 trillion tokens and a\nfast tokenizer based on greedy matching for enhanced multilinguality. We\ntrained four Eagle models, ranging from 0.46 to 7.5 billion parameters, and two\nFinch models with 1.6 and 3.1 billion parameters and find that they achieve\ncompetitive performance across a wide variety of benchmarks. We release all our\nmodels on HuggingFace under the Apache 2.0 license. Models at:\nhttps://huggingface.co/RWKV Training code at: https://github.com/RWKV/RWKV-LM\nInference code at: https://github.com/RWKV/ChatRWKV Time-parallel training code\nat: https://github.com/RWKV/RWKV-infctx-trainer\n","authors":["Bo Peng","Daniel Goldstein","Quentin Anthony","Alon Albalak","Eric Alcaide","Stella Biderman","Eugene Cheah","Xingjian Du","Teddy Ferdinan","Haowen Hou","Przemysław Kazienko","Kranthi Kiran GV","Jan Kocoń","Bartłomiej Koptyra","Satyapriya Krishna","Ronald McClelland Jr.","Jiaju Lin","Niklas Muennighoff","Fares Obeid","Atsushi Saito","Guangyu Song","Haoqin Tu","Cahya Wirawan","Stanisław Woźniak","Ruichong Zhang","Bingchen Zhao","Qihang Zhao","Peng Zhou","Jian Zhu","Rui-Jie Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.05892v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18324v1","updated":"2024-09-26T22:31:09Z","published":"2024-09-26T22:31:09Z","title":"Input-Dependent Power Usage in GPUs","summary":" GPUs are known to be power-hungry, and due to the boom in artificial\nintelligence, they are currently the major contributors to the high power\ndemands of upcoming datacenters. Most GPU usage in these popular workloads\nconsist of large general matrix-matrix multiplications (GEMMs), which have\ntherefore been optimized to achieve high utilization of hardware resources. In\nthis work, we show that modifying the input data to GEMMs, while maintaining\nthe matrix shapes and sizes can notably change the power consumption of these\nkernels. We experiment with four kinds of input variations: value distribution,\nbit similarity, placement, and sparsity, across different data types. Our\nfindings indicate that these variations can change the GPU power usage during\nGEMM by almost 40%. We hypothesize that input-dependent power usage variations\noccur due to changes in the number of bit flips in the GPUs. We propose\nleveraging this property through compiler and scheduler optimizations to manage\npower and reduce energy consumption.\n","authors":["Theo Gregersen","Pratyush Patel","Esha Choukse"],"pdf_url":"https://arxiv.org/pdf/2409.18324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13994v2","updated":"2024-09-26T22:24:13Z","published":"2024-09-21T03:09:10Z","title":"Contrastive Learning for Knowledge-Based Question Generation in Large\n Language Models","summary":" With the rapid development of artificial intelligence technology, especially\nthe increasingly widespread application of question-and-answer systems,\nhigh-quality question generation has become a key component in supporting the\ndevelopment of these systems. This article focuses on knowledge-based question\ngeneration technology, which aims to enable computers to simulate the human\nquestioning process based on understanding specific texts or knowledge bases.\nIn light of the issues of hallucination and knowledge gaps present in\nlarge-scale language models when applied to knowledge-intensive tasks, this\npaper proposes an enhanced question generation method that incorporates\ncontrastive learning. This method utilizes multiple models to jointly mine\ndomain knowledge and uses contrastive learning to guide the model in reducing\nnoise and hallucinations in generation. Experimental results show that by\ndesigning prompts containing contrasting examples, the model's performance in\nquestion generation improves considerably, particularly when contrasting\ninstructions and examples are used simultaneously, leading to the highest\nquality of generated questions and improved accuracy. These results demonstrate\nthat the method proposed in this study, which combines contrasting context and\nchain-of-thought prompts, can effectively improve both the quality and the\npracticality of question generation.\n","authors":["Zhenhong Zhang","Jiajing Chen","Weiyan Shi","Lingjie Yi","Chihang Wang","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2409.13994v2.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.18319v1","updated":"2024-09-26T21:59:11Z","published":"2024-09-26T21:59:11Z","title":"Cross-Institutional Structured Radiology Reporting for Lung Cancer\n Screening Using a Dynamic Template-Constrained Large Language Model","summary":" Structured radiology reporting is advantageous for optimizing clinical\nworkflows and patient outcomes. Current LLMs in creating structured reports\nface the challenges of formatting errors, content hallucinations, and privacy\nleakage concerns when uploaded to external servers. We aim to develop an\nenhanced open-source LLM for creating structured and standardized LCS reports\nfrom free-text descriptions. After institutional IRB approvals, 5,442\nde-identified LCS reports from two institutions were retrospectively analyzed.\n500 reports were randomly selected from the two institutions evenly and then\nmanually labeled for evaluation. Two radiologists from the two institutions\ndeveloped a standardized template including 29 features for lung nodule\nreporting. We proposed template-constrained decoding to enhance\nstate-of-the-art open-source LLMs, including LLAMA, Qwen, and Mistral. The LLM\nperformance was extensively evaluated in terms of F1 score, confidence\ninterval, McNemar test, and z-test. Based on the structured reports created\nfrom the large-scale dataset, a nodule-level retrieval system was prototyped\nand an automatic statistical analysis was performed. Our software,\nvLLM-structure, is publicly available for local deployment with enhanced LLMs.\nOur template-constrained decoding approach consistently enhanced the LLM\nperformance on multi-institutional datasets, with neither formatting errors nor\ncontent hallucinations. Our method improved the best open-source LLAMA-3.1 405B\nby up to 10.42%, and outperformed GPT-4o by 17.19%. A novel nodule retrieval\nsystem was successfully prototyped and demonstrated on a large-scale multimodal\ndatabase using our enhanced LLM technologies. The automatically derived\nstatistical distributions were closely consistent with the prior findings in\nterms of nodule type, location, size, status, and Lung-RADS.\n","authors":["Chuang Niu","Parisa Kaviani","Qing Lyu","Mannudeep K. Kalra","Christopher T. Whitlow","Ge Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15997v2","updated":"2024-09-26T21:56:01Z","published":"2024-09-24T11:57:12Z","title":"Improvements to SDXL in NovelAI Diffusion V3","summary":" In this technical report, we document the changes we made to SDXL in the\nprocess of training NovelAI Diffusion V3, our state of the art anime image\ngeneration model.\n","authors":["Juan Ossa","Eren Doğan","Alex Birch","F. Johnson"],"pdf_url":"https://arxiv.org/pdf/2409.15997v2.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.18313v1","updated":"2024-09-26T21:44:11Z","published":"2024-09-26T21:44:11Z","title":"Embodied-RAG: General non-parametric Embodied Memory for Retrieval and\n Generation","summary":" There is no limit to how much a robot might explore and learn, but all of\nthat knowledge needs to be searchable and actionable. Within language research,\nretrieval augmented generation (RAG) has become the workhouse of large-scale\nnon-parametric knowledge, however existing techniques do not directly transfer\nto the embodied domain, which is multimodal, data is highly correlated, and\nperception requires abstraction.\n To address these challenges, we introduce Embodied-RAG, a framework that\nenhances the foundational model of an embodied agent with a non-parametric\nmemory system capable of autonomously constructing hierarchical knowledge for\nboth navigation and language generation. Embodied-RAG handles a full range of\nspatial and semantic resolutions across diverse environments and query types,\nwhether for a specific object or a holistic description of ambiance. At its\ncore, Embodied-RAG's memory is structured as a semantic forest, storing\nlanguage descriptions at varying levels of detail. This hierarchical\norganization allows the system to efficiently generate context-sensitive\noutputs across different robotic platforms. We demonstrate that Embodied-RAG\neffectively bridges RAG to the robotics domain, successfully handling over 200\nexplanation and navigation queries across 19 environments, highlighting its\npromise for general-purpose non-parametric system for embodied agents.\n","authors":["Quanting Xie","So Yeon Min","Tianyi Zhang","Aarav Bajaj","Ruslan Salakhutdinov","Matthew Johnson-Roberson","Yonatan Bisk"],"pdf_url":"https://arxiv.org/pdf/2409.18313v1.pdf","comment":"Web: https://quanting-xie.github.io/Embodied-RAG-web/"},{"id":"http://arxiv.org/abs/2408.09698v3","updated":"2024-09-26T21:28:42Z","published":"2024-08-19T04:44:32Z","title":"Harnessing Multimodal Large Language Models for Multimodal Sequential\n Recommendation","summary":" Recent advances in Large Language Models (LLMs) have demonstrated significant\npotential in the field of Recommendation Systems (RSs). Most existing studies\nhave focused on converting user behavior logs into textual prompts and\nleveraging techniques such as prompt tuning to enable LLMs for recommendation\ntasks. Meanwhile, research interest has recently grown in multimodal\nrecommendation systems that integrate data from images, text, and other sources\nusing modality fusion techniques. This introduces new challenges to the\nexisting LLM-based recommendation paradigm which relies solely on text modality\ninformation. Moreover, although Multimodal Large Language Models (MLLMs)\ncapable of processing multi-modal inputs have emerged, how to equip MLLMs with\nmulti-modal recommendation capabilities remains largely unexplored. To this\nend, in this paper, we propose the Multimodal Large Language Model-enhanced\nMultimodaln Sequential Recommendation (MLLM-MSR) model. To capture the dynamic\nuser preference, we design a two-stage user preference summarization method.\nSpecifically, we first utilize an MLLM-based item-summarizer to extract image\nfeature given an item and convert the image into text. Then, we employ a\nrecurrent user preference summarization generation paradigm to capture the\ndynamic changes in user preferences based on an LLM-based user-summarizer.\nFinally, to enable the MLLM for multi-modal recommendation task, we propose to\nfine-tune a MLLM-based recommender using Supervised Fine-Tuning (SFT)\ntechniques. Extensive evaluations across various datasets validate the\neffectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt\nto the evolving dynamics of user preferences.\n","authors":["Yuyang Ye","Zhi Zheng","Yishan Shen","Tianshu Wang","Hengruo Zhang","Peijun Zhu","Runlong Yu","Kai Zhang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.09698v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10237v5","updated":"2024-09-26T21:24:00Z","published":"2023-12-15T22:09:04Z","title":"A Distributed Privacy Preserving Model for the Detection of Alzheimer's\n Disease","summary":" In the era of rapidly advancing medical technologies, the segmentation of\nmedical data has become inevitable, necessitating the development of privacy\npreserving machine learning algorithms that can train on distributed data.\nConsolidating sensitive medical data is not always an option particularly due\nto the stringent privacy regulations imposed by the Health Insurance\nPortability and Accountability Act (HIPAA). In this paper, I introduce a HIPAA\ncompliant framework that can train from distributed data. I then propose a\nmultimodal vertical federated model for Alzheimer's Disease (AD) detection, a\nserious neurodegenerative condition that can cause dementia, severely impairing\nbrain function and hindering simple tasks, especially without preventative\ncare. This vertical federated learning (VFL) model offers a distributed\narchitecture that enables collaborative learning across diverse sources of\nmedical data while respecting privacy constraints imposed by HIPAA. The VFL\narchitecture proposed herein offers a novel distributed architecture, enabling\ncollaborative learning across diverse sources of medical data while respecting\nstatutory privacy constraints. By leveraging multiple modalities of data, the\nrobustness and accuracy of AD detection can be enhanced. This model not only\ncontributes to the advancement of federated learning techniques but also holds\npromise for overcoming the hurdles posed by data segmentation in medical\nresearch.\n","authors":["Paul K. Mandal"],"pdf_url":"https://arxiv.org/pdf/2312.10237v5.pdf","comment":"15 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.18301v1","updated":"2024-09-26T21:16:51Z","published":"2024-09-26T21:16:51Z","title":"Harnessing Wavelet Transformations for Generalizable Deepfake Forgery\n Detection","summary":" The evolution of digital image manipulation, particularly with the\nadvancement of deep generative models, significantly challenges existing\ndeepfake detection methods, especially when the origin of the deepfake is\nobscure. To tackle the increasing complexity of these forgeries, we propose\n\\textbf{Wavelet-CLIP}, a deepfake detection framework that integrates wavelet\ntransforms with features derived from the ViT-L/14 architecture, pre-trained in\nthe CLIP fashion. Wavelet-CLIP utilizes Wavelet Transforms to deeply analyze\nboth spatial and frequency features from images, thus enhancing the model's\ncapability to detect sophisticated deepfakes. To verify the effectiveness of\nour approach, we conducted extensive evaluations against existing\nstate-of-the-art methods for cross-dataset generalization and detection of\nunseen images generated by standard diffusion models. Our method showcases\noutstanding performance, achieving an average AUC of 0.749 for cross-data\ngeneralization and 0.893 for robustness against unseen deepfakes, outperforming\nall compared methods. The code can be reproduced from the repo:\n\\url{https://github.com/lalithbharadwajbaru/Wavelet-CLIP}\n","authors":["Lalith Bharadwaj Baru","Shilhora Akshay Patel","Rohit Boddeda"],"pdf_url":"https://arxiv.org/pdf/2409.18301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18300v1","updated":"2024-09-26T21:15:22Z","published":"2024-09-26T21:15:22Z","title":"SOAR: Self-supervision Optimized UAV Action Recognition with Efficient\n Object-Aware Pretraining","summary":" We introduce SOAR, a novel Self-supervised pretraining algorithm for aerial\nfootage captured by Unmanned Aerial Vehicles (UAVs). We incorporate human\nobject knowledge throughout the pretraining process to enhance UAV video\npretraining efficiency and downstream action recognition performance. This is\nin contrast to prior works that primarily incorporate object information during\nthe fine-tuning stage. Specifically, we first propose a novel object-aware\nmasking strategy designed to retain the visibility of certain patches related\nto objects throughout the pretraining phase. Second, we introduce an\nobject-aware loss function that utilizes object information to adjust the\nreconstruction loss, preventing bias towards less informative background\npatches. In practice, SOAR with a vanilla ViT backbone, outperforms best UAV\naction recognition models, recording a 9.7% and 21.4% boost in top-1 accuracy\non the NEC-Drone and UAV-Human datasets, while delivering an inference speed of\n18.7ms per video, making it 2x to 5x faster. Additionally, SOAR obtains\ncomparable accuracy to prior self-supervised learning (SSL) methods while\nrequiring 87.5% less pretraining time and 25% less memory usage\n","authors":["Ruiqi Xian","Xiyang Wu","Tianrui Guan","Xijun Wang","Boqing Gong","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2409.18300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18297v1","updated":"2024-09-26T21:10:17Z","published":"2024-09-26T21:10:17Z","title":"Flat'n'Fold: A Diverse Multi-Modal Dataset for Garment Perception and\n Manipulation","summary":" We present Flat'n'Fold, a novel large-scale dataset for garment manipulation\nthat addresses critical gaps in existing datasets. Comprising 1,212 human and\n887 robot demonstrations of flattening and folding 44 unique garments across 8\ncategories, Flat'n'Fold surpasses prior datasets in size, scope, and diversity.\nOur dataset uniquely captures the entire manipulation process from crumpled to\nfolded states, providing synchronized multi-view RGB-D images, point clouds,\nand action data, including hand or gripper positions and rotations. We quantify\nthe dataset's diversity and complexity compared to existing benchmarks and show\nthat our dataset features natural and diverse manipulations of real-world\ndemonstrations of human and robot demonstrations in terms of visual and action\ninformation. To showcase Flat'n'Fold's utility, we establish new benchmarks for\ngrasping point prediction and subtask decomposition. Our evaluation of\nstate-of-the-art models on these tasks reveals significant room for\nimprovement. This underscores Flat'n'Fold's potential to drive advances in\nrobotic perception and manipulation of deformable objects. Our dataset can be\ndownloaded at https://cvas-ug.github.io/flat-n-fold\n","authors":["Lipeng Zhuang","Shiyu Fan","Yingdong Ru","Florent Audonnet","Paul Henderson","Gerardo Aragon-Camarasa"],"pdf_url":"https://arxiv.org/pdf/2409.18297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18295v1","updated":"2024-09-26T21:06:53Z","published":"2024-09-26T21:06:53Z","title":"Enhancing Lossy Compression Through Cross-Field Information for\n Scientific Applications","summary":" Lossy compression is one of the most effective methods for reducing the size\nof scientific data containing multiple data fields. It reduces information\ndensity through prediction or transformation techniques to compress the data.\nPrevious approaches use local information from a single target field when\npredicting target data points, limiting their potential to achieve higher\ncompression ratios. In this paper, we identified significant cross-field\ncorrelations within scientific datasets. We propose a novel hybrid prediction\nmodel that utilizes CNN to extract cross-field information and combine it with\nexisting local field information. Our solution enhances the prediction accuracy\nof lossy compressors, leading to improved compression ratios without\ncompromising data quality. We evaluate our solution on three scientific\ndatasets, demonstrating its ability to improve compression ratios by up to 25%\nunder specific error bounds. Additionally, our solution preserves more data\ndetails and reduces artifacts compared to baseline approaches.\n","authors":["Youyuan Liu","Wenqi Jia","Taolue Yang","Miao Yin","Sian Jin"],"pdf_url":"https://arxiv.org/pdf/2409.18295v1.pdf","comment":"9 pages, 9 figures, accepted by DRBSD-10"},{"id":"http://arxiv.org/abs/2409.18290v1","updated":"2024-09-26T21:00:51Z","published":"2024-09-26T21:00:51Z","title":"Retrospective Comparative Analysis of Prostate Cancer In-Basket\n Messages: Responses from Closed-Domain LLM vs. Clinical Teams","summary":" In-basket message interactions play a crucial role in physician-patient\ncommunication, occurring during all phases (pre-, during, and post) of a\npatient's care journey. However, responding to these patients' inquiries has\nbecome a significant burden on healthcare workflows, consuming considerable\ntime for clinical care teams. To address this, we introduce RadOnc-GPT, a\nspecialized Large Language Model (LLM) powered by GPT-4 that has been designed\nwith a focus on radiotherapeutic treatment of prostate cancer with advanced\nprompt engineering, and specifically designed to assist in generating\nresponses. We integrated RadOnc-GPT with patient electronic health records\n(EHR) from both the hospital-wide EHR database and an internal,\nradiation-oncology-specific database. RadOnc-GPT was evaluated on 158\npreviously recorded in-basket message interactions. Quantitative natural\nlanguage processing (NLP) analysis and two grading studies with clinicians and\nnurses were used to assess RadOnc-GPT's responses. Our findings indicate that\nRadOnc-GPT slightly outperformed the clinical care team in \"Clarity\" and\n\"Empathy,\" while achieving comparable scores in \"Completeness\" and\n\"Correctness.\" RadOnc-GPT is estimated to save 5.2 minutes per message for\nnurses and 2.4 minutes for clinicians, from reading the inquiry to sending the\nresponse. Employing RadOnc-GPT for in-basket message draft generation has the\npotential to alleviate the workload of clinical care teams and reduce\nhealthcare costs by producing high-quality, timely responses.\n","authors":["Yuexing Hao","Jason M. Holmes","Jared Hobson","Alexandra Bennett","Daniel K. Ebner","David M. Routman","Satomi Shiraishi","Samir H. Patel","Nathan Y. Yu","Chris L. Hallemeier","Brooke E. Ball","Mark R. Waddle","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18289v1","updated":"2024-09-26T21:00:45Z","published":"2024-09-26T21:00:45Z","title":"Criticality and Safety Margins for Reinforcement Learning","summary":" State of the art reinforcement learning methods sometimes encounter unsafe\nsituations. Identifying when these situations occur is of interest both for\npost-hoc analysis and during deployment, where it might be advantageous to call\nout to a human overseer for help. Efforts to gauge the criticality of different\npoints in time have been developed, but their accuracy is not well established\ndue to a lack of ground truth, and they are not designed to be easily\ninterpretable by end users. Therefore, we seek to define a criticality\nframework with both a quantifiable ground truth and a clear significance to\nusers. We introduce true criticality as the expected drop in reward when an\nagent deviates from its policy for n consecutive random actions. We also\nintroduce the concept of proxy criticality, a low-overhead metric that has a\nstatistically monotonic relationship to true criticality. Safety margins make\nthese interpretable, when defined as the number of random actions for which\nperformance loss will not exceed some tolerance with high confidence. We\ndemonstrate this approach in several environment-agent combinations; for an A3C\nagent in an Atari Beamrider environment, the lowest 5% of safety margins\ncontain 47% of agent losses; i.e., supervising only 5% of decisions could\npotentially prevent roughly half of an agent's errors. This criticality\nframework measures the potential impacts of bad decisions, even before those\ndecisions are made, allowing for more effective debugging and oversight of\nautonomous agents.\n","authors":["Alexander Grushin","Walt Woods","Alvaro Velasquez","Simon Khan"],"pdf_url":"https://arxiv.org/pdf/2409.18289v1.pdf","comment":"17 pages, 10 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"}]},"2024-09-27T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2409.18052v2","updated":"2024-09-27T02:09:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems -- which account for almost all current\nAI -- can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborate on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17618v2","updated":"2024-09-27T01:45:26Z","published":"2024-09-26T08:10:28Z","title":"Learning Occlusion-aware Decision-making from Agent Interaction via\n Active Perception","summary":" Occlusion-aware decision-making is essential in autonomous driving due to the\nhigh uncertainty of various occlusions. Recent occlusion-aware decision-making\nmethods encounter issues such as high computational complexity, scenario\nscalability challenges, or reliance on limited expert data. Benefiting from\nautomatically generating data by exploration randomization, we uncover that\nreinforcement learning (RL) may show promise in occlusion-aware\ndecision-making. However, previous occlusion-aware RL faces challenges in\nexpanding to various dynamic and static occlusion scenarios, low learning\nefficiency, and lack of predictive ability. To address these issues, we\nintroduce Pad-AI, a self-reinforcing framework to learn occlusion-aware\ndecision-making through active perception. Pad-AI utilizes vectorized\nrepresentation to represent occluded environments efficiently and learns over\nthe semantic motion primitives to focus on high-level active perception\nexploration. Furthermore, Pad-AI integrates prediction and RL within a unified\nframework to provide risk-aware learning and security guarantees. Our framework\nwas tested in challenging scenarios under both dynamic and static occlusions\nand demonstrated efficient and general perception-aware exploration performance\nto other strong baselines in closed-loop evaluations.\n","authors":["Jie Jia","Yiming Shu","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.17618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18953v1","updated":"2024-09-27T17:56:04Z","published":"2024-09-27T17:56:04Z","title":"UniCal: Unified Neural Sensor Calibration","summary":" Self-driving vehicles (SDVs) require accurate calibration of LiDARs and\ncameras to fuse sensor data accurately for autonomy. Traditional calibration\nmethods typically leverage fiducials captured in a controlled and structured\nscene and compute correspondences to optimize over. These approaches are costly\nand require substantial infrastructure and operations, making it challenging to\nscale for vehicle fleets. In this work, we propose UniCal, a unified framework\nfor effortlessly calibrating SDVs equipped with multiple LiDARs and cameras.\nOur approach is built upon a differentiable scene representation capable of\nrendering multi-view geometrically and photometrically consistent sensor\nobservations. We jointly learn the sensor calibration and the underlying scene\nrepresentation through differentiable volume rendering, utilizing outdoor\nsensor data without the need for specific calibration fiducials. This\n\"drive-and-calibrate\" approach significantly reduces costs and operational\noverhead compared to existing calibration systems, enabling efficient\ncalibration for large SDV fleets at scale. To ensure geometric consistency\nacross observations from different sensors, we introduce a novel surface\nalignment loss that combines feature-based registration with neural rendering.\nComprehensive evaluations on multiple datasets demonstrate that UniCal\noutperforms or matches the accuracy of existing calibration approaches while\nbeing more efficient, demonstrating the value of UniCal for scalable\ncalibration.\n","authors":["Ze Yang","George Chen","Haowei Zhang","Kevin Ta","Ioan Andrei Bârsan","Daniel Murphy","Sivabalan Manivasagam","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2409.18953v1.pdf","comment":"ECCV 2024. Project page: https://waabi.ai/unical/"},{"id":"http://arxiv.org/abs/2409.18939v1","updated":"2024-09-27T17:40:01Z","published":"2024-09-27T17:40:01Z","title":"Towards Super-Nominal Payload Handling: Inverse Dynamics Analysis for\n Multi-Skill Robotic Manipulation","summary":" Motion planning for articulated robots has traditionally been governed by\nalgorithms that operate within manufacturer-defined payload limits. Our\nempirical analysis of the Franka Emika Panda robot demonstrates that this\napproach unnecessarily restricts the robot's dynamically-reachable task space.\nThese results establish an expanded operational envelope for such robots,\nshowing that they can handle payloads of more than twice their rated capacity.\nAdditionally, our preliminary findings indicate that integrating non-prehensile\nmotion primitives with grasping-based manipulation has the potential to further\nincrease the success rates of manipulation tasks involving payloads exceeding\nnominal limits.\n","authors":["Anuj Pasricha","Alessandro Roncone"],"pdf_url":"https://arxiv.org/pdf/2409.18939v1.pdf","comment":"Accepted as an extended abstract to ICRA@40"},{"id":"http://arxiv.org/abs/2403.16877v2","updated":"2024-09-27T17:14:26Z","published":"2024-03-25T15:42:09Z","title":"Proprioception Is All You Need: Terrain Classification for Boreal\n Forests","summary":" Recent works in field robotics highlighted the importance of resiliency\nagainst different types of terrains. Boreal forests, in particular, are home to\nmany mobility-impeding terrains that should be considered for off-road\nautonomous navigation. Also, being one of the largest land biomes on Earth,\nboreal forests are an area where autonomous vehicles are expected to become\nincreasingly common. In this paper, we address this issue by introducing\nBorealTC, a publicly available dataset for proprioceptive-based terrain\nclassification (TC). Recorded with a Husky A200, our dataset contains 116 min\nof Inertial Measurement Unit (IMU), motor current, and wheel odometry data,\nfocusing on typical boreal forest terrains, notably snow, ice, and silty loam.\nCombining our dataset with another dataset from the state-of-the-art, we\nevaluate both a Convolutional Neural Network (CNN) and the novel state space\nmodel (SSM)-based Mamba architecture on a TC task. Interestingly, we show that\nwhile CNN outperforms Mamba on each separate dataset, Mamba achieves greater\naccuracy when trained on a combination of both. In addition, we demonstrate\nthat Mamba's learning capacity is greater than a CNN for increasing amounts of\ndata. We show that the combination of two TC datasets yields a latent space\nthat can be interpreted with the properties of the terrains. We also discuss\nthe implications of merging datasets on classification. Our source code and\ndataset are publicly available online:\nhttps://github.com/norlab-ulaval/BorealTC.\n","authors":["Damien LaRocque","William Guimont-Martin","David-Alexandre Duclos","Philippe Giguère","François Pomerleau"],"pdf_url":"https://arxiv.org/pdf/2403.16877v2.pdf","comment":"Accepted to the 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2409.07924v2","updated":"2024-09-27T16:49:31Z","published":"2024-09-12T10:45:19Z","title":"Universal Trajectory Optimization Framework for Differential Drive Robot\n Class","summary":" Differential drive robots are widely used in various scenarios thanks to\ntheir straightforward principle, from household service robots to disaster\nresponse field robots. There are several types of driving mechanisms for\nreal-world applications, including two-wheeled, four-wheeled skid-steering,\ntracked robots, and so on. The differences in the driving mechanisms usually\nrequire specific kinematic modeling when precise control is desired.\nFurthermore, the nonholonomic dynamics and possible lateral slip lead to\ndifferent degrees of difficulty in getting feasible and high-quality\ntrajectories. Therefore, a comprehensive trajectory optimization framework to\ncompute trajectories efficiently for various kinds of differential drive robots\nis highly desirable. In this paper, we propose a universal trajectory\noptimization framework that can be applied to differential drive robots,\nenabling the generation of high-quality trajectories within a restricted\ncomputational timeframe. We introduce a novel trajectory representation based\non polynomial parameterization of motion states or their integrals, such as\nangular and linear velocities, which inherently matches the robots' motion to\nthe control principle. The trajectory optimization problem is formulated to\nminimize complexity while prioritizing safety and operational efficiency. We\nthen build a full-stack autonomous planning and control system to demonstrate\nits feasibility and robustness. We conduct extensive simulations and real-world\ntesting in crowded environments with three kinds of differential drive robots\nto validate the effectiveness of our approach.\n","authors":["Mengke Zhang","Nanhe Chen","Hu Wang","Jianxiong Qiu","Zhichao Han","Qiuyu Ren","Chao Xu","Fei Gao","Yanjun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.07924v2.pdf","comment":"15 pages, 15 figures"},{"id":"http://arxiv.org/abs/2409.08276v3","updated":"2024-09-27T16:09:13Z","published":"2024-09-12T17:59:44Z","title":"AnySkin: Plug-and-play Skin Sensing for Robotic Touch","summary":" While tactile sensing is widely accepted as an important and useful sensing\nmodality, its use pales in comparison to other sensory modalities like vision\nand proprioception. AnySkin addresses the critical challenges that impede the\nuse of tactile sensing -- versatility, replaceability, and data reusability.\nBuilding on the simplistic design of ReSkin, and decoupling the sensing\nelectronics from the sensing interface, AnySkin simplifies integration making\nit as straightforward as putting on a phone case and connecting a charger.\nFurthermore, AnySkin is the first uncalibrated tactile-sensor with\ncross-instance generalizability of learned manipulation policies. To summarize,\nthis work makes three key contributions: first, we introduce a streamlined\nfabrication process and a design tool for creating an adhesive-free, durable\nand easily replaceable magnetic tactile sensor; second, we characterize slip\ndetection and policy learning with the AnySkin sensor; and third, we\ndemonstrate zero-shot generalization of models trained on one instance of\nAnySkin to new instances, and compare it with popular existing tactile\nsolutions like DIGIT and ReSkin. Videos of experiments, fabrication details and\ndesign files can be found on https://any-skin.github.io/\n","authors":["Raunaq Bhirangi","Venkatesh Pattabiraman","Enes Erciyes","Yifeng Cao","Tess Hellebrekers","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2409.08276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18862v1","updated":"2024-09-27T15:57:52Z","published":"2024-09-27T15:57:52Z","title":"Safe Decentralized Multi-Agent Control using Black-Box Predictors,\n Conformal Decision Policies, and Control Barrier Functions","summary":" We address the challenge of safe control in decentralized multi-agent robotic\nsettings, where agents use uncertain black-box models to predict other agents'\ntrajectories. We use the recently proposed conformal decision theory to adapt\nthe restrictiveness of control barrier functions-based safety constraints based\non observed prediction errors. We use these constraints to synthesize\ncontrollers that balance between the objectives of safety and task\naccomplishment, despite the prediction errors. We provide an upper bound on the\naverage over time of the value of a monotonic function of the difference\nbetween the safety constraint based on the predicted trajectories and the\nconstraint based on the ground truth ones. We validate our theory through\nexperimental results showing the performance of our controllers when navigating\na robot in the multi-agent scenes in the Stanford Drone Dataset.\n","authors":["Sacha Huriot","Hussein Sibai"],"pdf_url":"https://arxiv.org/pdf/2409.18862v1.pdf","comment":"6 pages, 1 figure, submitted for ICRA 2025"},{"id":"http://arxiv.org/abs/2409.06395v3","updated":"2024-09-27T15:37:41Z","published":"2024-09-10T10:22:46Z","title":"Soft Acoustic Curvature Sensor: Design and Development","summary":" This paper introduces a novel Soft Acoustic Curvature (SAC) sensor. SAC\nincorporates integrated audio components and features an acoustic channel\nwithin a flexible structure. A reference acoustic wave, generated by a speaker\nat one end of the channel, propagates and is received by a microphone at the\nother channel's end. Our previous study revealed that acoustic wave energy\ndissipation varies with acoustic channel deformation, leading us to design a\nnovel channel capable of large deformation due to bending. We then use Machine\nLearning (ML) models to establish a complex mapping between channel\ndeformations and sound modulation. Various sound frequencies and ML models were\nevaluated to enhance curvature detection accuracy. The sensor, constructed\nusing soft material and 3D printing, was validated experimentally, with\ncurvature measurement errors remaining within 3.5 m-1 for a range of 0 to 60\nm-1 curvatures. These results demonstrate the effectiveness of the proposed\nmethod for estimating curvatures. With its flexible structure, the SAC sensor\nholds potential for applications in soft robotics, including shape measurement\nfor continuum manipulators, soft grippers, and wearable devices.\n","authors":["Mohammad Sheikh Sofla","Hanita Golshanian","Vishnu Rajendran S","Amir Ghalamzan E"],"pdf_url":"https://arxiv.org/pdf/2409.06395v3.pdf","comment":"To appear in Robotics and Automation Letter"},{"id":"http://arxiv.org/abs/2409.18794v1","updated":"2024-09-27T14:47:18Z","published":"2024-09-27T14:47:18Z","title":"Open-Nav: Exploring Zero-Shot Vision-and-Language Navigation in\n Continuous Environment with Open-Source LLMs","summary":" Vision-and-Language Navigation (VLN) tasks require an agent to follow textual\ninstructions to navigate through 3D environments. Traditional approaches use\nsupervised learning methods, relying heavily on domain-specific datasets to\ntrain VLN models. Recent methods try to utilize closed-source large language\nmodels (LLMs) like GPT-4 to solve VLN tasks in zero-shot manners, but face\nchallenges related to expensive token costs and potential data breaches in\nreal-world applications. In this work, we introduce Open-Nav, a novel study\nthat explores open-source LLMs for zero-shot VLN in the continuous environment.\nOpen-Nav employs a spatial-temporal chain-of-thought (CoT) reasoning approach\nto break down tasks into instruction comprehension, progress estimation, and\ndecision-making. It enhances scene perceptions with fine-grained object and\nspatial knowledge to improve LLM's reasoning in navigation. Our extensive\nexperiments in both simulated and real-world environments demonstrate that\nOpen-Nav achieves competitive performance compared to using closed-source LLMs.\n","authors":["Yanyuan Qiao","Wenqi Lyu","Hui Wang","Zixu Wang","Zerui Li","Yuan Zhang","Mingkui Tan","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2409.18794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18788v1","updated":"2024-09-27T14:36:20Z","published":"2024-09-27T14:36:20Z","title":"Excavating in the Wild: The GOOSE-Ex Dataset for Semantic Segmentation","summary":" The successful deployment of deep learning-based techniques for autonomous\nsystems is highly dependent on the data availability for the respective system\nin its deployment environment. Especially for unstructured outdoor\nenvironments, very few datasets exist for even fewer robotic platforms and\nscenarios. In an earlier work, we presented the German Outdoor and Offroad\nDataset (GOOSE) framework along with 10000 multimodal frames from an offroad\nvehicle to enhance the perception capabilities in unstructured environments. In\nthis work, we address the generalizability of the GOOSE framework. To\naccomplish this, we open-source the GOOSE-Ex dataset, which contains additional\n5000 labeled multimodal frames from various completely different environments,\nrecorded on a robotic excavator and a quadruped platform. We perform a\ncomprehensive analysis of the semantic segmentation performance on different\nplatforms and sensor modalities in unseen environments. In addition, we\ndemonstrate how the combined datasets can be utilized for different downstream\napplications or competitions such as offroad navigation, object manipulation or\nscene completion. The dataset, its platform documentation and pre-trained\nstate-of-the-art models for offroad perception will be made available on\nhttps://goose-dataset.de/.\n \\\n","authors":["Raphael Hagmanns","Peter Mortimer","Miguel Granero","Thorsten Luettel","Janko Petereit"],"pdf_url":"https://arxiv.org/pdf/2409.18788v1.pdf","comment":"Submitted to IEEE for review"},{"id":"http://arxiv.org/abs/2403.11876v2","updated":"2024-09-27T14:18:56Z","published":"2024-03-18T15:28:35Z","title":"Deep Bayesian Future Fusion for Self-Supervised, High-Resolution,\n Off-Road Mapping","summary":" High-speed off-road navigation requires long-range, high-resolution maps to\nenable robots to safely navigate over different surfaces while avoiding\ndangerous obstacles. However, due to limited computational power and sensing\nnoise, most approaches to off-road mapping focus on producing coarse (20-40cm)\nmaps of the environment. In this paper, we propose Future Fusion, a framework\ncapable of generating dense, high-resolution maps from sparse sensing data (30m\nforward at 2cm). This is accomplished by - (1) the efficient realization of the\nwell-known Bayes filtering within the standard deep learning models that\nexplicitly accounts for the sparsity pattern in stereo and LiDAR depth data,\nand (2) leveraging perceptual losses common in generative image completion. The\nproposed methodology outperforms the conventional baselines. Moreover, the\nlearned features and the completed dense maps lead to improvements in the\ndownstream navigation task.\n","authors":["Shubhra Aich","Wenshan Wang","Parv Maheshwari","Matthew Sivaprakasam","Samuel Triest","Cherie Ho","Jason M. Gregory","John G. Rogers III","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2403.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18775v1","updated":"2024-09-27T14:17:55Z","published":"2024-09-27T14:17:55Z","title":"A POMDP-based hierarchical planning framework for manipulation under\n pose uncertainty","summary":" Robots often face challenges in domestic environments where visual feedback\nis ineffective, such as retrieving objects obstructed by occlusions or finding\na light switch in the dark. In these cases, utilizing contacts to localize the\ntarget object can be effective. We propose an online planning framework using\nbinary contact signals for manipulation tasks with pose uncertainty, formulated\nas a Partially Observable Markov Decision Process (POMDP). Naively representing\nthe belief as a particle set makes planning infeasible due to the large\nuncertainties in domestic settings, as identifying the best sequence of actions\nrequires rolling out thousands of actions across millions of particles, taking\nsignificant compute time. To address this, we propose a hierarchical belief\nrepresentation. Initially, we represent the uncertainty coarsely in a 3D\nvolumetric space. Policies that refine uncertainty in this space are computed\nand executed, and once uncertainty is sufficiently reduced, the problem is\ntranslated back into the particle space for further refinement before task\ncompletion. We utilize a closed-loop planning and execution framework with a\nheuristic-search-based anytime solver that computes partial policies within a\nlimited time budget. The performance of the framework is demonstrated both in\nreal world and in simulation on the high-precision task of inserting a plug\ninto a port using a UR10e manipulator, resolving positional uncertainties up to\n50 centimeters and angular uncertainties close to $2\\pi$. Experimental results\nhighlight the framework's effectiveness, achieving a 93\\% success rate in the\nreal world and over 50\\% improvement in solution quality compared to greedy\nbaselines, significantly accelerating planning and enabling real-time solutions\nfor complex problems.\n","authors":["Muhammad Suhail Saleem","Rishi Veerapaneni","Maxim Likhachev"],"pdf_url":"https://arxiv.org/pdf/2409.18775v1.pdf","comment":"Under review (2025 IEEE International Conference on Robotics &\n Automation)"},{"id":"http://arxiv.org/abs/2409.18768v1","updated":"2024-09-27T14:12:49Z","published":"2024-09-27T14:12:49Z","title":"Learning from Demonstration with Implicit Nonlinear Dynamics Models","summary":" Learning from Demonstration (LfD) is a useful paradigm for training policies\nthat solve tasks involving complex motions. In practice, the successful\napplication of LfD requires overcoming error accumulation during policy\nexecution, i.e. the problem of drift due to errors compounding over time and\nthe consequent out-of-distribution behaviours. Existing works seek to address\nthis problem through scaling data collection, correcting policy errors with a\nhuman-in-the-loop, temporally ensembling policy predictions or through learning\nthe parameters of a dynamical system model. In this work, we propose and\nvalidate an alternative approach to overcoming this issue. Inspired by\nreservoir computing, we develop a novel neural network layer that includes a\nfixed nonlinear dynamical system with tunable dynamical properties. We validate\nthe efficacy of our neural network layer on the task of reproducing human\nhandwriting motions using the LASA Human Handwriting Dataset. Through empirical\nexperiments we demonstrate that incorporating our layer into existing neural\nnetwork architectures addresses the issue of compounding errors in LfD.\nFurthermore, we perform a comparative evaluation against existing approaches\nincluding a temporal ensemble of policy predictions and an Echo State Networks\n(ESNs) implementation. We find that our approach yields greater policy\nprecision and robustness on the handwriting task while also generalising to\nmultiple dynamics regimes and maintaining competitive latency scores.\n","authors":["Peter David Fagan","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.18768v1.pdf","comment":"21 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.18755v1","updated":"2024-09-27T13:47:17Z","published":"2024-09-27T13:47:17Z","title":"Transparency evaluation for the Kinematic Design of the Harnesses\n through Human-Exoskeleton Interaction Modeling","summary":" Lower Limb Exoskeletons (LLEs) are wearable robots that provide mechanical\npower to the user. Human-exoskeleton (HE) connections must preserve the user's\nnatural behavior during the interaction, avoiding undesired forces. Therefore,\nnumerous works focus on their minimization. Given the inherent complications of\nrepeatedly prototyping and experimentally testing a device, modeling the\nexoskeleton and its physical interaction with the user emerges as a valuable\napproach for assessing the design effects. This paper proposes a novel method\nto compare different exoskeleton configurations with a flexible simulation\ntool. This approach contemplates simulating the dynamics of the device,\nincluding its interaction with the wearer, to evaluate multiple connection\nmechanism designs along with the kinematics and actuation of the LLE. This\nevaluation is based on the minimization of the interaction wrenches through an\noptimization process that includes the impedance parameters at the interfaces\nas optimization variables and the similarity of the LLE's joint variables\ntrajectories with the motion of the wearer's articulations. Exploratory tests\nare conducted using the Wearable Walker LLE in different configurations and\nmeasuring the interaction forces. Experimental data are then compared to the\noptimization outcomes, proving that the proposed method provides contact wrench\nestimations consistent with the collected measurements and previous outcomes\nfrom the literature. Copyright 2024 IEEE. Personal use of this material is\npermitted. Permission from IEEE must be obtained for all other uses, in any\ncurrent or future media, including reprinting/republishing this material for\nadvertising or promotional purposes, creating new collective works, for resale\nor redistribution to servers or lists, or reuse of any copyrighted component of\nthis work in other works.\n","authors":["Riccardo Bezzini","Carlo Alberto Avizzano","Francesco Porcini","Alessandro Filippeschi"],"pdf_url":"https://arxiv.org/pdf/2409.18755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18752v1","updated":"2024-09-27T13:44:36Z","published":"2024-09-27T13:44:36Z","title":"Royal Reveals: LiDAR Mapping of Kronborg Castle, Echoes of Hamlet's\n Halls","summary":" This paper presents a large scale dataset from a meticulous 360-degree LiDAR\n(Light Detection and Ranging) scan conducted on Kronborg Castle, a renowned\nRenaissance fortress located in Elsinore (Helsing{\\o}r), Denmark, famously\nassociated with Shakespeare's \"Hamlet.\" Utilising a vertical mounted, gimbal\nstabilised, 16 channel, 360-degree Velodyne VLP-16 LiDAR scanner, paired with\nan Intel RealSense L515 depth camera. This research offers an unparalleled\ndigital representation of the castle's intricate architectural details and\nstructural nuances, enabling fellow researchers to conduct experiments\nutilising the data for SLAM (Simultaneous Localisation and Mapping) as well as\nfloorplan generation.\n","authors":["Leon Davies","Simon Sølvsten"],"pdf_url":"https://arxiv.org/pdf/2409.18752v1.pdf","comment":"4 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2409.18745v1","updated":"2024-09-27T13:38:06Z","published":"2024-09-27T13:38:06Z","title":"A study on the effects of mixed explicit and implicit communications in\n human-virtual-agent interactions","summary":" Communication between humans and robots (or virtual agents) is essential for\ninteraction and often inspired by human communication, which uses gestures,\nfacial expressions, gaze direction, and other explicit and implicit means. This\nwork presents an interaction experiment where humans and virtual agents\ninteract through explicit (gestures, manual entries using mouse and keyboard,\nvoice, sound, and information on screen) and implicit (gaze direction,\nlocation, facial expressions, and raise of eyebrows) communication to evaluate\nthe effect of mixed explicit-implicit communication against purely explicit\ncommunication. Results obtained using Bayesian parameter estimation show that\nthe number of errors and task execution time did not significantly change when\nmixed explicit and implicit communications were used, and neither the perceived\nefficiency of the interaction. In contrast, acceptance, sociability, and\ntransparency of the virtual agent increased when using mixed communication\nmodalities (88.3%, 92%, and 92.9% of the effect size posterior distribution of\neach variable, respectively, were above the upper limit of the region of\npractical equivalence). This suggests that task-related measures, such as time,\nnumber of errors, and perceived efficiency of the interaction, have not been\ninfluenced by the communication type in our particular experiment. However, the\nimprovement of subjective measures related to the virtual agent, such as\nacceptance, sociability, and transparency, suggests that humans are more\nreceptive to mixed explicit and implicit communications.\n","authors":["Ana Christina Almada Campos","Bruno Vilhena Adorno"],"pdf_url":"https://arxiv.org/pdf/2409.18745v1.pdf","comment":"22 pages, 12 figures, 4 tables. Under review for International\n Journal of Social Robotics"},{"id":"http://arxiv.org/abs/2409.18743v1","updated":"2024-09-27T13:33:52Z","published":"2024-09-27T13:33:52Z","title":"OpenObject-NAV: Open-Vocabulary Object-Oriented Navigation Based on\n Dynamic Carrier-Relationship Scene Graph","summary":" In everyday life, frequently used objects like cups often have unfixed\npositions and multiple instances within the same category, and their carriers\nfrequently change as well. As a result, it becomes challenging for a robot to\nefficiently navigate to a specific instance. To tackle this challenge, the\nrobot must capture and update scene changes and plans continuously. However,\ncurrent object navigation approaches primarily focus on semantic-level and lack\nthe ability to dynamically update scene representation. This paper captures the\nrelationships between frequently used objects and their static carriers. It\nconstructs an open-vocabulary Carrier-Relationship Scene Graph (CRSG) and\nupdates the carrying status during robot navigation to reflect the dynamic\nchanges of the scene. Based on the CRSG, we further propose an instance\nnavigation strategy that models the navigation process as a Markov Decision\nProcess. At each step, decisions are informed by Large Language Model's\ncommonsense knowledge and visual-language feature similarity. We designed a\nseries of long-sequence navigation tasks for frequently used everyday items in\nthe Habitat simulator. The results demonstrate that by updating the CRSG, the\nrobot can efficiently navigate to moved targets. Additionally, we deployed our\nalgorithm on a real robot and validated its practical effectiveness.\n","authors":["Yujie Tang","Meiling Wang","Yinan Deng","Zibo Zheng","Jiagui Zhong","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2409.18743v1.pdf","comment":"Project website: https://openobject-nav.github.io/"},{"id":"http://arxiv.org/abs/2409.18741v1","updated":"2024-09-27T13:32:12Z","published":"2024-09-27T13:32:12Z","title":"Optimum Configuration for Hovering n-Quadrotors carrying a Slung Payload","summary":" This work proposes a strategy for organising quadrotors around a payload to\nenable hovering without external stimuli, together with a MATLAB software for\nmodelling the dynamics of a quadrotor-payload system. Based on geometric\nconcepts, the proposed design keeps the payload and system centre of mass\naligned. Hovering tests that are successful confirm the method's efficiency.\nMoreover, the algorithm is improved to take thrust capacities and propeller\ndistances into account, calculating the minimum number of quadrotors needed for\nhovering. The algorithm's effectiveness is demonstrated by numerical examples,\nwhich reveal that larger quadrotors may require fewer units while smaller ones\ngive greater flexibility. Our code can be found at:\n\\href{https://github.com/Hosnooo/Swarm-Slung-Payload}{https://github.com/Hosnooo/Swarm-Slung-Payload}\n","authors":["Mohssen E. Elshaar","Pansie A. khodary","Meral L. Badr","Mohamad A. Sayegh","Zeyad M. Manaa","Ayman M. Abdallah"],"pdf_url":"https://arxiv.org/pdf/2409.18741v1.pdf","comment":"accepted for publication at AIAA SCITECH 2025"},{"id":"http://arxiv.org/abs/2404.04857v3","updated":"2024-09-27T13:24:04Z","published":"2024-04-07T08:04:33Z","title":"Learning Adaptive Multi-Objective Robot Navigation Incorporating\n Demonstrations","summary":" Preference-aligned robot navigation in human environments is typically\nachieved through learning-based approaches, utilizing user feedback or\ndemonstrations for personalization. However, personal preferences are subject\nto change and might even be context-dependent. Yet traditional reinforcement\nlearning (RL) approaches with static reward functions often fall short in\nadapting to these varying user preferences, inevitably reflecting\ndemonstrations once training is completed. This paper introduces a framework\nthat combines multi-objective reinforcement learning (MORL) with\ndemonstration-based learning. Our approach allows for dynamic adaptation to\nchanging user preferences without retraining. It fluently modulates between\nreward-defined preference objectives and the amount of demonstration data\nreflection. Through rigorous evaluations, including a sim-to-real transfer on\ntwo robots, we demonstrate our framework's capability to reflect user\npreferences accurately while achieving high navigational performance in terms\nof collision avoidance and goal pursuance.\n","authors":["Jorge de Heuvel","Tharun Sethuraman","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2404.04857v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18707v1","updated":"2024-09-27T12:50:52Z","published":"2024-09-27T12:50:52Z","title":"Discrete Policy: Learning Disentangled Action Space for Multi-Task\n Robotic Manipulation","summary":" Learning visuomotor policy for multi-task robotic manipulation has been a\nlong-standing challenge for the robotics community. The difficulty lies in the\ndiversity of action space: typically, a goal can be accomplished in multiple\nways, resulting in a multimodal action distribution for a single task. The\ncomplexity of action distribution escalates as the number of tasks increases.\nIn this work, we propose \\textbf{Discrete Policy}, a robot learning method for\ntraining universal agents capable of multi-task manipulation skills. Discrete\nPolicy employs vector quantization to map action sequences into a discrete\nlatent space, facilitating the learning of task-specific codes. These codes are\nthen reconstructed into the action space conditioned on observations and\nlanguage instruction. We evaluate our method on both simulation and multiple\nreal-world embodiments, including both single-arm and bimanual robot settings.\nWe demonstrate that our proposed Discrete Policy outperforms a well-established\nDiffusion Policy baseline and many state-of-the-art approaches, including ACT,\nOcto, and OpenVLA. For example, in a real-world multi-task training setting\nwith five tasks, Discrete Policy achieves an average success rate that is 26\\%\nhigher than Diffusion Policy and 15\\% higher than OpenVLA. As the number of\ntasks increases to 12, the performance gap between Discrete Policy and\nDiffusion Policy widens to 32.5\\%, further showcasing the advantages of our\napproach. Our work empirically demonstrates that learning multi-task policies\nwithin the latent space is a vital step toward achieving general-purpose\nagents.\n","authors":["Kun Wu","Yichen Zhu","Jinming Li","Junjie Wen","Ning Liu","Zhiyuan Xu","Qinru Qiu","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2409.18707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12514v3","updated":"2024-09-27T12:23:06Z","published":"2024-09-19T07:10:18Z","title":"TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for\n Robotic Manipulation","summary":" Vision-Language-Action (VLA) models have shown remarkable potential in\nvisuomotor control and instruction comprehension through end-to-end learning\nprocesses. However, current VLA models face significant challenges: they are\nslow during inference and require extensive pre-training on large amounts of\nrobotic data, making real-world deployment difficult. In this paper, we\nintroduce a new family of compact vision-language-action models, called\nTinyVLA, which offers two key advantages over existing VLA models: (1) faster\ninference speeds, and (2) improved data efficiency, eliminating the need for\npre-training stage. Our framework incorporates two essential components to\nbuild TinyVLA: (1) initializing the policy backbone with robust, high-speed\nmultimodal models, and (2) integrating a diffusion policy decoder during\nfine-tuning to enable precise robot actions. We conducted extensive evaluations\nof TinyVLA in both simulation and on real robots, demonstrating that our\napproach significantly outperforms the state-of-the-art VLA model, OpenVLA, in\nterms of speed and data efficiency, while delivering comparable or superior\nperformance. Additionally, TinyVLA exhibits strong generalization capabilities\nacross various dimensions, including language instructions, novel objects,\nunseen positions, changes in object appearance, background variations, and\nenvironmental shifts, often matching or exceeding the performance of OpenVLA.\nWe believe that \\methodname offers an interesting perspective on utilizing\npre-trained multimodal models for policy learning. Our project is at\nhttps://tiny-vla.github.io.\n","authors":["Junjie Wen","Yichen Zhu","Jinming Li","Minjie Zhu","Kun Wu","Zhiyuan Xu","Ning Liu","Ran Cheng","Chaomin Shen","Yaxin Peng","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2409.12514v3.pdf","comment":"add more citations"},{"id":"http://arxiv.org/abs/2409.18649v1","updated":"2024-09-27T11:30:37Z","published":"2024-09-27T11:30:37Z","title":"Automatic Gain Tuning for Humanoid Robots Walking Architectures Using\n Gradient-Free Optimization Techniques","summary":" Developing sophisticated control architectures has endowed robots,\nparticularly humanoid robots, with numerous capabilities. However, tuning these\narchitectures remains a challenging and time-consuming task that requires\nexpert intervention. In this work, we propose a methodology to automatically\ntune the gains of all layers of a hierarchical control architecture for walking\nhumanoids. We tested our methodology by employing different gradient-free\noptimization methods: Genetic Algorithm (GA), Covariance Matrix Adaptation\nEvolution Strategy (CMA-ES), Evolution Strategy (ES), and Differential\nEvolution (DE). We validated the parameter found both in simulation and on the\nreal ergoCub humanoid robot. Our results show that GA achieves the fastest\nconvergence (10 x 10^3 function evaluations vs 25 x 10^3 needed by the other\nalgorithms) and 100% success rate in completing the task both in simulation and\nwhen transferred on the real robotic platform. These findings highlight the\npotential of our proposed method to automate the tuning process, reducing the\nneed for manual intervention.\n","authors":["Carlotta Sartore","Marco Rando","Giulio Romualdi","Cesare Molinari","Lorenzo Rosasco","Daniele Pucci"],"pdf_url":"https://arxiv.org/pdf/2409.18649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13978v2","updated":"2024-09-27T11:28:18Z","published":"2024-09-21T02:01:55Z","title":"FracGM: A Fast Fractional Programming Technique for Geman-McClure Robust\n Estimator","summary":" Robust estimation is essential in computer vision, robotics, and navigation,\naiming to minimize the impact of outlier measurements for improved accuracy. We\npresent a fast algorithm for Geman-McClure robust estimation, FracGM,\nleveraging fractional programming techniques. This solver reformulates the\noriginal non-convex fractional problem to a convex dual problem and a linear\nequation system, iteratively solving them in an alternating optimization\npattern. Compared to graduated non-convexity approaches, this strategy exhibits\na faster convergence rate and better outlier rejection capability. In addition,\nthe global optimality of the proposed solver can be guaranteed under given\nconditions. We demonstrate the proposed FracGM solver with Wahba's rotation\nproblem and 3-D point-cloud registration along with relaxation pre-processing\nand projection post-processing. Compared to state-of-the-art algorithms, when\nthe outlier rates increase from 20% to 80%, FracGM shows 53% and 88% lower\nrotation and translation increases. In real-world scenarios, FracGM achieves\nbetter results in 13 out of 18 outcomes, while having a 19.43% improvement in\nthe computation time.\n","authors":["Bang-Shien Chen","Yu-Kai Lin","Jian-Yu Chen","Chih-Wei Huang","Jann-Long Chern","Ching-Cherng Sun"],"pdf_url":"https://arxiv.org/pdf/2409.13978v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.18641v1","updated":"2024-09-27T11:19:41Z","published":"2024-09-27T11:19:41Z","title":"Pseudo-kinematic trajectory control of tracked vehicles","summary":" Tracked vehicles are used in complex scenarios, where motion planning and\nnavigation can be very complex. They have complex dynamics, with many\nparameters that are difficult to identify and that change significantly based\non the operating conditions. We propose a simple pseudo-kinematic model, where\nthe intricate dynamic effects underlying the vehicle's motion are captured in a\nsmall set of velocity-dependent parameters. This choice enables the development\nof a Lyapunov-based trajectory controller with guaranteed performance and small\ncomputation time. We demonstrate the correctness of our approach with both\nsimulation and experimental data.\n","authors":["Michele Focchi","Daniele Fontanelli","Luigi Palopoli"],"pdf_url":"https://arxiv.org/pdf/2409.18641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08424v2","updated":"2024-09-27T10:05:56Z","published":"2024-04-12T12:15:14Z","title":"Comparing Apples to Oranges: LLM-powered Multimodal Intention Prediction\n in an Object Categorization Task","summary":" Human intention-based systems enable robots to perceive and interpret user\nactions to interact with humans and adapt to their behavior proactively.\nTherefore, intention prediction is pivotal in creating a natural interaction\nwith social robots in human-designed environments. In this paper, we examine\nusing Large Language Models (LLMs) to infer human intention in a collaborative\nobject categorization task with a physical robot. We propose a novel multimodal\napproach that integrates user non-verbal cues, like hand gestures, body poses,\nand facial expressions, with environment states and user verbal cues to predict\nuser intentions in a hierarchical architecture. Our evaluation of five LLMs\nshows the potential for reasoning about verbal and non-verbal user cues,\nleveraging their context-understanding and real-world knowledge to support\nintention prediction while collaborating on a task with a social robot.\n","authors":["Hassan Ali","Philipp Allgeuer","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2404.08424v2.pdf","comment":"Accepted at ICSR 2024,14 pages,5 figures,2 tables; work was co-funded\n by Horizon Europe project TERAIS under Grant agreement number 101079338"},{"id":"http://arxiv.org/abs/2409.18592v1","updated":"2024-09-27T09:51:45Z","published":"2024-09-27T09:51:45Z","title":"From One to the Power of Many: Augmentations for Invariance to\n Multi-LiDAR Perception from Single-Sensor Datasets","summary":" Recently, LiDAR perception methods for autonomous vehicles, powered by deep\nneural networks have experienced steep growth in performance on classic\nbenchmarks, such as nuScenes and SemanticKITTI. However, there are still large\ngaps in performance when deploying models trained on such single-sensor setups\nto modern multi-sensor vehicles. In this work, we investigate if a lack of\ninvariance may be responsible for these performance gaps, and propose some\ninitial solutions in the form of application-specific data augmentations, which\ncan facilitate better transfer to multi-sensor LiDAR setups. We provide\nexperimental evidence that our proposed augmentations improve generalization\nacross LiDAR sensor setups, and investigate how these augmentations affect the\nmodels' invariance properties on simulations of different LiDAR sensor setups.\n","authors":["Marc Uecker","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2409.18592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18586v1","updated":"2024-09-27T09:45:21Z","published":"2024-09-27T09:45:21Z","title":"Analysis of Truncated Singular Value Decomposition for Koopman\n Operator-Based Lane Change Model","summary":" Understanding and modeling complex dynamic systems is crucial for enhancing\nvehicle performance and safety, especially in the context of autonomous\ndriving. Recently, popular methods such as Koopman operators and their\napproximators, known as Extended Dynamic Mode Decomposition (EDMD), have\nemerged for their effectiveness in transforming strongly nonlinear system\nbehavior into linear representations. This allows them to be integrated with\nconventional linear controllers. To achieve this, Singular Value Decomposition\n(SVD), specifically truncated SVD, is employed to approximate Koopman operators\nfrom extensive datasets efficiently. This study evaluates different basis\nfunctions used in EDMD and ranks for truncated SVD for representing lane change\nbehavior models, aiming to balance computational efficiency with information\nloss. The findings, however, suggest that the technique of truncated SVD does\nnot necessarily achieve substantial reductions in computational training time\nand results in significant information loss.\n","authors":["Chinnawut Nantabut"],"pdf_url":"https://arxiv.org/pdf/2409.18586v1.pdf","comment":"Submitted to the 21st International Conference on Informatics in\n Control, Automation and Robotics (ICINCO 2024)"},{"id":"http://arxiv.org/abs/2409.18585v1","updated":"2024-09-27T09:43:33Z","published":"2024-09-27T09:43:33Z","title":"Unscented Transform-based Pure Pursuit Path-Tracking Algorithm under\n Uncertainty","summary":" Automated driving has become more and more popular due to its potential to\neliminate road accidents by taking over driving tasks from humans. One of the\nremaining challenges is to follow a planned path autonomously, especially when\nuncertainties in self-localizing or understanding the surroundings can\ninfluence the decisions made by autonomous vehicles, such as calculating how\nmuch they need to steer to minimize tracking errors. In this paper, a modified\ngeometric pure pursuit path-tracking algorithm is proposed, taking into\nconsideration such uncertainties using the unscented transform. The algorithm\nis tested through simulations for typical road geometries, such as straight and\ncircular lines.\n","authors":["Chinnawut Nantabut"],"pdf_url":"https://arxiv.org/pdf/2409.18585v1.pdf","comment":"Submitted to the 21st International Conference on Informatics in\n Control, Automation and Robotics (ICINCO 2024)"},{"id":"http://arxiv.org/abs/2404.00343v2","updated":"2024-09-27T08:29:50Z","published":"2024-03-30T12:46:15Z","title":"Commonsense Scene Graph-based Target Localization for Object Search","summary":" Object search is a fundamental skill for household robots, yet the core\nproblem lies in the robot's ability to locate the target object accurately. The\ndynamic nature of household environments, characterized by the arbitrary\nplacement of daily objects by users, makes it challenging to perform target\nlocalization. To efficiently locate the target object, the robot needs to be\nequipped with knowledge at both the object and room level. However, existing\napproaches rely solely on one type of knowledge, leading to unsatisfactory\nobject localization performance and, consequently, inefficient object search\nprocesses. To address this problem, we propose a commonsense scene graph-based\ntarget localization, CSG-TL, to enhance target object search in the household\nenvironment. Given the pre-built map with stationary items, the robot models\nthe room-level knowledge with object-level commonsense knowledge generated by a\nlarge language model (LLM) to a commonsense scene graph (CSG), supporting both\ntypes of knowledge for CSG-TL. To demonstrate the superiority of CSG-TL on\ntarget localization, extensive experiments are performed on the real-world\nScanNet dataset and the AI2THOR simulator. Moreover, we have extended CSG-TL to\nan object search framework, CSG-OS, validated in both simulated and real-world\nenvironments. Code and videos are available at\nhttps://sites.google.com/view/csg-os.\n","authors":["Wenqi Ge","Chao Tang","Hong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18545v1","updated":"2024-09-27T08:27:36Z","published":"2024-09-27T08:27:36Z","title":"An Epistemic Human-Aware Task Planner which Anticipates Human Beliefs\n and Decisions","summary":" We present a substantial extension of our Human-Aware Task Planning\nframework, tailored for scenarios with intermittent shared execution\nexperiences and significant belief divergence between humans and robots,\nparticularly due to the uncontrollable nature of humans. Our objective is to\nbuild a robot policy that accounts for uncontrollable human behaviors, thus\nenabling the anticipation of possible advancements achieved by the robot when\nthe execution is not shared, e.g. when humans are briefly absent from the\nshared environment to complete a subtask. But, this anticipation is considered\nfrom the perspective of humans who have access to an estimated model for the\nrobot. To this end, we propose a novel planning framework and build a solver\nbased on AND-OR search, which integrates knowledge reasoning, including\nsituation assessment by perspective taking. Our approach dynamically models and\nmanages the expansion and contraction of potential advances while precisely\nkeeping track of when (and when not) agents share the task execution\nexperience. The planner systematically assesses the situation and ignores\nworlds that it has reason to think are impossible for humans. Overall, our new\nsolver can estimate the distinct beliefs of the human and the robot along\npotential courses of action, enabling the synthesis of plans where the robot\nselects the right moment for communication, i.e. informing, or replying to an\ninquiry, or defers ontic actions until the execution experiences can be shared.\nPreliminary experiments in two domains, one novel and one adapted, demonstrate\nthe effectiveness of the framework.\n","authors":["Shashank Shekhar","Anthony Favier","Rachid Alami"],"pdf_url":"https://arxiv.org/pdf/2409.18545v1.pdf","comment":"15 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.02370v4","updated":"2024-09-27T08:16:11Z","published":"2024-06-04T14:49:07Z","title":"Query-based Semantic Gaussian Field for Scene Representation in\n Reinforcement Learning","summary":" Latent scene representation plays a significant role in training\nreinforcement learning (RL) agents. To obtain good latent vectors describing\nthe scenes, recent works incorporate the 3D-aware latent-conditioned NeRF\npipeline into scene representation learning. However, these NeRF-related\nmethods struggle to perceive 3D structural information due to the inefficient\ndense sampling in volumetric rendering. Moreover, they lack fine-grained\nsemantic information included in their scene representation vectors because\nthey evenly consider free and occupied spaces. Both of them can destroy the\nperformance of downstream RL tasks. To address the above challenges, we propose\na novel framework that adopts the efficient 3D Gaussian Splatting (3DGS) to\nlearn 3D scene representation for the first time. In brief, we present the\nQuery-based Generalizable 3DGS to bridge the 3DGS technique and scene\nrepresentations with more geometrical awareness than those in NeRFs. Moreover,\nwe present the Hierarchical Semantics Encoding to ground the fine-grained\nsemantic features to 3D Gaussians and further distilled to the scene\nrepresentation vectors. We conduct extensive experiments on two RL platforms\nincluding Maniskill2 and Robomimic across 10 different tasks. The results show\nthat our method outperforms the other 5 baselines by a large margin. We achieve\nthe best success rates on 8 tasks and the second-best on the other two tasks.\n","authors":["Jiaxu Wang","Ziyi Zhang","Qiang Zhang","Jia Li","Jingkai Sun","Mingyuan Sun","Junhao He","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02370v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15256v4","updated":"2024-09-27T07:16:23Z","published":"2024-04-23T17:42:45Z","title":"TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and\n Proprioception Estimation","summary":" Legged navigation is typically examined within open-world, off-road, and\nchallenging environments. In these scenarios, estimating external disturbances\nrequires a complex synthesis of multi-modal information. This underlines a\nmajor limitation in existing works that primarily focus on avoiding obstacles.\nIn this work, we propose TOP-Nav, a novel legged navigation framework that\nintegrates a comprehensive path planner with Terrain awareness, Obstacle\navoidance and close-loop Proprioception. TOP-Nav underscores the synergies\nbetween vision and proprioception in both path and motion planning. Within the\npath planner, we present and integrate a terrain estimator that enables the\nrobot to select waypoints on terrains with higher traversability while\neffectively avoiding obstacles. In the motion planning level, we not only\nimplement a locomotion controller to track the navigation commands, but also\nconstruct a proprioception advisor to provide motion evaluations for the path\nplanner. Based on the close-loop motion feedback, we make online corrections\nfor the vision-based terrain and obstacle estimations. Consequently, TOP-Nav\nachieves open-world navigation that the robot can handle terrains or\ndisturbances beyond the distribution of prior knowledge and overcomes\nconstraints imposed by visual conditions. Building upon extensive experiments\nconducted in both simulation and real-world environments, TOP-Nav demonstrates\nsuperior performance in open-world navigation compared to existing methods.\n","authors":["Junli Ren","Yikai Liu","Yingru Dai","Junfeng Long","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.15256v4.pdf","comment":"Published on CoRL 2024"},{"id":"http://arxiv.org/abs/2409.14738v2","updated":"2024-09-27T07:13:22Z","published":"2024-09-23T06:34:06Z","title":"Enabling On-Chip High-Frequency Adaptive Linear Optimal Control via\n Linearized Gaussian Process","summary":" Unpredictable and complex aerodynamic effects pose significant challenges to\nachieving precise flight control, such as the downwash effect from upper\nvehicles to lower ones. Conventional methods often struggle to accurately model\nthese interactions, leading to controllers that require large safety margins\nbetween vehicles. Moreover, the controller on real drones usually requires\nhigh-frequency and has limited on-chip computation, making the adaptive control\ndesign more difficult to implement. To address these challenges, we incorporate\nGaussian process (GP) to model the adaptive external aerodynamics with linear\nmodel predictive control. The GP is linearized to enable real-time\nhigh-frequency solutions. Moreover, to handle the error caused by\nlinearization, we integrate end-to-end Bayesian optimization during sample\ncollection stages to improve the control performance. Experimental results on\nboth simulations and real quadrotors show that we can achieve real-time\nsolvable computation speed with acceptable tracking errors.\n","authors":["Yuan Gao","Yinyi Lai","Jun Wang","Yini Fang"],"pdf_url":"https://arxiv.org/pdf/2409.14738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11247v2","updated":"2024-09-27T06:22:50Z","published":"2024-03-17T15:41:35Z","title":"Compact 3D Gaussian Splatting For Dense Visual SLAM","summary":" Recent work has shown that 3D Gaussian-based SLAM enables high-quality\nreconstruction, accurate pose estimation, and real-time rendering of scenes.\nHowever, these approaches are built on a tremendous number of redundant 3D\nGaussian ellipsoids, leading to high memory and storage costs, and slow\ntraining speed. To address the limitation, we propose a compact 3D Gaussian\nSplatting SLAM system that reduces the number and the parameter size of\nGaussian ellipsoids. A sliding window-based masking strategy is first proposed\nto reduce the redundant ellipsoids. Then we observe that the covariance matrix\n(geometry) of most 3D Gaussian ellipsoids are extremely similar, which\nmotivates a novel geometry codebook to compress 3D Gaussian geometric\nattributes, i.e., the parameters. Robust and accurate pose estimation is\nachieved by a global bundle adjustment method with reprojection loss. Extensive\nexperiments demonstrate that our method achieves faster training and rendering\nspeed while maintaining the state-of-the-art (SOTA) quality of the scene\nrepresentation.\n","authors":["Tianchen Deng","Yaohui Chen","Leyan Zhang","Jianfei Yang","Shenghai Yuan","Jiuming Liu","Danwei Wang","Hesheng Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13824v2","updated":"2024-09-27T06:13:36Z","published":"2023-11-23T06:36:57Z","title":"Constraint-Guided Online Data Selection for Scalable Data-Driven Safety\n Filters in Uncertain Robotic Systems","summary":" As the use of autonomous robots expands in tasks that are complex and\nchallenging to model, the demand for robust data-driven control methods that\ncan certify safety and stability in uncertain conditions is increasing.\nHowever, the practical implementation of these methods often faces scalability\nissues due to the growing amount of data points with system complexity, and a\nsignificant reliance on high-quality training data. In response to these\nchallenges, this study presents a scalable data-driven controller that\nefficiently identifies and infers from the most informative data points for\nimplementing data-driven safety filters. Our approach is grounded in the\nintegration of a model-based certificate function-based method and Gaussian\nProcess (GP) regression, reinforced by a novel online data selection algorithm\nthat reduces time complexity from quadratic to linear relative to dataset size.\nEmpirical evidence, gathered from successful real-world cart-pole swing-up\nexperiments and simulated locomotion of a five-link bipedal robot, demonstrates\nthe efficacy of our approach. Our findings reveal that our efficient online\ndata selection algorithm, which strategically selects key data points, enhances\nthe practicality and efficiency of data-driven certifying filters in complex\nrobotic systems, significantly mitigating scalability concerns inherent in\nnonparametric learning-based control methods.\n","authors":["Jason J. Choi","Fernando Castañeda","Wonsuhk Jung","Bike Zhang","Claire J. Tomlin","Koushil Sreenath"],"pdf_url":"https://arxiv.org/pdf/2311.13824v2.pdf","comment":"The first three authors contributed equally to the work. This work\n has been submitted to the IEEE for possible publication. Copyright may be\n transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2409.18457v1","updated":"2024-09-27T05:31:33Z","published":"2024-09-27T05:31:33Z","title":"DynaWeightPnP: Toward global real-time 3D-2D solver in PnP without\n correspondences","summary":" This paper addresses a special Perspective-n-Point (PnP) problem: estimating\nthe optimal pose to align 3D and 2D shapes in real-time without\ncorrespondences, termed as correspondence-free PnP. While several studies have\nfocused on 3D and 2D shape registration, achieving both real-time and accurate\nperformance remains challenging. This study specifically targets the 3D-2D\ngeometric shape registration tasks, applying the recently developed Reproducing\nKernel Hilbert Space (RKHS) to address the \"big-to-small\" issue. An iterative\nreweighted least squares method is employed to solve the RKHS-based formulation\nefficiently. Moreover, our work identifies a unique and interesting\nobservability issue in correspondence-free PnP: the numerical ambiguity between\nrotation and translation. To address this, we proposed DynaWeightPnP,\nintroducing a dynamic weighting sub-problem and an alternative searching\nalgorithm designed to enhance pose estimation and alignment accuracy.\nExperiments were conducted on a typical case, that is, a 3D-2D vascular\ncenterline registration task within Endovascular Image-Guided Interventions\n(EIGIs). Results demonstrated that the proposed algorithm achieves registration\nprocessing rates of 60 Hz (without post-refinement) and 31 Hz (with\npost-refinement) on modern single-core CPUs, with competitive accuracy\ncomparable to existing methods. These results underscore the suitability of\nDynaWeightPnP for future robot navigation tasks like EIGIs.\n","authors":["Jingwei Song","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2409.18457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18452v1","updated":"2024-09-27T05:23:49Z","published":"2024-09-27T05:23:49Z","title":"Exploiting Physical Human-Robot Interaction to Provide a Unique Rolling\n Experience with a Riding Ballbot","summary":" This study introduces the development of hands-free control schemes for a\nriding ballbot, designed to allow riders including manual wheelchair users to\ncontrol its movement through torso leaning and twisting. The hardware platform,\nPersonal Unique Rolling Experience (PURE), utilizes a ballbot drivetrain, a\ndynamically stable mobile robot that uses a ball as its wheel to provide\nomnidirectional maneuverability. To accommodate users with varying torso motion\nfunctions, the hanads-free control scheme should be adjustable based on the\nrider's torso function and personal preferences. Therefore, concepts of (a)\nimpedance control and (b) admittance control were integrated into the control\nscheme. A duo-agent optimization framework was utilized to assess the\nefficiency of this rider-ballbot system for a safety-critical task: braking\nfrom 1.4 m/s. The candidate control schemes were further implemented in the\nphysical robot hardware and validated with two experienced users, demonstrating\nthe efficiency and robustness of the hands-free admittance control scheme\n(HACS). This interface, which utilized physical human-robot interaction (pHRI)\nas the input, resulted in lower braking effort and shorter braking distance and\ntime. Subsequently, 12 novice participants (six able-bodied users and six\nmanual wheelchair users) with different levels of torso motion capability were\nthen recruited to benchmark the braking performance with HACS. The indoor\nnavigation capability of PURE was further demonstrated with these participants\nin courses simulating narrow hallways, tight turns, and navigation through\nstatic and dynamic obstacles. By exploiting pHRI, the proposed admittance-style\ncontrol scheme provided effective control of the ballbot via torso motions.\nThis interface enables PURE to provide a personal unique rolling experience to\nmanual wheelchair users for safe and agile indoor navigation.\n","authors":["Chenzhang Xiao","Seung Yun Song","Yu Chen","Mahshid Mansouri","João Ramos","Adam W. Bleakney","William R. Norris","Elizabeth T. Hsiao-Wecksler"],"pdf_url":"https://arxiv.org/pdf/2409.18452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18434v1","updated":"2024-09-27T03:52:45Z","published":"2024-09-27T03:52:45Z","title":"Get It For Free: Radar Segmentation without Expert Labels and Its\n Application in Odometry and Localization","summary":" This paper presents a novel weakly supervised semantic segmentation method\nfor radar segmentation, where the existing LiDAR semantic segmentation models\nare employed to generate semantic labels, which then serve as supervision\nsignals for training a radar semantic segmentation model. The obtained radar\nsemantic segmentation model outperforms LiDAR-based models, providing more\nconsistent and robust segmentation under all-weather conditions, particularly\nin the snow, rain and fog. To mitigate potential errors in LiDAR semantic\nlabels, we design a dedicated refinement scheme that corrects erroneous labels\nbased on structural features and distribution patterns. The semantic\ninformation generated by our radar segmentation model is used in two downstream\ntasks, achieving significant performance improvements. In large-scale\nradar-based localization using OpenStreetMap, it leads to localization error\nreduction by 20.55\\% over prior methods. For the odometry task, it improves\ntranslation accuracy by 16.4\\% compared to the second-best method, securing the\nfirst place in the radar odometry competition at the Radar in Robotics workshop\nof ICRA 2024, Japan\n","authors":["Siru Li","Ziyang Hong","Yushuai Chen","Liang Hu","Jiahu Qin"],"pdf_url":"https://arxiv.org/pdf/2409.18434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18411v1","updated":"2024-09-27T02:58:46Z","published":"2024-09-27T02:58:46Z","title":"BoT-Drive: Hierarchical Behavior and Trajectory Planning for Autonomous\n Driving using POMDPs","summary":" Uncertainties in dynamic road environments pose significant challenges for\nbehavior and trajectory planning in autonomous driving. This paper introduces\nBoT-Drive, a planning algorithm that addresses uncertainties at both behavior\nand trajectory levels within a Partially Observable Markov Decision Process\n(POMDP) framework. BoT-Drive employs driver models to characterize unknown\nbehavioral intentions and utilizes their model parameters to infer hidden\ndriving styles. By also treating driver models as decision-making actions for\nthe autonomous vehicle, BoT-Drive effectively tackles the exponential\ncomplexity inherent in POMDPs. To enhance safety and robustness, the planner\nfurther applies importance sampling to refine the driving trajectory\nconditioned on the planned high-level behavior. Evaluation on real-world data\nshows that BoT-Drive consistently outperforms both existing planning methods\nand learning-based methods in regular and complex urban driving scenes,\ndemonstrating significant improvements in driving safety and reliability.\n","authors":["Xuanjin Jin","Chendong Zeng","Shengfa Zhu","Chunxiao Liu","Panpan Cai"],"pdf_url":"https://arxiv.org/pdf/2409.18411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01733v2","updated":"2024-09-27T02:49:40Z","published":"2024-07-01T19:07:10Z","title":"AquaMILR: Mechanical intelligence simplifies control of undulatory\n robots in cluttered fluid environments","summary":" While undulatory swimming of elongate limbless robots has been extensively\nstudied in open hydrodynamic environments, less research has been focused on\nlimbless locomotion in complex, cluttered aquatic environments. Motivated by\nthe concept of mechanical intelligence, where controls for obstacle navigation\ncan be offloaded to passive body mechanics in terrestrial limbless locomotion,\nwe hypothesize that principles of mechanical intelligence can be extended to\ncluttered hydrodynamic regimes. To test this, we developed an untethered\nlimbless robot capable of undulatory swimming on water surfaces, utilizing a\nbilateral cable-driven mechanism inspired by organismal muscle actuation\nmorphology to achieve programmable anisotropic body compliance. We demonstrated\nthrough robophysical experiments that, similar to terrestrial locomotion, an\nappropriate level of body compliance can facilitate emergent swim through\ncomplex hydrodynamic environments under pure open-loop control. Moreover, we\nfound that swimming performance depends on undulation frequency, with effective\nlocomotion achieved only within a specific frequency range. This contrasts with\nhighly damped terrestrial regimes, where inertial effects can often be\nneglected. Further, to enhance performance and address the challenges posed by\nnondeterministic obstacle distributions, we incorporated computational\nintelligence by developing a real-time body compliance tuning controller based\non cable tension feedback. This controller improves the robot's robustness and\noverall speed in heterogeneous hydrodynamic environments.\n","authors":["Tianyu Wang","Nishanth Mankame","Matthew Fernandez","Velin Kojouharov","Daniel I. Goldman"],"pdf_url":"https://arxiv.org/pdf/2407.01733v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03954v7","updated":"2024-09-27T02:43:48Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v7.pdf","comment":"Published at Robotics: Science and Systems (RSS) 2024. Videos, code,\n and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2409.18405v1","updated":"2024-09-27T02:42:55Z","published":"2024-09-27T02:42:55Z","title":"Word2Wave: Language Driven Mission Programming for Efficient Subsea\n Deployments of Marine Robots","summary":" This paper explores the design and development of a language-based interface\nfor dynamic mission programming of autonomous underwater vehicles (AUVs). The\nproposed 'Word2Wave' (W2W) framework enables interactive programming and\nparameter configuration of AUVs for remote subsea missions. The W2W framework\nincludes: (i) a set of novel language rules and command structures for\nefficient language-to-mission mapping; (ii) a GPT-based prompt engineering\nmodule for training data generation; (iii) a small language model (SLM)-based\nsequence-to-sequence learning pipeline for mission command generation from\nhuman speech or text; and (iv) a novel user interface for 2D mission map\nvisualization and human-machine interfacing. The proposed learning pipeline\nadapts an SLM named T5-Small that can learn language-to-mission mapping from\nprocessed language data effectively, providing robust and efficient\nperformance. In addition to a benchmark evaluation with state-of-the-art, we\nconduct a user interaction study to demonstrate the effectiveness of W2W over\ncommercial AUV programming interfaces. Across participants, W2W-based\nprogramming required less than 10% time for mission programming compared to\ntraditional interfaces; it is deemed to be a simpler and more natural paradigm\nfor subsea mission programming with a usability score of 76.25. W2W opens up\npromising future research opportunities on hands-free AUV mission programming\nfor efficient subsea deployments.\n","authors":["Ruo Chen","David Blow","Adnan Abdullah","Md Jahidul Islam"],"pdf_url":"https://arxiv.org/pdf/2409.18405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18394v1","updated":"2024-09-27T02:24:07Z","published":"2024-09-27T02:24:07Z","title":"An Augmented Reality Interface for Teleoperating Robot Manipulators:\n Reducing Demonstrator Task Load through Digital Twin Control","summary":" Acquiring high-quality demonstration data is essential for the success of\ndata-driven methods, such as imitation learning. Existing platforms for\nproviding demonstrations for manipulation tasks often impose significant\nphysical and mental demands on the demonstrator, require additional hardware\nsystems, or necessitate specialized domain knowledge. In this work, we present\na novel augmented reality (AR) interface for teleoperating robotic\nmanipulators, emphasizing the demonstrator's experience, particularly in the\ncontext of performing complex tasks that require precision and accuracy. This\ninterface, designed for the Microsoft HoloLens 2, leverages the adaptable\nnature of mixed reality (MR), enabling users to control a physical robot\nthrough digital twin surrogates. We assess the effectiveness of our approach\nacross three complex manipulation tasks and compare its performance against\nOPEN TEACH, a recent virtual reality (VR) teleoperation system, as well as two\ntraditional control methods: kinesthetic teaching and a 3D SpaceMouse for\nend-effector control. Our findings show that our method performs comparably to\nthe VR approach and demonstrates the potential for AR in data collection.\nAdditionally, we conduct a pilot study to evaluate the usability and task load\nassociated with each method. Results indicate that our AR-based system achieves\nhigher usability scores than the VR benchmark and significantly reduces mental\ndemand, physical effort, and frustration experienced by users. An accompanying\nvideo can be found at https://youtu.be/w-M58ohPgrA.\n","authors":["Aliyah Smith","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2409.18394v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18390v1","updated":"2024-09-27T02:12:56Z","published":"2024-09-27T02:12:56Z","title":"Speech to Reality: On-Demand Production using Natural Language, 3D\n Generative AI, and Discrete Robotic Assembly","summary":" We present a system that transforms speech into physical objects by combining\n3D generative Artificial Intelligence with robotic assembly. The system\nleverages natural language input to make design and manufacturing more\naccessible, enabling individuals without expertise in 3D modeling or robotic\nprogramming to create physical objects. We propose utilizing discrete robotic\nassembly of lattice-based voxel components to address the challenges of using\ngenerative AI outputs in physical production, such as design variability,\nfabrication speed, structural integrity, and material waste. The system\ninterprets speech to generate 3D objects, discretizes them into voxel\ncomponents, computes an optimized assembly sequence, and generates a robotic\ntoolpath. The results are demonstrated through the assembly of various objects,\nranging from chairs to shelves, which are prompted via speech and realized\nwithin 5 minutes using a 6-axis robotic arm.\n","authors":["Alexander Htet Kyaw","Se Hwan Jeon","Miana Smith","Neil Gershenfeld"],"pdf_url":"https://arxiv.org/pdf/2409.18390v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible. An updated version will replace this version"},{"id":"http://arxiv.org/abs/2409.18385v1","updated":"2024-09-27T02:01:05Z","published":"2024-09-27T02:01:05Z","title":"Robo-CSK-Organizer: Commonsense Knowledge to Organize Detected Objects\n for Multipurpose Robots","summary":" This paper presents a system called Robo-CSK-Organizer that infuses\ncommonsense knowledge from a classical knowledge based to enhance the context\nrecognition capabilities of robots so as to facilitate the organization of\ndetected objects by classifying them in a task-relevant manner. It is\nparticularly useful in multipurpose robotics. Unlike systems relying solely on\ndeep learning tools such as ChatGPT, the Robo-CSK-Organizer system stands out\nin multiple avenues as follows. It resolves ambiguities well, and maintains\nconsistency in object placement. Moreover, it adapts to diverse task-based\nclassifications. Furthermore, it contributes to explainable AI, hence helping\nto improve trust and human-robot collaboration. Controlled experiments\nperformed in our work, simulating domestic robotics settings, make\nRobo-CSK-Organizer demonstrate superior performance while placing objects in\ncontextually relevant locations. This work highlights the capacity of an\nAI-based system to conduct commonsense-guided decision-making in robotics\ncloser to the thresholds of human cognition. Hence, Robo-CSK-Organizer makes\npositive impacts on AI and robotics.\n","authors":["Rafael Hidalgo","Jesse Parron","Aparna S. Varde","Weitian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18383v1","updated":"2024-09-27T01:49:56Z","published":"2024-09-27T01:49:56Z","title":"AquaMILR+: Design of an untethered limbless robot for complex aquatic\n terrain navigation","summary":" This paper presents AquaMILR+, an untethered limbless robot designed for\nagile navigation in complex aquatic environments. The robot features a\nbilateral actuation mechanism that models musculoskeletal actuation in many\nanguilliform swimming organisms which propagates a moving wave from head to\ntail allowing open fluid undulatory swimming. This actuation mechanism employs\nmechanical intelligence, enhancing the robot's maneuverability when interacting\nwith obstacles. AquaMILR+ also includes a compact depth control system inspired\nby the swim bladder and lung structures of eels and sea snakes. The mechanism,\ndriven by a syringe and telescoping leadscrew, enables depth and pitch\ncontrol-capabilities that are difficult for most anguilliform swimming robots\nto achieve. Additional structures, such as fins and a tail, further improve\nstability and propulsion efficiency. Our tests in both open water and indoor 2D\nand 3D heterogeneous aquatic environments highlight AquaMILR+'s capabilities\nand suggest a promising system for complex underwater tasks such as search and\nrescue and deep-sea exploration.\n","authors":["Matthew Fernandez","Tianyu Wang","Galen Tunnicliffe","Donoven Dortilus","Peter Gunnarson","John O. Dabiri","Daniel I. Goldman"],"pdf_url":"https://arxiv.org/pdf/2409.18383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18382v1","updated":"2024-09-27T01:48:16Z","published":"2024-09-27T01:48:16Z","title":"CurricuLLM: Automatic Task Curricula Design for Learning Complex Robot\n Skills using Large Language Models","summary":" Curriculum learning is a training mechanism in reinforcement learning (RL)\nthat facilitates the achievement of complex policies by progressively\nincreasing the task difficulty during training. However, designing effective\ncurricula for a specific task often requires extensive domain knowledge and\nhuman intervention, which limits its applicability across various domains. Our\ncore idea is that large language models (LLMs), with their extensive training\non diverse language data and ability to encapsulate world knowledge, present\nsignificant potential for efficiently breaking down tasks and decomposing\nskills across various robotics environments. Additionally, the demonstrated\nsuccess of LLMs in translating natural language into executable code for RL\nagents strengthens their role in generating task curricula. In this work, we\npropose CurricuLLM, which leverages the high-level planning and programming\ncapabilities of LLMs for curriculum design, thereby enhancing the efficient\nlearning of complex target tasks. CurricuLLM consists of: (Step 1) Generating\nsequence of subtasks that aid target task learning in natural language form,\n(Step 2) Translating natural language description of subtasks in executable\ntask code, including the reward code and goal distribution code, and (Step 3)\nEvaluating trained policies based on trajectory rollout and subtask\ndescription. We evaluate CurricuLLM in various robotics simulation\nenvironments, ranging from manipulation, navigation, and locomotion, to show\nthat CurricuLLM can aid learning complex robot control tasks. In addition, we\nvalidate humanoid locomotion policy learned through CurricuLLM in real-world.\nThe code is provided in https://github.com/labicon/CurricuLLM\n","authors":["Kanghyun Ryu","Qiayuan Liao","Zhongyu Li","Koushil Sreenath","Negar Mehr"],"pdf_url":"https://arxiv.org/pdf/2409.18382v1.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2409.18361v1","updated":"2024-09-27T00:35:21Z","published":"2024-09-27T00:35:21Z","title":"iWalker: Imperative Visual Planning for Walking Humanoid Robot","summary":" Humanoid robots, with the potential to perform a broad range of tasks in\nenvironments designed for humans, have been deemed crucial for the basis of\ngeneral AI agents. When talking about planning and controlling, although\ntraditional models and task-specific methods have been extensively studied over\nthe past few decades, they are inadequate for achieving the flexibility and\nversatility needed for general autonomy. Learning approaches, especially\nreinforcement learning, are powerful and popular nowadays, but they are\ninherently \"blind\" during training, relying heavily on trials in simulation\nwithout proper guidance from physical principles or underlying dynamics. In\nresponse, we propose a novel end-to-end pipeline that seamlessly integrates\nperception, planning, and model-based control for humanoid robot walking. We\nrefer to our method as iWalker, which is driven by imperative learning (IL), a\nself-supervising neuro-symbolic learning framework. This enables the robot to\nlearn from arbitrary unlabeled data, significantly improving its adaptability\nand generalization capabilities. In experiments, iWalker demonstrates\neffectiveness in both simulated and real-world environments, representing a\nsignificant advancement toward versatile and autonomous humanoid robots.\n","authors":["Xiao Lin","Yuhao Huang","Taimeng Fu","Xiaobin Xiong","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03860v3","updated":"2024-09-27T00:06:21Z","published":"2024-02-06T10:18:30Z","title":"AED: Adaptable Error Detection for Few-shot Imitation Policy","summary":" We introduce a new task called Adaptable Error Detection (AED), which aims to\nidentify behavior errors in few-shot imitation (FSI) policies based on visual\nobservations in novel environments. The potential to cause serious damage to\nsurrounding areas limits the application of FSI policies in real-world\nscenarios. Thus, a robust system is necessary to notify operators when FSI\npolicies are inconsistent with the intent of demonstrations. This task\nintroduces three challenges: (1) detecting behavior errors in novel\nenvironments, (2) identifying behavior errors that occur without revealing\nnotable changes, and (3) lacking complete temporal information of the rollout\ndue to the necessity of online detection. However, the existing benchmarks\ncannot support the development of AED because their tasks do not present all\nthese challenges. To this end, we develop a cross-domain AED benchmark,\nconsisting of 322 base and 153 novel environments. Additionally, we propose\nPattern Observer (PrObe) to address these challenges. PrObe is equipped with a\npowerful pattern extractor and guided by novel learning objectives to parse\ndiscernible patterns in the policy feature representations of normal or error\nstates. Through our comprehensive evaluation, PrObe demonstrates superior\ncapability to detect errors arising from a wide range of FSI policies,\nconsistently surpassing strong baselines. Moreover, we conduct detailed\nablations and a pilot study on error correction to validate the effectiveness\nof the proposed architecture design and the practicality of the AED task,\nrespectively.\n","authors":["Jia-Fong Yeh","Kuo-Han Hung","Pang-Chi Lo","Chi-Ming Chung","Tsung-Han Wu","Hung-Ting Su","Yi-Ting Chen","Winston H. Hsu"],"pdf_url":"https://arxiv.org/pdf/2402.03860v3.pdf","comment":"Accepted to NeurIPS2024"},{"id":"http://arxiv.org/abs/2409.18352v1","updated":"2024-09-27T00:05:13Z","published":"2024-09-27T00:05:13Z","title":"A New 10-mg SMA-Based Fast Bimorph Actuator for Microrobotics","summary":" We present a new millimeter-scale bimorph actuator for microrobotic\napplications, driven by feedforward controlled shape-memory alloy (SMA) wires.\nThe device weighs 10 mg, measures 14 mm in length, and occupies a volume of 4.8\nmm3, which makes it the lightest and smallest fully functional SMA-based\nbimorph actuator for microrobotics developed to date. The experimentally\nmeasured operational bandwidth is on the order of 20 Hz, and the unimorph and\nbimorph maximum low-frequency displacement outputs are on the order of 3.5 and\n7 mm, respectively. To test and demonstrate the functionality and suitability\nof the actuator for microrobotics, we developed the Fish-&-Ribbon-Inspired\nSmall Swimming Harmonic roBot (FRISSHBot). Loosely inspired by carangiformes,\nthe FRISSHBot leverages fluid-structure interaction (FSI) phenomena to propel\nitself forward, weighs 30 mg, measures 34 mm in length, operates at frequencies\nof up to 4 Hz, and swims at speeds of up to 3.06 mm/s (0.09 Bl/s). This robot\nis the lightest and smallest swimmer with onboard actuation developed to date.\n","authors":["Conor K. Trygstad","Elijah K. Blankenship","Nestor O. Perez-Arancibia"],"pdf_url":"https://arxiv.org/pdf/2409.18352v1.pdf","comment":"To be presented at the 2024 IEEE/RSJ International Conference on\n Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2407.09890v2","updated":"2024-09-27T00:01:03Z","published":"2024-07-13T13:43:39Z","title":"Speech-Guided Sequential Planning for Autonomous Navigation using Large\n Language Model Meta AI 3 (Llama3)","summary":" In social robotics, a pivotal focus is enabling robots to engage with humans\nin a more natural and seamless manner. The emergence of advanced large language\nmodels (LLMs) such as Generative Pre-trained Transformers (GPTs) and\nautoregressive models like Large Language Model Meta AI (Llamas) has driven\nsignificant advancements in integrating natural language understanding\ncapabilities into social robots. This paper presents a system for speech-guided\nsequential planning in autonomous navigation, utilizing Llama3 and the Robot\nOperating System~(ROS). The proposed system involves using Llama3 to interpret\nvoice commands, extracting essential details through parsing, and decoding\nthese commands into sequential actions for tasks. Such sequential planning is\nessential in various domains, particularly in the pickup and delivery of an\nobject. Once a sequential navigation task is evaluated, we employ DRL-VO, a\nlearning-based control policy that allows a robot to autonomously navigate\nthrough social spaces with static infrastructure and (crowds of) people. We\ndemonstrate the effectiveness of the system in simulation experiment using\nTurtlebot 2 in ROS1 and Turtlebot 3 in ROS2. We conduct hardware trials using a\nClearpath Robotics Jackal UGV, highlighting its potential for real-world\ndeployment in scenarios requiring flexible and interactive robotic behaviors.\n","authors":["Alkesh K. Srivastava","Philip Dames"],"pdf_url":"https://arxiv.org/pdf/2407.09890v2.pdf","comment":"Accepted at the 16th International Conference on Social Robotics + AI"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.17993v2","updated":"2024-09-27T02:35:47Z","published":"2024-09-26T16:04:31Z","title":"InterNet: Unsupervised Cross-modal Homography Estimation Based on\n Interleaved Modality Transfer and Self-supervised Homography Prediction","summary":" We propose a novel unsupervised cross-modal homography estimation framework,\nbased on interleaved modality transfer and self-supervised homography\nprediction, named InterNet. InterNet integrates modality transfer and\nself-supervised homography estimation, introducing an innovative interleaved\noptimization framework to alternately promote both components. The modality\ntransfer gradually narrows the modality gaps, facilitating the self-supervised\nhomography estimation to fully leverage the synthetic intra-modal data. The\nself-supervised homography estimation progressively achieves reliable\npredictions, thereby providing robust cross-modal supervision for the modality\ntransfer. To further boost the estimation accuracy, we also formulate a\nfine-grained homography feature loss to improve the connection between two\ncomponents. Furthermore, we employ a simple yet effective distillation training\ntechnique to reduce model parameters and improve cross-domain generalization\nability while maintaining comparable performance. Experiments reveal that\nInterNet achieves the state-of-the-art (SOTA) performance among unsupervised\nmethods, and even outperforms many supervised methods such as MHN and\nLocalTrans.\n","authors":["Junchen Yu","Si-Yuan Cao","Runmin Zhang","Chenghao Zhang","Jianxin Hu","Zhu Yu","Beinan Yu","Hui-liang Shen"],"pdf_url":"https://arxiv.org/pdf/2409.17993v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17851v2","updated":"2024-09-27T15:59:45Z","published":"2024-09-26T13:57:05Z","title":"A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts","summary":" Monocular depth estimation is a critical task for autonomous driving and many\nother computer vision applications. While significant progress has been made in\nthis field, the effects of viewpoint shifts on depth estimation models remain\nlargely underexplored. This paper introduces a novel dataset and evaluation\nmethodology to quantify the impact of different camera positions and\norientations on monocular depth estimation performance. We propose a ground\ntruth strategy based on homography estimation and object detection, eliminating\nthe need for expensive lidar sensors. We collect a diverse dataset of road\nscenes from multiple viewpoints and use it to assess the robustness of a modern\ndepth estimation model to geometric shifts. After assessing the validity of our\nstrategy on a public dataset, we provide valuable insights into the limitations\nof current models and highlight the importance of considering viewpoint\nvariations in real-world applications.\n","authors":["Aurel Pjetri","Stefano Caprasecca","Leonardo Taccari","Matteo Simoncini","Henrique Piñeiro Monteagudo","Walter Wallace","Douglas Coimbra de Andrade","Francesco Sambo","Andrew David Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2409.17851v2.pdf","comment":"17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on\n Vision-Centric Autonomous Driving (VCAD)"},{"id":"http://arxiv.org/abs/2409.17763v2","updated":"2024-09-27T06:50:21Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50% of papers do not\nassess performance variability at all. Moreover, only one (0.5%) paper reported\nconfidence intervals (CIs) for model performance. (2) To address the reporting\nbottleneck, we show that the unreported standard deviation (SD) in segmentation\npapers can be approximated by a second-order polynomial function of the mean\nDice similarity coefficient (DSC). Based on external validation data from 56\nprevious MICCAI challenges, we demonstrate that this approximation can\naccurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v2.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17671v2","updated":"2024-09-27T10:02:53Z","published":"2024-09-26T09:30:37Z","title":"Leveraging Anthropometric Measurements to Improve Human Mesh Estimation\n and Ensure Consistent Body Shapes","summary":" The basic body shape of a person does not change within a single video.\nHowever, most SOTA human mesh estimation (HME) models output a slightly\ndifferent body shape for each video frame, which results in inconsistent body\nshapes for the same person. In contrast, we leverage anthropometric\nmeasurements like tailors are already obtaining from humans for centuries. We\ncreate a model called A2B that converts such anthropometric measurements to\nbody shape parameters of human mesh models. Moreover, we find that finetuned\nSOTA 3D human pose estimation (HPE) models outperform HME models regarding the\nprecision of the estimated keypoints. We show that applying inverse kinematics\n(IK) to the results of such a 3D HPE model and combining the resulting body\npose with the A2B body shape leads to superior and consistent human meshes for\nchallenging datasets like ASPset or fit3D, where we can lower the MPJPE by over\n30 mm compared to SOTA HME models. Further, replacing HME models estimates of\nthe body shape parameters with A2B model results not only increases the\nperformance of these HME models, but also leads to consistent body shapes.\n","authors":["Katja Ludwig","Julian Lorenz","Daniel Kienzle","Tuan Bui","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2409.17671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16225v3","updated":"2024-09-27T14:04:22Z","published":"2024-09-24T16:38:41Z","title":"VideoPatchCore: An Effective Method to Memorize Normality for Video\n Anomaly Detection","summary":" Video anomaly detection (VAD) is a crucial task in video analysis and\nsurveillance within computer vision. Currently, VAD is gaining attention with\nmemory techniques that store the features of normal frames. The stored features\nare utilized for frame reconstruction, identifying an abnormality when a\nsignificant difference exists between the reconstructed and input frames.\nHowever, this approach faces several challenges due to the simultaneous\noptimization required for both the memory and encoder-decoder model. These\nchallenges include increased optimization difficulty, complexity of\nimplementation, and performance variability depending on the memory size. To\naddress these challenges,we propose an effective memory method for VAD, called\nVideoPatchCore. Inspired by PatchCore, our approach introduces a structure that\nprioritizes memory optimization and configures three types of memory tailored\nto the characteristics of video data. This method effectively addresses the\nlimitations of existing memory-based methods, achieving good performance\ncomparable to state-of-the-art methods. Furthermore, our method requires no\ntraining and is straightforward to implement, making VAD tasks more accessible.\nOur code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore.\n","authors":["Sunghyun Ahn","Youngwan Jo","Kijung Lee","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2409.16225v3.pdf","comment":"Accepted to ACCV 2024"},{"id":"http://arxiv.org/abs/2409.18964v1","updated":"2024-09-27T17:59:57Z","published":"2024-09-27T17:59:57Z","title":"PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation","summary":" We present PhysGen, a novel image-to-video generation method that converts a\nsingle image and an input condition (e.g., force and torque applied to an\nobject in the image) to produce a realistic, physically plausible, and\ntemporally consistent video. Our key insight is to integrate model-based\nphysical simulation with a data-driven video generation process, enabling\nplausible image-space dynamics. At the heart of our system are three core\ncomponents: (i) an image understanding module that effectively captures the\ngeometry, materials, and physical parameters of the image; (ii) an image-space\ndynamics simulation model that utilizes rigid-body physics and inferred\nparameters to simulate realistic behaviors; and (iii) an image-based rendering\nand refinement module that leverages generative video diffusion to produce\nrealistic video footage featuring the simulated motion. The resulting videos\nare realistic in both physics and appearance and are even precisely\ncontrollable, showcasing superior results over existing data-driven\nimage-to-video generation works through quantitative comparison and\ncomprehensive user study. PhysGen's resulting videos can be used for various\ndownstream applications, such as turning an image into a realistic animation or\nallowing users to interact with the image and create various dynamics. Project\npage: https://stevenlsw.github.io/physgen/\n","authors":["Shaowei Liu","Zhongzheng Ren","Saurabh Gupta","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18964v1.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://stevenlsw.github.io/physgen/"},{"id":"http://arxiv.org/abs/2409.18962v1","updated":"2024-09-27T17:59:50Z","published":"2024-09-27T17:59:50Z","title":"Exploring Token Pruning in Vision State Space Models","summary":" State Space Models (SSMs) have the advantage of keeping linear computational\ncomplexity compared to attention modules in transformers, and have been applied\nto vision tasks as a new type of powerful vision foundation model. Inspired by\nthe observations that the final prediction in vision transformers (ViTs) is\nonly based on a subset of most informative tokens, we take the novel step of\nenhancing the efficiency of SSM-based vision models through token-based\npruning. However, direct applications of existing token pruning techniques\ndesigned for ViTs fail to deliver good performance, even with extensive\nfine-tuning. To address this issue, we revisit the unique computational\ncharacteristics of SSMs and discover that naive application disrupts the\nsequential token positions. This insight motivates us to design a novel and\ngeneral token pruning method specifically for SSM-based vision models. We first\nintroduce a pruning-aware hidden state alignment method to stabilize the\nneighborhood of remaining tokens for performance enhancement. Besides, based on\nour detailed analysis, we propose a token importance evaluation method adapted\nfor SSM models, to guide the token pruning. With efficient implementation and\npractical acceleration methods, our method brings actual speedup. Extensive\nexperiments demonstrate that our approach can achieve significant computation\nreduction with minimal impact on performance across different tasks. Notably,\nwe achieve 81.7\\% accuracy on ImageNet with a 41.6\\% reduction in the FLOPs for\npruned PlainMamba-L3. Furthermore, our work provides deeper insights into\nunderstanding the behavior of SSM-based vision models for future research.\n","authors":["Zheng Zhan","Zhenglun Kong","Yifan Gong","Yushu Wu","Zichong Meng","Hangyu Zheng","Xuan Shen","Stratis Ioannidis","Wei Niu","Pu Zhao","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18962v1.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2409.18961v1","updated":"2024-09-27T17:59:42Z","published":"2024-09-27T17:59:42Z","title":"ProMerge: Prompt and Merge for Unsupervised Instance Segmentation","summary":" Unsupervised instance segmentation aims to segment distinct object instances\nin an image without relying on human-labeled data. This field has recently seen\nsignificant advancements, partly due to the strong local correspondences\nafforded by rich visual feature representations from self-supervised models\n(e.g., DINO). Recent state-of-the-art approaches use self-supervised features\nto represent images as graphs and solve a generalized eigenvalue system (i.e.,\nnormalized-cut) to generate foreground masks. While effective, this strategy is\nlimited by its attendant computational demands, leading to slow inference\nspeeds. In this paper, we propose Prompt and Merge (ProMerge), which leverages\nself-supervised visual features to obtain initial groupings of patches and\napplies a strategic merging to these segments, aided by a sophisticated\nbackground-based mask pruning technique. ProMerge not only yields competitive\nresults but also offers a significant reduction in inference time compared to\nstate-of-the-art normalized-cut-based approaches. Furthermore, when training an\nobject detector using our mask predictions as pseudo-labels, the resulting\ndetector surpasses the current leading unsupervised model on various\nchallenging instance segmentation benchmarks.\n","authors":["Dylan Li","Gyungin Shin"],"pdf_url":"https://arxiv.org/pdf/2409.18961v1.pdf","comment":"ECCV2024 camera-ready"},{"id":"http://arxiv.org/abs/2409.18953v1","updated":"2024-09-27T17:56:04Z","published":"2024-09-27T17:56:04Z","title":"UniCal: Unified Neural Sensor Calibration","summary":" Self-driving vehicles (SDVs) require accurate calibration of LiDARs and\ncameras to fuse sensor data accurately for autonomy. Traditional calibration\nmethods typically leverage fiducials captured in a controlled and structured\nscene and compute correspondences to optimize over. These approaches are costly\nand require substantial infrastructure and operations, making it challenging to\nscale for vehicle fleets. In this work, we propose UniCal, a unified framework\nfor effortlessly calibrating SDVs equipped with multiple LiDARs and cameras.\nOur approach is built upon a differentiable scene representation capable of\nrendering multi-view geometrically and photometrically consistent sensor\nobservations. We jointly learn the sensor calibration and the underlying scene\nrepresentation through differentiable volume rendering, utilizing outdoor\nsensor data without the need for specific calibration fiducials. This\n\"drive-and-calibrate\" approach significantly reduces costs and operational\noverhead compared to existing calibration systems, enabling efficient\ncalibration for large SDV fleets at scale. To ensure geometric consistency\nacross observations from different sensors, we introduce a novel surface\nalignment loss that combines feature-based registration with neural rendering.\nComprehensive evaluations on multiple datasets demonstrate that UniCal\noutperforms or matches the accuracy of existing calibration approaches while\nbeing more efficient, demonstrating the value of UniCal for scalable\ncalibration.\n","authors":["Ze Yang","George Chen","Haowei Zhang","Kevin Ta","Ioan Andrei Bârsan","Daniel Murphy","Sivabalan Manivasagam","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2409.18953v1.pdf","comment":"ECCV 2024. Project page: https://waabi.ai/unical/"},{"id":"http://arxiv.org/abs/2409.18951v1","updated":"2024-09-27T17:52:08Z","published":"2024-09-27T17:52:08Z","title":"Spectral Wavelet Dropout: Regularization in the Wavelet Domain","summary":" Regularization techniques help prevent overfitting and therefore improve the\nability of convolutional neural networks (CNNs) to generalize. One reason for\noverfitting is the complex co-adaptations among different parts of the network,\nwhich make the CNN dependent on their joint response rather than encouraging\neach part to learn a useful feature representation independently. Frequency\ndomain manipulation is a powerful strategy for modifying data that has temporal\nand spatial coherence by utilizing frequency decomposition. This work\nintroduces Spectral Wavelet Dropout (SWD), a novel regularization method that\nincludes two variants: 1D-SWD and 2D-SWD. These variants improve CNN\ngeneralization by randomly dropping detailed frequency bands in the discrete\nwavelet decomposition of feature maps. Our approach distinguishes itself from\nthe pre-existing Spectral \"Fourier\" Dropout (2D-SFD), which eliminates\ncoefficients in the Fourier domain. Notably, SWD requires only a single\nhyperparameter, unlike the two required by SFD. We also extend the literature\nby implementing a one-dimensional version of Spectral \"Fourier\" Dropout\n(1D-SFD), setting the stage for a comprehensive comparison. Our evaluation\nshows that both 1D and 2D SWD variants have competitive performance on\nCIFAR-10/100 benchmarks relative to both 1D-SFD and 2D-SFD. Specifically,\n1D-SWD has a significantly lower computational complexity compared to\n1D/2D-SFD. In the Pascal VOC Object Detection benchmark, SWD variants surpass\n1D-SFD and 2D-SFD in performance and demonstrate lower computational complexity\nduring training.\n","authors":["Rinor Cakaj","Jens Mehnert","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18951v1.pdf","comment":"Accepted by The International Conference on Machine Learning and\n Applications (ICMLA) 2024"},{"id":"http://arxiv.org/abs/2409.18938v1","updated":"2024-09-27T17:38:36Z","published":"2024-09-27T17:38:36Z","title":"From Seconds to Hours: Reviewing MultiModal Large Language Models on\n Comprehensive Long Video Understanding","summary":" The integration of Large Language Models (LLMs) with visual encoders has\nrecently shown promising performance in visual understanding tasks, leveraging\ntheir inherent capability to comprehend and generate human-like text for visual\nreasoning. Given the diverse nature of visual data, MultiModal Large Language\nModels (MM-LLMs) exhibit variations in model designing and training for\nunderstanding images, short videos, and long videos. Our paper focuses on the\nsubstantial differences and unique challenges posed by long video understanding\ncompared to static image and short video understanding. Unlike static images,\nshort videos encompass sequential frames with both spatial and within-event\ntemporal information, while long videos consist of multiple events with\nbetween-event and long-term temporal information. In this survey, we aim to\ntrace and summarize the advancements of MM-LLMs from image understanding to\nlong video understanding. We review the differences among various visual\nunderstanding tasks and highlight the challenges in long video understanding,\nincluding more fine-grained spatiotemporal details, dynamic events, and\nlong-term dependencies. We then provide a detailed summary of the advancements\nin MM-LLMs in terms of model design and training methodologies for\nunderstanding long videos. Finally, we compare the performance of existing\nMM-LLMs on video understanding benchmarks of various lengths and discuss\npotential future directions for MM-LLMs in long video understanding.\n","authors":["Heqing Zou","Tianze Luo","Guiyang Xie"," Victor"," Zhang","Fengmao Lv","Guangcong Wang","Juanyang Chen","Zhuochen Wang","Hansheng Zhang","Huaijian Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18938v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2407.13027v2","updated":"2024-09-27T17:29:43Z","published":"2024-07-17T21:28:20Z","title":"SpaRED benchmark: Enhancing Gene Expression Prediction from Histology\n Images with Spatial Transcriptomics Completion","summary":" Spatial Transcriptomics is a novel technology that aligns histology images\nwith spatially resolved gene expression profiles. Although groundbreaking, it\nstruggles with gene capture yielding high corruption in acquired data. Given\npotential applications, recent efforts have focused on predicting\ntranscriptomic profiles solely from histology images. However, differences in\ndatabases, preprocessing techniques, and training hyperparameters hinder a fair\ncomparison between methods. To address these challenges, we present a\nsystematically curated and processed database collected from 26 public sources,\nrepresenting an 8.6-fold increase compared to previous works. Additionally, we\npropose a state-of-the-art transformer based completion technique for inferring\nmissing gene expression, which significantly boosts the performance of\ntranscriptomic profile predictions across all datasets. Altogether, our\ncontributions constitute the most comprehensive benchmark of gene expression\nprediction from histology images to date and a stepping stone for future\nresearch on spatial transcriptomics.\n","authors":["Gabriel Mejia","Daniela Ruiz","Paula Cárdenas","Leonardo Manrique","Daniela Vega","Pablo Arbeláez"],"pdf_url":"https://arxiv.org/pdf/2407.13027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18932v1","updated":"2024-09-27T17:29:23Z","published":"2024-09-27T17:29:23Z","title":"ReviveDiff: A Universal Diffusion Model for Restoring Images in Adverse\n Weather Conditions","summary":" Images captured in challenging environments--such as nighttime, foggy, rainy\nweather, and underwater--often suffer from significant degradation, resulting\nin a substantial loss of visual quality. Effective restoration of these\ndegraded images is critical for the subsequent vision tasks. While many\nexisting approaches have successfully incorporated specific priors for\nindividual tasks, these tailored solutions limit their applicability to other\ndegradations. In this work, we propose a universal network architecture, dubbed\n\"ReviveDiff\", which can address a wide range of degradations and bring images\nback to life by enhancing and restoring their quality. Our approach is inspired\nby the observation that, unlike degradation caused by movement or electronic\nissues, quality degradation under adverse conditions primarily stems from\nnatural media (such as fog, water, and low luminance), which generally\npreserves the original structures of objects. To restore the quality of such\nimages, we leveraged the latest advancements in diffusion models and developed\nReviveDiff to restore image quality from both macro and micro levels across\nsome key factors determining image quality, such as sharpness, distortion,\nnoise level, dynamic range, and color accuracy. We rigorously evaluated\nReviveDiff on seven benchmark datasets covering five types of degrading\nconditions: Rainy, Underwater, Low-light, Smoke, and Nighttime Hazy. Our\nexperimental results demonstrate that ReviveDiff outperforms the\nstate-of-the-art methods both quantitatively and visually.\n","authors":["Wenfeng Huang","Guoan Xu","Wenjing Jia","Stuart Perry","Guangwei Gao"],"pdf_url":"https://arxiv.org/pdf/2409.18932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18922v1","updated":"2024-09-27T17:13:25Z","published":"2024-09-27T17:13:25Z","title":"SurfaceAI: Automated creation of cohesive road surface quality datasets\n based on open street-level imagery","summary":" This paper introduces SurfaceAI, a pipeline designed to generate\ncomprehensive georeferenced datasets on road surface type and quality from\nopenly available street-level imagery. The motivation stems from the\nsignificant impact of road unevenness on the safety and comfort of traffic\nparticipants, especially vulnerable road users, emphasizing the need for\ndetailed road surface data in infrastructure modeling and analysis. SurfaceAI\naddresses this gap by leveraging crowdsourced Mapillary data to train models\nthat predict the type and quality of road surfaces visible in street-level\nimages, which are then aggregated to provide cohesive information on entire\nroad segment conditions.\n","authors":["Alexandra Kapp","Edith Hoffmann","Esther Weigmann","Helena Mihaljević"],"pdf_url":"https://arxiv.org/pdf/2409.18922v1.pdf","comment":"4 pages, 2 figures; accepted at 2nd ACM SIGSPATIAL International\n Workshop on Advances in Urban-AI"},{"id":"http://arxiv.org/abs/2409.18901v1","updated":"2024-09-27T16:39:50Z","published":"2024-09-27T16:39:50Z","title":"Improving Visual Object Tracking through Visual Prompting","summary":" Learning a discriminative model to distinguish a target from its surrounding\ndistractors is essential to generic visual object tracking. Dynamic target\nrepresentation adaptation against distractors is challenging due to the limited\ndiscriminative capabilities of prevailing trackers. We present a new visual\nPrompting mechanism for generic Visual Object Tracking (PiVOT) to address this\nissue. PiVOT proposes a prompt generation network with the pre-trained\nfoundation model CLIP to automatically generate and refine visual prompts,\nenabling the transfer of foundation model knowledge for tracking. While CLIP\noffers broad category-level knowledge, the tracker, trained on\ninstance-specific data, excels at recognizing unique object instances. Thus,\nPiVOT first compiles a visual prompt highlighting potential target locations.\nTo transfer the knowledge of CLIP to the tracker, PiVOT leverages CLIP to\nrefine the visual prompt based on the similarities between candidate objects\nand the reference templates across potential targets. Once the visual prompt is\nrefined, it can better highlight potential target locations, thereby reducing\nirrelevant prompt information. With the proposed prompting mechanism, the\ntracker can generate improved instance-aware feature maps through the guidance\nof the visual prompt, thus effectively reducing distractors. The proposed\nmethod does not involve CLIP during training, thereby keeping the same training\ncomplexity and preserving the generalization capability of the pretrained\nfoundation model. Extensive experiments across multiple benchmarks indicate\nthat PiVOT, using the proposed prompting method can suppress distracting\nobjects and enhance the tracker.\n","authors":["Shih-Fang Chen","Jun-Cheng Chen","I-Hong Jhuo","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2409.18901v1.pdf","comment":"Accepted and to appear in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2409.18899v1","updated":"2024-09-27T16:37:27Z","published":"2024-09-27T16:37:27Z","title":"Unsupervised Low-light Image Enhancement with Lookup Tables and\n Diffusion Priors","summary":" Low-light image enhancement (LIE) aims at precisely and efficiently\nrecovering an image degraded in poor illumination environments. Recent advanced\nLIE techniques are using deep neural networks, which require lots of low-normal\nlight image pairs, network parameters, and computational resources. As a\nresult, their practicality is limited. In this work, we devise a novel\nunsupervised LIE framework based on diffusion priors and lookup tables (DPLUT)\nto achieve efficient low-light image recovery. The proposed approach comprises\ntwo critical components: a light adjustment lookup table (LLUT) and a noise\nsuppression lookup table (NLUT). LLUT is optimized with a set of unsupervised\nlosses. It aims at predicting pixel-wise curve parameters for the dynamic range\nadjustment of a specific image. NLUT is designed to remove the amplified noise\nafter the light brightens. As diffusion models are sensitive to noise,\ndiffusion priors are introduced to achieve high-performance noise suppression.\nExtensive experiments demonstrate that our approach outperforms\nstate-of-the-art methods in terms of visual quality and efficiency.\n","authors":["Yunlong Lin","Zhenqi Fu","Kairun Wen","Tian Ye","Sixiang Chen","Ge Meng","Yingying Wang","Yue Huang","Xiaotong Tu","Xinghao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.18899v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.18897v1","updated":"2024-09-27T16:34:48Z","published":"2024-09-27T16:34:48Z","title":"Detecting Dataset Abuse in Fine-Tuning Stable Diffusion Models for\n Text-to-Image Synthesis","summary":" Text-to-image synthesis has become highly popular for generating realistic\nand stylized images, often requiring fine-tuning generative models with\ndomain-specific datasets for specialized tasks. However, these valuable\ndatasets face risks of unauthorized usage and unapproved sharing, compromising\nthe rights of the owners. In this paper, we address the issue of dataset abuse\nduring the fine-tuning of Stable Diffusion models for text-to-image synthesis.\nWe present a dataset watermarking framework designed to detect unauthorized\nusage and trace data leaks. The framework employs two key strategies across\nmultiple watermarking schemes and is effective for large-scale dataset\nauthorization. Extensive experiments demonstrate the framework's effectiveness,\nminimal impact on the dataset (only 2% of the data required to be modified for\nhigh detection accuracy), and ability to trace data leaks. Our results also\nhighlight the robustness and transferability of the framework, proving its\npractical applicability in detecting dataset abuse.\n","authors":["Songrui Wang","Yubo Zhu","Wei Tong","Sheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.18897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18896v1","updated":"2024-09-27T16:34:13Z","published":"2024-09-27T16:34:13Z","title":"S2O: Static to Openable Enhancement for Articulated 3D Objects","summary":" Despite much progress in large 3D datasets there are currently few\ninteractive 3D object datasets, and their scale is limited due to the manual\neffort required in their construction. We introduce the static to openable\n(S2O) task which creates interactive articulated 3D objects from static\ncounterparts through openable part detection, motion prediction, and interior\ngeometry completion. We formulate a unified framework to tackle this task, and\ncurate a challenging dataset of openable 3D objects that serves as a test bed\nfor systematic evaluation. Our experiments benchmark methods from prior work\nand simple yet effective heuristics for the S2O task. We find that turning\nstatic 3D objects into interactively openable counterparts is possible but that\nall methods struggle to generalize to realistic settings of the task, and we\nhighlight promising future work directions.\n","authors":["Denys Iliash","Hanxiao Jiang","Yiming Zhang","Manolis Savva","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2409.18896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00712v4","updated":"2024-09-27T16:27:55Z","published":"2024-02-01T16:07:12Z","title":"ChaosBench: A Multi-Channel, Physics-Based Benchmark for\n Subseasonal-to-Seasonal Climate Prediction","summary":" Accurate prediction of climate in the subseasonal-to-seasonal scale is\ncrucial for disaster preparedness and robust decision making amidst climate\nchange. Yet, forecasting beyond the weather timescale is challenging because it\ndeals with problems other than initial condition, including boundary\ninteraction, butterfly effect, and our inherent lack of physical understanding.\nAt present, existing benchmarks tend to have shorter forecasting range of up-to\n15 days, do not include a wide range of operational baselines, and lack\nphysics-based constraints for explainability. Thus, we propose ChaosBench, a\nchallenging benchmark to extend the predictability range of data-driven weather\nemulators to S2S timescale. First, ChaosBench is comprised of variables beyond\nthe typical surface-atmospheric ERA5 to also include ocean, ice, and land\nreanalysis products that span over 45 years to allow for full Earth system\nemulation that respects boundary conditions. We also propose physics-based, in\naddition to deterministic and probabilistic metrics, to ensure a\nphysically-consistent ensemble that accounts for butterfly effect. Furthermore,\nwe evaluate on a diverse set of physics-based forecasts from four national\nweather agencies as baselines to our data-driven counterpart such as\nViT/ClimaX, PanguWeather, GraphCast, and FourCastNetV2. Overall, we find\nmethods originally developed for weather-scale applications fail on S2S task:\ntheir performance simply collapse to an unskilled climatology. Nonetheless, we\noutline and demonstrate several strategies that can extend the predictability\nrange of existing weather emulators, including the use of ensembles, robust\ncontrol of error propagation, and the use of physics-informed models. Our\nbenchmark, datasets, and instructions are available at\nhttps://leap-stc.github.io/ChaosBench.\n","authors":["Juan Nathaniel","Yongquan Qu","Tung Nguyen","Sungduk Yu","Julius Busecke","Aditya Grover","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2402.00712v4.pdf","comment":"Accepted as Oral in NeurIPS'24 D&B Track"},{"id":"http://arxiv.org/abs/2409.18881v1","updated":"2024-09-27T16:18:13Z","published":"2024-09-27T16:18:13Z","title":"Explainable Artifacts for Synthetic Western Blot Source Attribution","summary":" Recent advancements in artificial intelligence have enabled generative models\nto produce synthetic scientific images that are indistinguishable from pristine\nones, posing a challenge even for expert scientists habituated to working with\nsuch content. When exploited by organizations known as paper mills, which\nsystematically generate fraudulent articles, these technologies can\nsignificantly contribute to the spread of misinformation about ungrounded\nscience, potentially undermining trust in scientific research. While previous\nstudies have explored black-box solutions, such as Convolutional Neural\nNetworks, for identifying synthetic content, only some have addressed the\nchallenge of generalizing across different models and providing insight into\nthe artifacts in synthetic images that inform the detection process. This study\naims to identify explainable artifacts generated by state-of-the-art generative\nmodels (e.g., Generative Adversarial Networks and Diffusion Models) and\nleverage them for open-set identification and source attribution (i.e.,\npointing to the model that created the image).\n","authors":["João Phillipe Cardenuto","Sara Mandelli","Daniel Moreira","Paolo Bestagini","Edward Delp","Anderson Rocha"],"pdf_url":"https://arxiv.org/pdf/2409.18881v1.pdf","comment":"Accepted in IEEE International Workshop on Information Forensics and\n Security - WIFS 2024, Rome, Italy"},{"id":"http://arxiv.org/abs/2409.18877v1","updated":"2024-09-27T16:12:51Z","published":"2024-09-27T16:12:51Z","title":"UniEmoX: Cross-modal Semantic-Guided Large-Scale Pretraining for\n Universal Scene Emotion Perception","summary":" Visual emotion analysis holds significant research value in both computer\nvision and psychology. However, existing methods for visual emotion analysis\nsuffer from limited generalizability due to the ambiguity of emotion perception\nand the diversity of data scenarios. To tackle this issue, we introduce\nUniEmoX, a cross-modal semantic-guided large-scale pretraining framework.\nInspired by psychological research emphasizing the inseparability of the\nemotional exploration process from the interaction between individuals and\ntheir environment, UniEmoX integrates scene-centric and person-centric\nlow-level image spatial structural information, aiming to derive more nuanced\nand discriminative emotional representations. By exploiting the similarity\nbetween paired and unpaired image-text samples, UniEmoX distills rich semantic\nknowledge from the CLIP model to enhance emotional embedding representations\nmore effectively. To the best of our knowledge, this is the first large-scale\npretraining framework that integrates psychological theories with contemporary\ncontrastive learning and masked image modeling techniques for emotion analysis\nacross diverse scenarios. Additionally, we develop a visual emotional dataset\ntitled Emo8. Emo8 samples cover a range of domains, including cartoon, natural,\nrealistic, science fiction and advertising cover styles, covering nearly all\ncommon emotional scenes. Comprehensive experiments conducted on six benchmark\ndatasets across two downstream tasks validate the effectiveness of UniEmoX. The\nsource code is available at https://github.com/chincharles/u-emo.\n","authors":["Chuang Chen","Xiao Sun","Zhi Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18877v1.pdf","comment":"Submitted to TIP"},{"id":"http://arxiv.org/abs/2409.18876v1","updated":"2024-09-27T16:11:30Z","published":"2024-09-27T16:11:30Z","title":"CemiFace: Center-based Semi-hard Synthetic Face Generation for Face\n Recognition","summary":" Privacy issue is a main concern in developing face recognition techniques.\nAlthough synthetic face images can partially mitigate potential legal risks\nwhile maintaining effective face recognition (FR) performance, FR models\ntrained by face images synthesized by existing generative approaches frequently\nsuffer from performance degradation problems due to the insufficient\ndiscriminative quality of these synthesized samples. In this paper, we\nsystematically investigate what contributes to solid face recognition model\ntraining, and reveal that face images with certain degree of similarities to\ntheir identity centers show great effectiveness in the performance of trained\nFR models. Inspired by this, we propose a novel diffusion-based approach\n(namely Center-based Semi-hard Synthetic Face Generation (CemiFace)) which\nproduces facial samples with various levels of similarity to the subject\ncenter, thus allowing to generate face datasets containing effective\ndiscriminative samples for training face recognition. Experimental results show\nthat with a modest degree of similarity, training on the generated dataset can\nproduce competitive performance compared to previous generation methods.\n","authors":["Zhonglin Sun","Siyang Song","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2409.18876v1.pdf","comment":"accepted to NeurIPS 2024. We are preparing the camera-ready version\n according to the reviews"},{"id":"http://arxiv.org/abs/2409.18872v1","updated":"2024-09-27T16:08:52Z","published":"2024-09-27T16:08:52Z","title":"Simulating Dynamic Tumor Contrast Enhancement in Breast MRI using\n Conditional Generative Adversarial Networks","summary":" This paper presents a method for virtual contrast enhancement in breast MRI,\noffering a promising non-invasive alternative to traditional contrast\nagent-based DCE-MRI acquisition. Using a conditional generative adversarial\nnetwork, we predict DCE-MRI images, including jointly-generated sequences of\nmultiple corresponding DCE-MRI timepoints, from non-contrast-enhanced MRIs,\nenabling tumor localization and characterization without the associated health\nrisks. Furthermore, we qualitatively and quantitatively evaluate the synthetic\nDCE-MRI images, proposing a multi-metric Scaled Aggregate Measure (SAMe),\nassessing their utility in a tumor segmentation downstream task, and conclude\nwith an analysis of the temporal patterns in multi-sequence DCE-MRI generation.\nOur approach demonstrates promising results in generating realistic and useful\nDCE-MRI sequences, highlighting the potential of virtual contrast enhancement\nfor improving breast cancer diagnosis and treatment, particularly for patients\nwhere contrast agent administration is contraindicated.\n","authors":["Richard Osuala","Smriti Joshi","Apostolia Tsirikoglou","Lidia Garrucho","Walter H. L. Pinaya","Daniel M. Lang","Julia A. Schnabel","Oliver Diaz","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2409.18872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18869v1","updated":"2024-09-27T16:06:11Z","published":"2024-09-27T16:06:11Z","title":"Emu3: Next-Token Prediction is All You Need","summary":" While next-token prediction is considered a promising path towards artificial\ngeneral intelligence, it has struggled to excel in multimodal tasks, which are\nstill dominated by diffusion models (e.g., Stable Diffusion) and compositional\napproaches (e.g., CLIP combined with LLMs). In this paper, we introduce Emu3, a\nnew suite of state-of-the-art multimodal models trained solely with next-token\nprediction. By tokenizing images, text, and videos into a discrete space, we\ntrain a single transformer from scratch on a mixture of multimodal sequences.\nEmu3 outperforms several well-established task-specific models in both\ngeneration and perception tasks, surpassing flagship models such as SDXL and\nLLaVA-1.6, while eliminating the need for diffusion or compositional\narchitectures. Emu3 is also capable of generating high-fidelity video via\npredicting the next token in a video sequence. We simplify complex multimodal\nmodel designs by converging on a singular focus: tokens, unlocking great\npotential for scaling both during training and inference. Our results\ndemonstrate that next-token prediction is a promising path towards building\ngeneral multimodal intelligence beyond language. We open-source key techniques\nand models to support further research in this direction.\n","authors":["Xinlong Wang","Xiaosong Zhang","Zhengxiong Luo","Quan Sun","Yufeng Cui","Jinsheng Wang","Fan Zhang","Yueze Wang","Zhen Li","Qiying Yu","Yingli Zhao","Yulong Ao","Xuebin Min","Tao Li","Boya Wu","Bo Zhao","Bowen Zhang","Liangdong Wang","Guang Liu","Zheqi He","Xi Yang","Jingjing Liu","Yonghua Lin","Tiejun Huang","Zhongyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18869v1.pdf","comment":"Project Page: https://emu.baai.ac.cn"},{"id":"http://arxiv.org/abs/2409.18866v1","updated":"2024-09-27T16:02:56Z","published":"2024-09-27T16:02:56Z","title":"MCUBench: A Benchmark of Tiny Object Detectors on MCUs","summary":" We introduce MCUBench, a benchmark featuring over 100 YOLO-based object\ndetection models evaluated on the VOC dataset across seven different MCUs. This\nbenchmark provides detailed data on average precision, latency, RAM, and Flash\nusage for various input resolutions and YOLO-based one-stage detectors. By\nconducting a controlled comparison with a fixed training pipeline, we collect\ncomprehensive performance metrics. Our Pareto-optimal analysis shows that\nintegrating modern detection heads and training techniques allows various YOLO\narchitectures, including legacy models like YOLOv3, to achieve a highly\nefficient tradeoff between mean Average Precision (mAP) and latency. MCUBench\nserves as a valuable tool for benchmarking the MCU performance of contemporary\nobject detectors and aids in model selection based on specific constraints.\n","authors":["Sudhakar Sah","Darshan C. Ganji","Matteo Grimaldi","Ravish Kumar","Alexander Hoffman","Honnesh Rohmetra","Ehsan Saboori"],"pdf_url":"https://arxiv.org/pdf/2409.18866v1.pdf","comment":"Code and data are available at\n https://github.com/Deeplite/deeplite-torch-zoo"},{"id":"http://arxiv.org/abs/2409.18865v1","updated":"2024-09-27T16:02:12Z","published":"2024-09-27T16:02:12Z","title":"Positional Encoder Graph Quantile Neural Networks for Geographic Data","summary":" Positional Encoder Graph Neural Networks (PE-GNNs) are a leading approach for\nmodeling continuous spatial data. However, they often fail to produce\ncalibrated predictive distributions, limiting their effectiveness for\nuncertainty quantification. We introduce the Positional Encoder Graph Quantile\nNeural Network (PE-GQNN), a novel method that integrates PE-GNNs, Quantile\nNeural Networks, and recalibration techniques in a fully nonparametric\nframework, requiring minimal assumptions about the predictive distributions. We\npropose a new network architecture that, when combined with a quantile-based\nloss function, yields accurate and reliable probabilistic models without\nincreasing computational complexity. Our approach provides a flexible, robust\nframework for conditional density estimation, applicable beyond spatial data\ncontexts. We further introduce a structured method for incorporating a KNN\npredictor into the model while avoiding data leakage through the GNN layer\noperation. Experiments on benchmark datasets demonstrate that PE-GQNN\nsignificantly outperforms existing state-of-the-art methods in both predictive\naccuracy and uncertainty quantification.\n","authors":["William E. R. de Amorim","Scott A. Sisson","T. Rodrigues","David J. Nott","Guilherme S. Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2409.18865v1.pdf","comment":"17 main text pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18860v1","updated":"2024-09-27T15:55:13Z","published":"2024-09-27T15:55:13Z","title":"LW2G: Learning Whether to Grow for Prompt-based Continual Learning","summary":" Continual Learning (CL) aims to learn in non-stationary scenarios,\nprogressively acquiring and maintaining knowledge from sequential tasks. Recent\nPrompt-based Continual Learning (PCL) has achieved remarkable performance with\nPre-Trained Models (PTMs). These approaches grow a prompt sets pool by adding a\nnew set of prompts when learning each new task (\\emph{prompt learning}) and\nadopt a matching mechanism to select the correct set for each testing sample\n(\\emph{prompt retrieval}). Previous studies focus on the latter stage by\nimproving the matching mechanism to enhance Prompt Retrieval Accuracy (PRA). To\npromote cross-task knowledge facilitation and form an effective and efficient\nprompt sets pool, we propose a plug-in module in the former stage to\n\\textbf{Learn Whether to Grow (LW2G)} based on the disparities between tasks.\nSpecifically, a shared set of prompts is utilized when several tasks share\ncertain commonalities, and a new set is added when there are significant\ndifferences between the new task and previous tasks. Inspired by Gradient\nProjection Continual Learning, our LW2G develops a metric called Hinder Forward\nCapability (HFC) to measure the hindrance imposed on learning new tasks by\nsurgically modifying the original gradient onto the orthogonal complement of\nthe old feature space. With HFC, an automated scheme Dynamic Growing Approach\nadaptively learns whether to grow with a dynamic threshold. Furthermore, we\ndesign a gradient-based constraint to ensure the consistency between the\nupdating prompts and pre-trained knowledge, and a prompts weights reusing\nstrategy to enhance forward transfer. Extensive experiments show the\neffectiveness of our method. The source codes are available at\n\\url{https://github.com/RAIAN08/LW2G}.\n","authors":["Qian Feng","Dawei Zhou","Hanbin Zhao","Chao Zhang","Hui Qian"],"pdf_url":"https://arxiv.org/pdf/2409.18860v1.pdf","comment":"submit to neurips2024"},{"id":"http://arxiv.org/abs/2409.18852v1","updated":"2024-09-27T15:50:36Z","published":"2024-09-27T15:50:36Z","title":"Space-time 2D Gaussian Splatting for Accurate Surface Reconstruction\n under Complex Dynamic Scenes","summary":" Previous surface reconstruction methods either suffer from low geometric\naccuracy or lengthy training times when dealing with real-world complex dynamic\nscenes involving multi-person activities, and human-object interactions. To\ntackle the dynamic contents and the occlusions in complex scenes, we present a\nspace-time 2D Gaussian Splatting approach. Specifically, to improve geometric\nquality in dynamic scenes, we learn canonical 2D Gaussian splats and deform\nthese 2D Gaussian splats while enforcing the disks of the Gaussian located on\nthe surface of the objects by introducing depth and normal regularizers.\nFurther, to tackle the occlusion issues in complex scenes, we introduce a\ncompositional opacity deformation strategy, which further reduces the surface\nrecovery of those occluded areas. Experiments on real-world sparse-view video\ndatasets and monocular dynamic datasets demonstrate that our reconstructions\noutperform state-of-the-art methods, especially for the surface of the details.\nThe project page and more visualizations can be found at:\nhttps://tb2-sy.github.io/st-2dgs/.\n","authors":["Shuo Wang","Binbin Huang","Ruoyu Wang","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2409.18852v1.pdf","comment":"Project page: https://tb2-sy.github.io/st-2dgs/"},{"id":"http://arxiv.org/abs/2409.13550v2","updated":"2024-09-27T15:41:33Z","published":"2024-09-20T14:49:21Z","title":"A preliminary study on continual learning in computer vision using\n Kolmogorov-Arnold Networks","summary":" Deep learning has long been dominated by multi-layer perceptrons (MLPs),\nwhich have demonstrated superiority over other optimizable models in various\ndomains. Recently, a new alternative to MLPs has emerged - Kolmogorov-Arnold\nNetworks (KAN)- which are based on a fundamentally different mathematical\nframework. According to their authors, KANs address several major issues in\nMLPs, such as catastrophic forgetting in continual learning scenarios. However,\nthis claim has only been supported by results from a regression task on a toy\n1D dataset. In this paper, we extend the investigation by evaluating the\nperformance of KANs in continual learning tasks within computer vision,\nspecifically using the MNIST datasets. To this end, we conduct a structured\nanalysis of the behavior of MLPs and two KAN-based models in a\nclass-incremental learning scenario, ensuring that the architectures involved\nhave the same number of trainable parameters. Our results demonstrate that an\nefficient version of KAN outperforms both traditional MLPs and the original KAN\nimplementation. We further analyze the influence of hyperparameters in MLPs and\nKANs, as well as the impact of certain trainable parameters in KANs, such as\nbias and scale weights. Additionally, we provide a preliminary investigation of\nrecent KAN-based convolutional networks and compare their performance with that\nof traditional convolutional neural networks. Our codes can be found at\nhttps://github.com/MrPio/KAN-Continual_Learning_tests.\n","authors":["Alessandro Cacciatore","Valerio Morelli","Federica Paganica","Emanuele Frontoni","Lucia Migliorelli","Daniele Berardini"],"pdf_url":"https://arxiv.org/pdf/2409.13550v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18839v1","updated":"2024-09-27T15:35:15Z","published":"2024-09-27T15:35:15Z","title":"MinerU: An Open-Source Solution for Precise Document Content Extraction","summary":" Document content analysis has been a crucial research area in computer\nvision. Despite significant advancements in methods such as OCR, layout\ndetection, and formula recognition, existing open-source solutions struggle to\nconsistently deliver high-quality content extraction due to the diversity in\ndocument types and content. To address these challenges, we present MinerU, an\nopen-source solution for high-precision document content extraction. MinerU\nleverages the sophisticated PDF-Extract-Kit models to extract content from\ndiverse documents effectively and employs finely-tuned preprocessing and\npostprocessing rules to ensure the accuracy of the final results. Experimental\nresults demonstrate that MinerU consistently achieves high performance across\nvarious document types, significantly enhancing the quality and consistency of\ncontent extraction. The MinerU open-source project is available at\nhttps://github.com/opendatalab/MinerU.\n","authors":["Bin Wang","Chao Xu","Xiaomeng Zhao","Linke Ouyang","Fan Wu","Zhiyuan Zhao","Rui Xu","Kaiwen Liu","Yuan Qu","Fukai Shang","Bo Zhang","Liqun Wei","Zhihao Sui","Wei Li","Botian Shi","Yu Qiao","Dahua Lin","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2409.18839v1.pdf","comment":"MinerU Technical Report"},{"id":"http://arxiv.org/abs/2409.18832v1","updated":"2024-09-27T15:27:04Z","published":"2024-09-27T15:27:04Z","title":"Classification and regression of trajectories rendered as images via 2D\n Convolutional Neural Networks","summary":" Trajectories can be regarded as time-series of coordinates, typically arising\nfrom motile objects. Methods for trajectory classification are particularly\nimportant to detect different movement patterns, while methods for regression\nto compute motility metrics and forecasting. Recent advances in computer vision\nhave facilitated the processing of trajectories rendered as images via\nartificial neural networks with 2d convolutional layers (CNNs). This approach\nleverages the capability of CNNs to learn spatial hierarchies of features from\nimages, necessary to recognize complex shapes. Moreover, it overcomes the\nlimitation of other machine learning methods that require input trajectories\nwith a fixed number of points. However, rendering trajectories as images can\nintroduce poorly investigated artifacts such as information loss due to the\nplotting of coordinates on a discrete grid, and spectral changes due to line\nthickness and aliasing. In this study, we investigate the effectiveness of CNNs\nfor solving classification and regression problems from synthetic trajectories\nthat have been rendered as images using different modalities. The parameters\nconsidered in this study include line thickness, image resolution, usage of\nmotion history (color-coding of the temporal component) and anti-aliasing.\nResults highlight the importance of choosing an appropriate image resolution\naccording to model depth and motion history in applications where movement\ndirection is critical.\n","authors":["Mariaclaudia Nicolai","Raffaella Fiamma Cabini","Diego Ulisse Pizzagalli"],"pdf_url":"https://arxiv.org/pdf/2409.18832v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18826v1","updated":"2024-09-27T15:19:51Z","published":"2024-09-27T15:19:51Z","title":"YOLOv8-ResCBAM: YOLOv8 Based on An Effective Attention Module for\n Pediatric Wrist Fracture Detection","summary":" Wrist trauma and even fractures occur frequently in daily life, particularly\namong children who account for a significant proportion of fracture cases.\nBefore performing surgery, surgeons often request patients to undergo X-ray\nimaging first, and prepare for the surgery based on the analysis of the X-ray\nimages. With the development of neural networks, You Only Look Once (YOLO)\nseries models have been widely used in fracture detection for Computer-Assisted\nDiagnosis, where the YOLOv8 model has obtained the satisfactory results.\nApplying the attention modules to neural networks is one of the effective\nmethods to improve the model performance. This paper proposes YOLOv8-ResCBAM,\nwhich incorporates Convolutional Block Attention Module integrated with\nresblock (ResCBAM) into the original YOLOv8 network architecture. The\nexperimental results on the GRAZPEDWRI-DX dataset demonstrate that the mean\nAverage Precision calculated at Intersection over Union threshold of 0.5 (mAP\n50) of the proposed model increased from 63.6% of the original YOLOv8 model to\n65.8%, which achieves the state-of-the-art performance. The implementation code\nis available at\nhttps://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8.\n","authors":["Rui-Yang Ju","Chun-Tse Chien","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2409.18826v1.pdf","comment":"Accepted by ICONIP 2024. arXiv admin note: substantial text overlap\n with arXiv:2402.09329"},{"id":"http://arxiv.org/abs/2409.18814v1","updated":"2024-09-27T15:07:26Z","published":"2024-09-27T15:07:26Z","title":"Early diagnosis of Alzheimer's disease from MRI images with deep\n learning model","summary":" It is acknowledged that the most common cause of dementia worldwide is\nAlzheimer's disease (AD). This condition progresses in severity from mild to\nsevere and interferes with people's everyday routines. Early diagnosis plays a\ncritical role in patient care and clinical trials. Convolutional neural\nnetworks (CNN) are used to create a framework for identifying specific disease\nfeatures from MRI scans Classification of dementia involves approaches such as\nmedical history review, neuropsychological tests, and magnetic resonance\nimaging (MRI). However, the image dataset obtained from Kaggle faces a\nsignificant issue of class imbalance, which requires equal distribution of\nsamples from each class to address. In this article, to address this imbalance,\nthe Synthetic Minority Oversampling Technique (SMOTE) is utilized. Furthermore,\na pre-trained convolutional neural network has been applied to the DEMNET\ndementia network to extract key features from AD images. The proposed model\nachieved an impressive accuracy of 98.67%.\n","authors":["Sajjad Aghasi Javid","Mahmood Mohassel Feghhi"],"pdf_url":"https://arxiv.org/pdf/2409.18814v1.pdf","comment":"7 pages, 3 figures, Presented at the 20-th CSI International\n Symposium on Artificial Intelligence and Signal Processing (AISP) 21-22\n February, 2024, Mazandaran University of Science and Technology, Babol, Iran"},{"id":"http://arxiv.org/abs/2409.18813v1","updated":"2024-09-27T15:06:05Z","published":"2024-09-27T15:06:05Z","title":"EyeTrAES: Fine-grained, Low-Latency Eye Tracking via Adaptive Event\n Slicing","summary":" Eye-tracking technology has gained significant attention in recent years due\nto its wide range of applications in human-computer interaction, virtual and\naugmented reality, and wearable health. Traditional RGB camera-based\neye-tracking systems often struggle with poor temporal resolution and\ncomputational constraints, limiting their effectiveness in capturing rapid eye\nmovements. To address these limitations, we propose EyeTrAES, a novel approach\nusing neuromorphic event cameras for high-fidelity tracking of natural\npupillary movement that shows significant kinematic variance. One of EyeTrAES's\nhighlights is the use of a novel adaptive windowing/slicing algorithm that\nensures just the right amount of descriptive asynchronous event data\naccumulation within an event frame, across a wide range of eye movement\npatterns. EyeTrAES then applies lightweight image processing functions over\naccumulated event frames from just a single eye to perform pupil segmentation\nand tracking. We show that these methods boost pupil tracking fidelity by 6+%,\nachieving IoU~=92%, while incurring at least 3x lower latency than competing\npure event-based eye tracking alternatives [38]. We additionally demonstrate\nthat the microscopic pupillary motion captured by EyeTrAES exhibits distinctive\nvariations across individuals and can thus serve as a biometric fingerprint.\nFor robust user authentication, we train a lightweight per-user Random Forest\nclassifier using a novel feature vector of short-term pupillary kinematics,\ncomprising a sliding window of pupil (location, velocity, acceleration)\ntriples. Experimental studies with two different datasets demonstrate that the\nEyeTrAES-based authentication technique can simultaneously achieve high\nauthentication accuracy (~=0.82) and low processing latency (~=12ms), and\nsignificantly outperform multiple state-of-the-art competitive baselines.\n","authors":["Argha Sen","Nuwan Bandara","Ila Gokarn","Thivya Kandappu","Archan Misra"],"pdf_url":"https://arxiv.org/pdf/2409.18813v1.pdf","comment":"32 pages,15 figures,"},{"id":"http://arxiv.org/abs/2409.15546v2","updated":"2024-09-27T15:02:10Z","published":"2024-09-23T21:02:21Z","title":"A Novel Framework for the Automated Characterization of Gram-Stained\n Blood Culture Slides Using a Large-Scale Vision Transformer","summary":" This study introduces a new framework for the artificial\nintelligence-assisted characterization of Gram-stained whole-slide images\n(WSIs). As a test for the diagnosis of bloodstream infections, Gram stains\nprovide critical early data to inform patient treatment. Rapid and reliable\nanalysis of Gram stains has been shown to be positively associated with better\nclinical outcomes, underscoring the need for improved tools to automate Gram\nstain analysis. In this work, we developed a novel transformer-based model for\nGram-stained WSI classification, which is more scalable to large datasets than\nprevious convolutional neural network (CNN) -based methods as it does not\nrequire patch-level manual annotations. We also introduce a large Gram stain\ndataset from Dartmouth-Hitchcock Medical Center (Lebanon, New Hampshire, USA)\nto evaluate our model, exploring the classification of five major categories of\nGram-stained WSIs: Gram-positive cocci in clusters, Gram-positive cocci in\npairs/chains, Gram-positive rods, Gram-negative rods, and slides with no\nbacteria. Our model achieves a classification accuracy of 0.858 (95% CI: 0.805,\n0.905) and an AUC of 0.952 (95% CI: 0.922, 0.976) using five-fold nested\ncross-validation on our 475-slide dataset, demonstrating the potential of\nlarge-scale transformer models for Gram stain classification. We further\ndemonstrate the generalizability of our trained model, which achieves strong\nperformance on external datasets without additional fine-tuning.\n","authors":["Jack McMahon","Naofumi Tomita","Elizabeth S. Tatishev","Adrienne A. Workman","Cristina R Costales","Niaz Banaei","Isabella W. Martin","Saeed Hassanpour"],"pdf_url":"https://arxiv.org/pdf/2409.15546v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18800v1","updated":"2024-09-27T14:54:54Z","published":"2024-09-27T14:54:54Z","title":"MiniVLN: Efficient Vision-and-Language Navigation by Progressive\n Knowledge Distillation","summary":" In recent years, Embodied Artificial Intelligence (Embodied AI) has advanced\nrapidly, yet the increasing size of models conflicts with the limited\ncomputational capabilities of Embodied AI platforms. To address this challenge,\nwe aim to achieve both high model performance and practical deployability.\nSpecifically, we focus on Vision-and-Language Navigation (VLN), a core task in\nEmbodied AI. This paper introduces a two-stage knowledge distillation\nframework, producing a student model, MiniVLN, and showcasing the significant\npotential of distillation techniques in developing lightweight models. The\nproposed method aims to capture fine-grained knowledge during the pretraining\nphase and navigation-specific knowledge during the fine-tuning phase. Our\nfindings indicate that the two-stage distillation approach is more effective in\nnarrowing the performance gap between the teacher model and the student model\ncompared to single-stage distillation. On the public R2R and REVERIE\nbenchmarks, MiniVLN achieves performance on par with the teacher model while\nhaving only about 12% of the teacher model's parameter count.\n","authors":["Junyou Zhu","Yanyuan Qiao","Siqi Zhang","Xingjian He","Qi Wu","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10494v4","updated":"2024-09-27T14:50:23Z","published":"2023-02-21T07:48:34Z","title":"The Role of Masking for Efficient Supervised Knowledge Distillation of\n Vision Transformers","summary":" Knowledge distillation is an effective method for training lightweight vision\nmodels. However, acquiring teacher supervision for training samples is often\ncostly, especially from large-scale models like vision transformers (ViTs). In\nthis paper, we develop a simple framework to reduce the supervision cost of ViT\ndistillation: masking out a fraction of input tokens given to the teacher. By\nmasking input tokens, one can skip the computations associated with the masked\ntokens without requiring any change to teacher parameters or architecture. We\nfind that masking patches with the lowest student attention scores is highly\neffective, saving up to 50% of teacher FLOPs without any drop in student\naccuracy, while other masking criterion leads to suboptimal efficiency gains.\nThrough in-depth analyses, we reveal that the student-guided masking provides a\ngood curriculum to the student, making teacher supervision easier to follow\nduring the early stage and challenging in the later stage.\n","authors":["Seungwoo Son","Jegwang Ryu","Namhoon Lee","Jaeho Lee"],"pdf_url":"https://arxiv.org/pdf/2302.10494v4.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.18794v1","updated":"2024-09-27T14:47:18Z","published":"2024-09-27T14:47:18Z","title":"Open-Nav: Exploring Zero-Shot Vision-and-Language Navigation in\n Continuous Environment with Open-Source LLMs","summary":" Vision-and-Language Navigation (VLN) tasks require an agent to follow textual\ninstructions to navigate through 3D environments. Traditional approaches use\nsupervised learning methods, relying heavily on domain-specific datasets to\ntrain VLN models. Recent methods try to utilize closed-source large language\nmodels (LLMs) like GPT-4 to solve VLN tasks in zero-shot manners, but face\nchallenges related to expensive token costs and potential data breaches in\nreal-world applications. In this work, we introduce Open-Nav, a novel study\nthat explores open-source LLMs for zero-shot VLN in the continuous environment.\nOpen-Nav employs a spatial-temporal chain-of-thought (CoT) reasoning approach\nto break down tasks into instruction comprehension, progress estimation, and\ndecision-making. It enhances scene perceptions with fine-grained object and\nspatial knowledge to improve LLM's reasoning in navigation. Our extensive\nexperiments in both simulated and real-world environments demonstrate that\nOpen-Nav achieves competitive performance compared to using closed-source LLMs.\n","authors":["Yanyuan Qiao","Wenqi Lyu","Hui Wang","Zixu Wang","Zerui Li","Yuan Zhang","Mingkui Tan","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2409.18794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18788v1","updated":"2024-09-27T14:36:20Z","published":"2024-09-27T14:36:20Z","title":"Excavating in the Wild: The GOOSE-Ex Dataset for Semantic Segmentation","summary":" The successful deployment of deep learning-based techniques for autonomous\nsystems is highly dependent on the data availability for the respective system\nin its deployment environment. Especially for unstructured outdoor\nenvironments, very few datasets exist for even fewer robotic platforms and\nscenarios. In an earlier work, we presented the German Outdoor and Offroad\nDataset (GOOSE) framework along with 10000 multimodal frames from an offroad\nvehicle to enhance the perception capabilities in unstructured environments. In\nthis work, we address the generalizability of the GOOSE framework. To\naccomplish this, we open-source the GOOSE-Ex dataset, which contains additional\n5000 labeled multimodal frames from various completely different environments,\nrecorded on a robotic excavator and a quadruped platform. We perform a\ncomprehensive analysis of the semantic segmentation performance on different\nplatforms and sensor modalities in unseen environments. In addition, we\ndemonstrate how the combined datasets can be utilized for different downstream\napplications or competitions such as offroad navigation, object manipulation or\nscene completion. The dataset, its platform documentation and pre-trained\nstate-of-the-art models for offroad perception will be made available on\nhttps://goose-dataset.de/.\n \\\n","authors":["Raphael Hagmanns","Peter Mortimer","Miguel Granero","Thorsten Luettel","Janko Petereit"],"pdf_url":"https://arxiv.org/pdf/2409.18788v1.pdf","comment":"Submitted to IEEE for review"},{"id":"http://arxiv.org/abs/2409.18785v1","updated":"2024-09-27T14:34:08Z","published":"2024-09-27T14:34:08Z","title":"Student-Oriented Teacher Knowledge Refinement for Knowledge Distillation","summary":" Knowledge distillation has become widely recognized for its ability to\ntransfer knowledge from a large teacher network to a compact and more\nstreamlined student network. Traditional knowledge distillation methods\nprimarily follow a teacher-oriented paradigm that imposes the task of learning\nthe teacher's complex knowledge onto the student network. However, significant\ndisparities in model capacity and architectural design hinder the student's\ncomprehension of the complex knowledge imparted by the teacher, resulting in\nsub-optimal performance. This paper introduces a novel perspective emphasizing\nstudent-oriented and refining the teacher's knowledge to better align with the\nstudent's needs, thereby improving knowledge transfer effectiveness.\nSpecifically, we present the Student-Oriented Knowledge Distillation (SoKD),\nwhich incorporates a learnable feature augmentation strategy during training to\nrefine the teacher's knowledge of the student dynamically. Furthermore, we\ndeploy the Distinctive Area Detection Module (DAM) to identify areas of mutual\ninterest between the teacher and student, concentrating knowledge transfer\nwithin these critical areas to avoid transferring irrelevant information. This\ncustomized module ensures a more focused and effective knowledge distillation\nprocess. Our approach, functioning as a plug-in, could be integrated with\nvarious knowledge distillation methods. Extensive experimental results\ndemonstrate the efficacy and generalizability of our method.\n","authors":["Chaomin Shen","Yaomin Huang","Haokun Zhu","Jinsong Fan","Guixu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18783v1","updated":"2024-09-27T14:30:24Z","published":"2024-09-27T14:30:24Z","title":"DualDn: Dual-domain Denoising via Differentiable ISP","summary":" Image denoising is a critical component in a camera's Image Signal Processing\n(ISP) pipeline. There are two typical ways to inject a denoiser into the ISP\npipeline: applying a denoiser directly to captured raw frames (raw domain) or\nto the ISP's output sRGB images (sRGB domain). However, both approaches have\ntheir limitations. Residual noise from raw-domain denoising can be amplified by\nthe subsequent ISP processing, and the sRGB domain struggles to handle\nspatially varying noise since it only sees noise distorted by the ISP.\nConsequently, most raw or sRGB domain denoising works only for specific noise\ndistributions and ISP configurations. To address these challenges, we propose\nDualDn, a novel learning-based dual-domain denoising. Unlike previous\nsingle-domain denoising, DualDn consists of two denoising networks: one in the\nraw domain and one in the sRGB domain. The raw domain denoising adapts to\nsensor-specific noise as well as spatially varying noise levels, while the sRGB\ndomain denoising adapts to ISP variations and removes residual noise amplified\nby the ISP. Both denoising networks are connected with a differentiable ISP,\nwhich is trained end-to-end and discarded during the inference stage. With this\ndesign, DualDn achieves greater generalizability compared to most\nlearning-based denoising methods, as it can adapt to different unseen noises,\nISP parameters, and even novel ISP pipelines. Experiments show that DualDn\nachieves state-of-the-art performance and can adapt to different denoising\narchitectures. Moreover, DualDn can be used as a plug-and-play denoising module\nwith real cameras without retraining, and still demonstrate better performance\nthan commercial on-camera denoising. The project website is available at:\nhttps://openimaginglab.github.io/DualDn/\n","authors":["Ruikang Li","Yujin Wang","Shiqi Chen","Fan Zhang","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.18783v1.pdf","comment":"Accepted at ECCV 2024, Project page:\n https://openimaginglab.github.io/DualDn/"},{"id":"http://arxiv.org/abs/2403.11876v2","updated":"2024-09-27T14:18:56Z","published":"2024-03-18T15:28:35Z","title":"Deep Bayesian Future Fusion for Self-Supervised, High-Resolution,\n Off-Road Mapping","summary":" High-speed off-road navigation requires long-range, high-resolution maps to\nenable robots to safely navigate over different surfaces while avoiding\ndangerous obstacles. However, due to limited computational power and sensing\nnoise, most approaches to off-road mapping focus on producing coarse (20-40cm)\nmaps of the environment. In this paper, we propose Future Fusion, a framework\ncapable of generating dense, high-resolution maps from sparse sensing data (30m\nforward at 2cm). This is accomplished by - (1) the efficient realization of the\nwell-known Bayes filtering within the standard deep learning models that\nexplicitly accounts for the sparsity pattern in stereo and LiDAR depth data,\nand (2) leveraging perceptual losses common in generative image completion. The\nproposed methodology outperforms the conventional baselines. Moreover, the\nlearned features and the completed dense maps lead to improvements in the\ndownstream navigation task.\n","authors":["Shubhra Aich","Wenshan Wang","Parv Maheshwari","Matthew Sivaprakasam","Samuel Triest","Cherie Ho","Jason M. Gregory","John G. Rogers III","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2403.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18770v1","updated":"2024-09-27T14:15:02Z","published":"2024-09-27T14:15:02Z","title":"Relighting from a Single Image: Datasets and Deep Intrinsic-based\n Architecture","summary":" Single image scene relighting aims to generate a realistic new version of an\ninput image so that it appears to be illuminated by a new target light\ncondition. Although existing works have explored this problem from various\nperspectives, generating relit images under arbitrary light conditions remains\nhighly challenging, and related datasets are scarce. Our work addresses this\nproblem from both the dataset and methodological perspectives. We propose two\nnew datasets: a synthetic dataset with the ground truth of intrinsic components\nand a real dataset collected under laboratory conditions. These datasets\nalleviate the scarcity of existing datasets. To incorporate physical\nconsistency in the relighting pipeline, we establish a two-stage network based\non intrinsic decomposition, giving outputs at intermediate steps, thereby\nintroducing physical constraints. When the training set lacks ground truth for\nintrinsic decomposition, we introduce an unsupervised module to ensure that the\nintrinsic outputs are satisfactory. Our method outperforms the state-of-the-art\nmethods in performance, as tested on both existing datasets and our newly\ndeveloped datasets. Furthermore, pretraining our method or other prior methods\nusing our synthetic dataset can enhance their performance on other datasets.\nSince our method can accommodate any light conditions, it is capable of\nproducing animated results. The dataset, method, and videos are publicly\navailable.\n","authors":["Yixiong Yang","Hassan Ahmed Sial","Ramon Baldrich","Maria Vanrell"],"pdf_url":"https://arxiv.org/pdf/2409.18770v1.pdf","comment":"Accepted for publication as a Regular paper in the IEEE Transactions\n on Multimedia"},{"id":"http://arxiv.org/abs/2409.18769v1","updated":"2024-09-27T14:14:16Z","published":"2024-09-27T14:14:16Z","title":"State-of-the-Art Periorbital Distance Prediction and Disease\n Classification Using Periorbital Features","summary":" Periorbital distances and features around the eyes and lids hold valuable\ninformation for disease quantification and monitoring of surgical and medical\nintervention. These distances are commonly measured manually, a process that is\nboth subjective and highly time-consuming. Here, we set out to developed three\ndeep-learning methods for segmentation and periorbital distance prediction, and\nalso evaluate the utility of periorbital distances for disease classification.\nThe MAE of our deep learning predicted distances was less than or very close to\nthe error observed between trained human annotators. We compared our models to\nthe current state-of-the-art (SOTA) method for periorbital distance prediction\nand found that our methods outperformed SOTA on all of our datasets on all but\none periorbital measurement. We also show that robust segmentation can be\nachieved on diseased eyes using models trained on open-source, healthy eyes,\nand that periorbital distances have can be used as high-quality features in\ndownstream classification models. Leveraging segmentation networks as\nintermediary steps in classification has broad implications for increasing the\ngeneralizability of classification models in ophthalmic plastic and\ncraniofacial surgery by avoiding the out-of-distribution problem observed in\ntraditional convolutional neural networks.\n","authors":["George R. Nahass","Ghasem Yazdanpanah","Madison Cheung","Alex Palacios","Jeffery Peterson","Kevin Heinze","Sasha Hubschman","Chad A. Purnell","Pete Setabutr","Ann Q. Tran","Darvin Yi"],"pdf_url":"https://arxiv.org/pdf/2409.18769v1.pdf","comment":"16 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.13833v2","updated":"2024-09-27T14:04:35Z","published":"2023-11-23T07:33:38Z","title":"Lego: Learning to Disentangle and Invert Personalized Concepts Beyond\n Object Appearance in Text-to-Image Diffusion Models","summary":" Text-to-Image (T2I) models excel at synthesizing concepts such as nouns,\nappearances, and styles. To enable customized content creation based on a few\nexample images of a concept, methods such as Textual Inversion and DreamBooth\ninvert the desired concept and enable synthesizing it in new scenes. However,\ninverting personalized concepts that go beyond object appearance and style\n(adjectives and verbs) through natural language remains a challenge. Two key\ncharacteristics of these concepts contribute to the limitations of current\ninversion methods. 1) Adjectives and verbs are entangled with nouns (subject)\nand can hinder appearance-based inversion methods, where the subject appearance\nleaks into the concept embedding, and 2) describing such concepts often extends\nbeyond single word embeddings.\n In this study, we introduce Lego, a textual inversion method designed to\ninvert subject-entangled concepts from a few example images. Lego disentangles\nconcepts from their associated subjects using a simple yet effective Subject\nSeparation step and employs a Context Loss that guides the inversion of\nsingle/multi-embedding concepts. In a thorough user study, Lego-generated\nconcepts were preferred over 70% of the time when compared to the baseline in\nterms of authentically generating concepts according to a reference.\nAdditionally, visual question answering using an LLM suggested Lego-generated\nconcepts are better aligned with the text description of the concept.\n","authors":["Saman Motamed","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.13833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18764v1","updated":"2024-09-27T14:02:48Z","published":"2024-09-27T14:02:48Z","title":"Charting the Future: Using Chart Question-Answering for Scalable\n Evaluation of LLM-Driven Data Visualizations","summary":" We propose a novel framework that leverages Visual Question Answering (VQA)\nmodels to automate the evaluation of LLM-generated data visualizations.\nTraditional evaluation methods often rely on human judgment, which is costly\nand unscalable, or focus solely on data accuracy, neglecting the effectiveness\nof visual communication. By employing VQA models, we assess data representation\nquality and the general communicative clarity of charts. Experiments were\nconducted using two leading VQA benchmark datasets, ChartQA and PlotQA, with\nvisualizations generated by OpenAI's GPT-3.5 Turbo and Meta's Llama 3.1\n70B-Instruct models. Our results indicate that LLM-generated charts do not\nmatch the accuracy of the original non-LLM-generated charts based on VQA\nperformance measures. Moreover, while our results demonstrate that few-shot\nprompting significantly boosts the accuracy of chart generation, considerable\nprogress remains to be made before LLMs can fully match the precision of\nhuman-generated graphs. This underscores the importance of our work, which\nexpedites the research process by enabling rapid iteration without the need for\nhuman annotation, thus accelerating advancements in this field.\n","authors":["James Ford","Xingmeng Zhao","Dan Schumacher","Anthony Rios"],"pdf_url":"https://arxiv.org/pdf/2409.18764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11540v3","updated":"2024-09-27T13:55:04Z","published":"2024-08-21T11:39:18Z","title":"DeRainGS: Gaussian Splatting for Enhanced Scene Reconstruction in Rainy\n Environments","summary":" Reconstruction under adverse rainy conditions poses significant challenges\ndue to reduced visibility and the distortion of visual perception. These\nconditions can severely impair the quality of geometric maps, which is\nessential for applications ranging from autonomous planning to environmental\nmonitoring. In response to these challenges, this study introduces the novel\ntask of 3D Reconstruction in Rainy Environments (3DRRE), specifically designed\nto address the complexities of reconstructing 3D scenes under rainy conditions.\nTo benchmark this task, we construct the HydroViews dataset that comprises a\ndiverse collection of both synthesized and real-world scene images\ncharacterized by various intensities of rain streaks and raindrops.\nFurthermore, we propose DeRainGS, the first 3DGS method tailored for\nreconstruction in adverse rainy environments. Extensive experiments across a\nwide range of rain scenarios demonstrate that our method delivers\nstate-of-the-art performance, remarkably outperforming existing occlusion-free\nmethods.\n","authors":["Shuhong Liu","Xiang Chen","Hongming Chen","Quanfeng Xu","Mingrui Li"],"pdf_url":"https://arxiv.org/pdf/2408.11540v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08167v2","updated":"2024-09-27T13:52:36Z","published":"2024-09-12T15:58:28Z","title":"High-Frequency Anti-DreamBooth: Robust Defense against Personalized\n Image Synthesis","summary":" Recently, text-to-image generative models have been misused to create\nunauthorized malicious images of individuals, posing a growing social problem.\nPrevious solutions, such as Anti-DreamBooth, add adversarial noise to images to\nprotect them from being used as training data for malicious generation.\nHowever, we found that the adversarial noise can be removed by adversarial\npurification methods such as DiffPure. Therefore, we propose a new adversarial\nattack method that adds strong perturbation on the high-frequency areas of\nimages to make it more robust to adversarial purification. Our experiment\nshowed that the adversarial images retained noise even after adversarial\npurification, hindering malicious image generation.\n","authors":["Takuto Onikubo","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2409.08167v2.pdf","comment":"ECCV 2024 Workshop The Dark Side of Generative AIs and Beyond"},{"id":"http://arxiv.org/abs/2409.18753v1","updated":"2024-09-27T13:44:55Z","published":"2024-09-27T13:44:55Z","title":"Enhancing Explainability in Multimodal Large Language Models Using\n Ontological Context","summary":" Recently, there has been a growing interest in Multimodal Large Language\nModels (MLLMs) due to their remarkable potential in various tasks integrating\ndifferent modalities, such as image and text, as well as applications such as\nimage captioning and visual question answering. However, such models still face\nchallenges in accurately captioning and interpreting specific visual concepts\nand classes, particularly in domain-specific applications. We argue that\nintegrating domain knowledge in the form of an ontology can significantly\naddress these issues. In this work, as a proof of concept, we propose a new\nframework that combines ontology with MLLMs to classify images of plant\ndiseases. Our method uses concepts about plant diseases from an existing\ndisease ontology to query MLLMs and extract relevant visual concepts from\nimages. Then, we use the reasoning capabilities of the ontology to classify the\ndisease according to the identified concepts. Ensuring that the model\naccurately uses the concepts describing the disease is crucial in\ndomain-specific applications. By employing an ontology, we can assist in\nverifying this alignment. Additionally, using the ontology's inference\ncapabilities increases transparency, explainability, and trust in the\ndecision-making process while serving as a judge by checking if the annotations\nof the concepts by MLLMs are aligned with those in the ontology and displaying\nthe rationales behind their errors. Our framework offers a new direction for\nsynergizing ontologies and MLLMs, supported by an empirical study using\ndifferent well-known MLLMs.\n","authors":["Jihen Amara","Birgitta König-Ries","Sheeba Samuel"],"pdf_url":"https://arxiv.org/pdf/2409.18753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15117v2","updated":"2024-09-27T13:32:18Z","published":"2024-09-23T15:23:01Z","title":"Diffusion-based RGB-D Semantic Segmentation with Deformable Attention\n Transformer","summary":" Vision-based perception and reasoning is essential for scene understanding in\nany autonomous system. RGB and depth images are commonly used to capture both\nthe semantic and geometric features of the environment. Developing methods to\nreliably interpret this data is critical for real-world applications, where\nnoisy measurements are often unavoidable. In this work, we introduce a\ndiffusion-based framework to address the RGB-D semantic segmentation problem.\nAdditionally, we demonstrate that utilizing a Deformable Attention Transformer\nas the encoder to extract features from depth images effectively captures the\ncharacteristics of invalid regions in depth measurements. Our generative\nframework shows a greater capacity to model the underlying distribution of\nRGB-D images, achieving robust performance in challenging scenarios with\nsignificantly less training time compared to discriminative methods.\nExperimental results indicate that our approach achieves State-of-the-Art\nperformance on both the NYUv2 and SUN-RGBD datasets in general and especially\nin the most challenging of their image data. Our project page will be available\nat https://diffusionmms.github.io/\n","authors":["Minh Bui","Kostas Alexis"],"pdf_url":"https://arxiv.org/pdf/2409.15117v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18730v1","updated":"2024-09-27T13:23:17Z","published":"2024-09-27T13:23:17Z","title":"Effectiveness of learning-based image codecs on fingerprint storage","summary":" The success of learning-based coding techniques and the development of\nlearning-based image coding standards, such as JPEG-AI, point towards the\nadoption of such solutions in different fields, including the storage of\nbiometric data, like fingerprints. However, the peculiar nature of\nlearning-based compression artifacts poses several issues concerning their\nimpact and effectiveness on extracting biometric features and landmarks, e.g.,\nminutiae. This problem is utterly stressed by the fact that most models are\ntrained on natural color images, whose characteristics are very different from\nusual biometric images, e.g, fingerprint or iris pictures. As a matter of fact,\nthese issues are deemed to be accurately questioned and investigated, being\nsuch analysis still largely unexplored.\n This study represents the first investigation about the adaptability of\nlearning-based image codecs in the storage of fingerprint images by measuring\nits impact on the extraction and characterization of minutiae. Experimental\nresults show that at a fixed rate point, learned solutions considerably\noutperform previous fingerprint coding standards, like JPEG2000, both in terms\nof distortion and minutiae preservation. Indeed, experimental results prove\nthat the peculiarities of learned compression artifacts do not prevent\nautomatic fingerprint identification (since minutiae types and locations are\nnot significantly altered), nor do compromise image quality for human visual\ninspection (as they gain in terms of BD rate and PSNR of 47.8% and +3.97dB\nrespectively).\n","authors":["Daniele Mari","Saverio Cavasin","Simone Milani","Mauro Conti"],"pdf_url":"https://arxiv.org/pdf/2409.18730v1.pdf","comment":"Accepted ad Wifs 2024"},{"id":"http://arxiv.org/abs/2409.18731v1","updated":"2024-09-27T13:23:17Z","published":"2024-09-27T13:23:17Z","title":"A Generalized Tensor Formulation for Hyperspectral Image\n Super-Resolution Under General Spatial Blurring","summary":" Hyperspectral super-resolution is commonly accomplished by the fusing of a\nhyperspectral imaging of low spatial resolution with a multispectral image of\nhigh spatial resolution, and many tensor-based approaches to this task have\nbeen recently proposed. Yet, it is assumed in such tensor-based methods that\nthe spatial-blurring operation that creates the observed hyperspectral image\nfrom the desired super-resolved image is separable into independent horizontal\nand vertical blurring. Recent work has argued that such separable spatial\ndegradation is ill-equipped to model the operation of real sensors which may\nexhibit, for example, anisotropic blurring. To accommodate this fact, a\ngeneralized tensor formulation based on a Kronecker decomposition is proposed\nto handle any general spatial-degradation matrix, including those that are not\nseparable as previously assumed. Analysis of the generalized formulation\nreveals conditions under which exact recovery of the desired super-resolved\nimage is guaranteed, and a practical algorithm for such recovery, driven by a\nblockwise-group-sparsity regularization, is proposed. Extensive experimental\nresults demonstrate that the proposed generalized tensor approach outperforms\nnot only traditional matrix-based techniques but also state-of-the-art\ntensor-based methods; the gains with respect to the latter are especially\nsignificant in cases of anisotropic spatial blurring.\n","authors":["Yinjian Wang","Wei Li","Yuanyuan Gui","Qian Du","James E. Fowler"],"pdf_url":"https://arxiv.org/pdf/2409.18731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14180v2","updated":"2024-09-27T13:12:04Z","published":"2024-08-26T11:08:44Z","title":"I2EBench: A Comprehensive Benchmark for Instruction-based Image Editing","summary":" Significant progress has been made in the field of Instruction-based Image\nEditing (IIE). However, evaluating these models poses a significant challenge.\nA crucial requirement in this field is the establishment of a comprehensive\nevaluation benchmark for accurately assessing editing results and providing\nvaluable insights for its further development. In response to this need, we\npropose I2EBench, a comprehensive benchmark designed to automatically evaluate\nthe quality of edited images produced by IIE models from multiple dimensions.\nI2EBench consists of 2,000+ images for editing, along with 4,000+ corresponding\noriginal and diverse instructions. It offers three distinctive characteristics:\n1) Comprehensive Evaluation Dimensions: I2EBench comprises 16 evaluation\ndimensions that cover both high-level and low-level aspects, providing a\ncomprehensive assessment of each IIE model. 2) Human Perception Alignment: To\nensure the alignment of our benchmark with human perception, we conducted an\nextensive user study for each evaluation dimension. 3) Valuable Research\nInsights: By analyzing the advantages and disadvantages of existing IIE models\nacross the 16 dimensions, we offer valuable research insights to guide future\ndevelopment in the field. We will open-source I2EBench, including all\ninstructions, input images, human annotations, edited images from all evaluated\nmethods, and a simple script for evaluating the results from new IIE models.\nThe code, dataset and generated images from all IIE models are provided in\ngithub: https://github.com/cocoshe/I2EBench.\n","authors":["Yiwei Ma","Jiayi Ji","Ke Ye","Weihuang Lin","Zhibin Wang","Yonghan Zheng","Qiang Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.14180v2.pdf","comment":"NeurIPS2024, 15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18715v1","updated":"2024-09-27T12:59:29Z","published":"2024-09-27T12:59:29Z","title":"Multi-modal Medical Image Fusion For Non-Small Cell Lung Cancer\n Classification","summary":" The early detection and nuanced subtype classification of non-small cell lung\ncancer (NSCLC), a predominant cause of cancer mortality worldwide, is a\ncritical and complex issue. In this paper, we introduce an innovative\nintegration of multi-modal data, synthesizing fused medical imaging (CT and PET\nscans) with clinical health records and genomic data. This unique fusion\nmethodology leverages advanced machine learning models, notably MedClip and\nBEiT, for sophisticated image feature extraction, setting a new standard in\ncomputational oncology. Our research surpasses existing approaches, as\nevidenced by a substantial enhancement in NSCLC detection and classification\nprecision. The results showcase notable improvements across key performance\nmetrics, including accuracy, precision, recall, and F1-score. Specifically, our\nleading multi-modal classifier model records an impressive accuracy of 94.04%.\nWe believe that our approach has the potential to transform NSCLC diagnostics,\nfacilitating earlier detection and more effective treatment planning and,\nultimately, leading to superior patient outcomes in lung cancer care.\n","authors":["Salma Hassan","Hamad Al Hammadi","Ibrahim Mohammed","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2409.18715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18701v1","updated":"2024-09-27T12:44:06Z","published":"2024-09-27T12:44:06Z","title":"3DPX: Single Panoramic X-ray Analysis Guided by 3D Oral Structure\n Reconstruction","summary":" Panoramic X-ray (PX) is a prevalent modality in dentistry practice owing to\nits wide availability and low cost. However, as a 2D projection of a 3D\nstructure, PX suffers from anatomical information loss and PX diagnosis is\nlimited compared to that with 3D imaging modalities. 2D-to-3D reconstruction\nmethods have been explored for the ability to synthesize the absent 3D\nanatomical information from 2D PX for use in PX image analysis. However, there\nare challenges in leveraging such 3D synthesized reconstructions. First,\ninferring 3D depth from 2D images remains a challenging task with limited\naccuracy. The second challenge is the joint analysis of 2D PX with its 3D\nsynthesized counterpart, with the aim to maximize the 2D-3D synergy while\nminimizing the errors arising from the synthesized image. In this study, we\npropose a new method termed 3DPX - PX image analysis guided by 2D-to-3D\nreconstruction, to overcome these challenges. 3DPX consists of (i) a novel\nprogressive reconstruction network to improve 2D-to-3D reconstruction and, (ii)\na contrastive-guided bidirectional multimodality alignment module for 3D-guided\n2D PX classification and segmentation tasks. The reconstruction network\nprogressively reconstructs 3D images with knowledge imposed on the intermediate\nreconstructions at multiple pyramid levels and incorporates Multilayer\nPerceptrons to improve semantic understanding. The downstream networks leverage\nthe reconstructed images as 3D anatomical guidance to the PX analysis through\nfeature alignment, which increases the 2D-3D synergy with bidirectional feature\nprojection and decease the impact of potential errors with contrastive\nguidance. Extensive experiments on two oral datasets involving 464 studies\ndemonstrate that 3DPX outperforms the state-of-the-art methods in various tasks\nincluding 2D-to-3D reconstruction, PX classification and lesion segmentation.\n","authors":["Xiaoshuang Li","Zimo Huang","Mingyuan Meng","Eduardo Delamare","Dagan Feng","Lei Bi","Bin Sheng","Lingyong Jiang","Bo Li","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14224v2","updated":"2024-09-27T12:31:24Z","published":"2024-07-19T11:48:36Z","title":"Hierarchical Windowed Graph Attention Network and a Large Scale Dataset\n for Isolated Indian Sign Language Recognition","summary":" Automatic Sign Language (SL) recognition is an important task in the computer\nvision community. To build a robust SL recognition system, we need a\nconsiderable amount of data which is lacking particularly in Indian sign\nlanguage (ISL). In this paper, we introduce a large-scale isolated ISL dataset\nand a novel SL recognition model based on skeleton graph structure. The dataset\ncovers 2002 daily used common words in the deaf community recorded by 20 (10\nmale and 10 female) deaf adult signers (contains 40033 videos). We propose a SL\nrecognition model namely Hierarchical Windowed Graph Attention Network (HWGAT)\nby utilizing the human upper body skeleton graph. The HWGAT tries to capture\ndistinctive motions by giving attention to different body parts induced by the\nhuman skeleton graph. The utility of the proposed dataset and the usefulness of\nour model are evaluated through extensive experiments. We pre-trained the\nproposed model on the presented dataset and fine-tuned it across different sign\nlanguage datasets further boosting the performance of 1.10, 0.46, 0.78, and\n6.84 percentage points on INCLUDE, LSA64, AUTSL and WLASL respectively compared\nto the existing state-of-the-art keypoints-based models.\n","authors":["Suvajit Patra","Arkadip Maitra","Megha Tiwari","K. Kumaran","Swathy Prabhu","Swami Punyeshwarananda","Soumitra Samanta"],"pdf_url":"https://arxiv.org/pdf/2407.14224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18694v1","updated":"2024-09-27T12:28:47Z","published":"2024-09-27T12:28:47Z","title":"Learning from Pattern Completion: Self-supervised Controllable\n Generation","summary":" The human brain exhibits a strong ability to spontaneously associate\ndifferent visual attributes of the same or similar visual scene, such as\nassociating sketches and graffiti with real-world visual objects, usually\nwithout supervising information. In contrast, in the field of artificial\nintelligence, controllable generation methods like ControlNet heavily rely on\nannotated training datasets such as depth maps, semantic segmentation maps, and\nposes, which limits the method's scalability. Inspired by the neural mechanisms\nthat may contribute to the brain's associative power, specifically the cortical\nmodularization and hippocampal pattern completion, here we propose a\nself-supervised controllable generation (SCG) framework. Firstly, we introduce\nan equivariant constraint to promote inter-module independence and intra-module\ncorrelation in a modular autoencoder network, thereby achieving functional\nspecialization. Subsequently, based on these specialized modules, we employ a\nself-supervised pattern completion approach for controllable generation\ntraining. Experimental results demonstrate that the proposed modular\nautoencoder effectively achieves functional specialization, including the\nmodular processing of color, brightness, and edge detection, and exhibits\nbrain-like features including orientation selectivity, color antagonism, and\ncenter-surround receptive fields. Through self-supervised training, associative\ngeneration capabilities spontaneously emerge in SCG, demonstrating excellent\ngeneralization ability to various tasks such as associative generation on\npainting, sketches, and ancient graffiti. Compared to the previous\nrepresentative method ControlNet, our proposed approach not only demonstrates\nsuperior robustness in more challenging high-noise scenarios but also possesses\nmore promising scalability potential due to its self-supervised manner.\n","authors":["Zhiqiang Chen","Guofan Fan","Jinying Gao","Lei Ma","Bo Lei","Tiejun Huang","Shan Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12514v3","updated":"2024-09-27T12:23:06Z","published":"2024-09-19T07:10:18Z","title":"TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for\n Robotic Manipulation","summary":" Vision-Language-Action (VLA) models have shown remarkable potential in\nvisuomotor control and instruction comprehension through end-to-end learning\nprocesses. However, current VLA models face significant challenges: they are\nslow during inference and require extensive pre-training on large amounts of\nrobotic data, making real-world deployment difficult. In this paper, we\nintroduce a new family of compact vision-language-action models, called\nTinyVLA, which offers two key advantages over existing VLA models: (1) faster\ninference speeds, and (2) improved data efficiency, eliminating the need for\npre-training stage. Our framework incorporates two essential components to\nbuild TinyVLA: (1) initializing the policy backbone with robust, high-speed\nmultimodal models, and (2) integrating a diffusion policy decoder during\nfine-tuning to enable precise robot actions. We conducted extensive evaluations\nof TinyVLA in both simulation and on real robots, demonstrating that our\napproach significantly outperforms the state-of-the-art VLA model, OpenVLA, in\nterms of speed and data efficiency, while delivering comparable or superior\nperformance. Additionally, TinyVLA exhibits strong generalization capabilities\nacross various dimensions, including language instructions, novel objects,\nunseen positions, changes in object appearance, background variations, and\nenvironmental shifts, often matching or exceeding the performance of OpenVLA.\nWe believe that \\methodname offers an interesting perspective on utilizing\npre-trained multimodal models for policy learning. Our project is at\nhttps://tiny-vla.github.io.\n","authors":["Junjie Wen","Yichen Zhu","Jinming Li","Minjie Zhu","Kun Wu","Zhiyuan Xu","Ning Liu","Ran Cheng","Chaomin Shen","Yaxin Peng","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2409.12514v3.pdf","comment":"add more citations"},{"id":"http://arxiv.org/abs/2403.06069v2","updated":"2024-09-27T12:23:04Z","published":"2024-03-10T03:22:57Z","title":"Implicit Image-to-Image Schrodinger Bridge for Image Restoration","summary":" Diffusion-based models are widely recognized for their effectiveness in image\nrestoration tasks; however, their iterative denoising process, which begins\nfrom Gaussian noise, often results in slow inference speeds. The Image-to-Image\nSchr\\\"odinger Bridge (I$^2$SB) presents a promising alternative by starting the\ngenerative process from corrupted images and leveraging training techniques\nfrom score-based diffusion models. In this paper, we introduce the Implicit\nImage-to-Image Schr\\\"odinger Bridge (I$^3$SB) to further accelerate the\ngenerative process of I$^2$SB. I$^3$SB reconfigures the generative process into\na non-Markovian framework by incorporating the initial corrupted image into\neach step, while ensuring that the marginal distribution aligns with that of\nI$^2$SB. This allows for the direct use of the pretrained network from I$^2$SB.\nExtensive experiments on natural images, human face images, and medical images\nvalidate the acceleration benefits of I$^3$SB. Compared to I$^2$SB, I$^3$SB\nachieves the same perceptual quality with fewer generative steps, while\nmaintaining equal or improved fidelity to the ground truth.\n","authors":["Yuang Wang","Siyeop Yoon","Pengfei Jin","Matthew Tivnan","Sifan Song","Zhennong Chen","Rui Hu","Li Zhang","Quanzheng Li","Zhiqiang Chen","Dufan Wu"],"pdf_url":"https://arxiv.org/pdf/2403.06069v2.pdf","comment":"23 pages, 8 figures, submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2409.07271v2","updated":"2024-09-27T12:21:26Z","published":"2024-09-11T13:46:35Z","title":"CCFExp: Facial Image Synthesis with Cycle Cross-Fusion Diffusion Model\n for Facial Paralysis Individuals","summary":" Facial paralysis is a debilitating condition that affects the movement of\nfacial muscles, leading to a significant loss of facial expressions. Currently,\nthe diagnosis of facial paralysis remains a challenging task, often relying\nheavily on the subjective judgment and experience of clinicians, which can\nintroduce variability and uncertainty in the assessment process. One promising\napplication in real-life situations is the automatic estimation of facial\nparalysis. However, the scarcity of facial paralysis datasets limits the\ndevelopment of robust machine learning models for automated diagnosis and\ntherapeutic interventions. To this end, this study aims to synthesize a\nhigh-quality facial paralysis dataset to address this gap, enabling more\naccurate and efficient algorithm training. Specifically, a novel Cycle\nCross-Fusion Expression Generative Model (CCFExp) based on the diffusion model\nis proposed to combine different features of facial information and enhance the\nvisual details of facial appearance and texture in facial regions, thus\ncreating synthetic facial images that accurately represent various degrees and\ntypes of facial paralysis. We have qualitatively and quantitatively evaluated\nthe proposed method on the commonly used public clinical datasets of facial\nparalysis to demonstrate its effectiveness. Experimental results indicate that\nthe proposed method surpasses state-of-the-art methods, generating more\nrealistic facial images and maintaining identity consistency.\n","authors":["Weixiang Gao","Yifan Xia"],"pdf_url":"https://arxiv.org/pdf/2409.07271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18686v1","updated":"2024-09-27T12:20:29Z","published":"2024-09-27T12:20:29Z","title":"A Novel Unified Architecture for Low-Shot Counting by Detection and\n Segmentation","summary":" Low-shot object counters estimate the number of objects in an image using few\nor no annotated exemplars. Objects are localized by matching them to\nprototypes, which are constructed by unsupervised image-wide object appearance\naggregation. Due to potentially diverse object appearances, the existing\napproaches often lead to overgeneralization and false positive detections.\nFurthermore, the best-performing methods train object localization by a\nsurrogate loss, that predicts a unit Gaussian at each object center. This loss\nis sensitive to annotation error, hyperparameters and does not directly\noptimize the detection task, leading to suboptimal counts. We introduce GeCo, a\nnovel low-shot counter that achieves accurate object detection, segmentation,\nand count estimation in a unified architecture. GeCo robustly generalizes the\nprototypes across objects appearances through a novel dense object query\nformulation. In addition, a novel counting loss is proposed, that directly\noptimizes the detection task and avoids the issues of the standard surrogate\nloss. GeCo surpasses the leading few-shot detection-based counters by\n$\\sim$25\\% in the total count MAE, achieves superior detection accuracy and\nsets a new solid state-of-the-art result across all low-shot counting setups.\n","authors":["Jer Pelhan","Alan Lukežič","Vitjan Zavrtanik","Matej Kristan"],"pdf_url":"https://arxiv.org/pdf/2409.18686v1.pdf","comment":"Accepted to NeurIPS2024"},{"id":"http://arxiv.org/abs/2403.06164v2","updated":"2024-09-27T12:08:07Z","published":"2024-03-10T10:30:34Z","title":"Platypose: Calibrated Zero-Shot Multi-Hypothesis 3D Human Motion\n Estimation","summary":" Single camera 3D pose estimation is an ill-defined problem due to inherent\nambiguities from depth, occlusion or keypoint noise. Multi-hypothesis pose\nestimation accounts for this uncertainty by providing multiple 3D poses\nconsistent with the 2D measurements. Current research has predominantly\nconcentrated on generating multiple hypotheses for single frame static pose\nestimation or single hypothesis motion estimation. In this study we focus on\nthe new task of multi-hypothesis motion estimation. Multi-hypothesis motion\nestimation is not simply multi-hypothesis pose estimation applied to multiple\nframes, which would ignore temporal correlation across frames. Instead, it\nrequires distributions which are capable of generating temporally consistent\nsamples, which is significantly more challenging than multi-hypothesis pose\nestimation or single-hypothesis motion estimation. To this end, we introduce\nPlatypose, a framework that uses a diffusion model pretrained on 3D human\nmotion sequences for zero-shot 3D pose sequence estimation. Platypose\noutperforms baseline methods on multiple hypotheses for motion estimation.\nAdditionally, Platypose also achieves state-of-the-art calibration and\ncompetitive joint error when tested on static poses from Human3.6M,\nMPI-INF-3DHP and 3DPW. Finally, because it is zero-shot, our method generalizes\nflexibly to different settings such as multi-camera inference.\n","authors":["Paweł A. Pierzchlewicz","Caio O. da Silva","R. James Cotton","Fabian H. Sinz"],"pdf_url":"https://arxiv.org/pdf/2403.06164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18674v1","updated":"2024-09-27T12:02:28Z","published":"2024-09-27T12:02:28Z","title":"Image-guided topic modeling for interpretable privacy classification","summary":" Predicting and explaining the private information contained in an image in\nhuman-understandable terms is a complex and contextual task. This task is\nchallenging even for large language models. To facilitate the understanding of\nprivacy decisions, we propose to predict image privacy based on a set of\nnatural language content descriptors. These content descriptors are associated\nwith privacy scores that reflect how people perceive image content. We generate\ndescriptors with our novel Image-guided Topic Modeling (ITM) approach. ITM\nleverages, via multimodality alignment, both vision information and image\ntextual descriptions from a vision language model. We use the ITM-generated\ndescriptors to learn a privacy predictor, Priv$\\times$ITM, whose decisions are\ninterpretable by design. Our Priv$\\times$ITM classifier outperforms the\nreference interpretable method by 5 percentage points in accuracy and performs\ncomparably to the current non-interpretable state-of-the-art model.\n","authors":["Alina Elena Baia","Andrea Cavallaro"],"pdf_url":"https://arxiv.org/pdf/2409.18674v1.pdf","comment":"Paper accepted at the eXCV Workshop at ECCV 2024. Supplementary\n material included. Code available at https://github.com/idiap/itm"},{"id":"http://arxiv.org/abs/2409.18673v1","updated":"2024-09-27T11:59:00Z","published":"2024-09-27T11:59:00Z","title":"Exploiting Motion Prior for Accurate Pose Estimation of Dashboard\n Cameras","summary":" Dashboard cameras (dashcams) record millions of driving videos daily,\noffering a valuable potential data source for various applications, including\ndriving map production and updates. A necessary step for utilizing these\ndashcam data involves the estimation of camera poses. However, the low-quality\nimages captured by dashcams, characterized by motion blurs and dynamic objects,\npose challenges for existing image-matching methods in accurately estimating\ncamera poses. In this study, we propose a precise pose estimation method for\ndashcam images, leveraging the inherent camera motion prior. Typically, image\nsequences captured by dash cameras exhibit pronounced motion prior, such as\nforward movement or lateral turns, which serve as essential cues for\ncorrespondence estimation. Building upon this observation, we devise a pose\nregression module aimed at learning camera motion prior, subsequently\nintegrating these prior into both correspondences and pose estimation\nprocesses. The experiment shows that, in real dashcams dataset, our method is\n22% better than the baseline for pose estimation in AUC5\\textdegree, and it can\nestimate poses for 19% more images with less reprojection error in Structure\nfrom Motion (SfM).\n","authors":["Yipeng Lu","Yifan Zhao","Haiping Wang","Zhiwei Ruan","Yuan Liu","Zhen Dong","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18653v1","updated":"2024-09-27T11:35:50Z","published":"2024-09-27T11:35:50Z","title":"When SAM2 Meets Video Camouflaged Object Segmentation: A Comprehensive\n Evaluation and Adaptation","summary":" This study investigates the application and performance of the Segment\nAnything Model 2 (SAM2) in the challenging task of video camouflaged object\nsegmentation (VCOS). VCOS involves detecting objects that blend seamlessly in\nthe surroundings for videos, due to similar colors and textures, poor light\nconditions, etc. Compared to the objects in normal scenes, camouflaged objects\nare much more difficult to detect. SAM2, a video foundation model, has shown\npotential in various tasks. But its effectiveness in dynamic camouflaged\nscenarios remains under-explored. This study presents a comprehensive study on\nSAM2's ability in VCOS. First, we assess SAM2's performance on camouflaged\nvideo datasets using different models and prompts (click, box, and mask).\nSecond, we explore the integration of SAM2 with existing multimodal large\nlanguage models (MLLMs) and VCOS methods. Third, we specifically adapt SAM2 by\nfine-tuning it on the video camouflaged dataset. Our comprehensive experiments\ndemonstrate that SAM2 has excellent zero-shot ability of detecting camouflaged\nobjects in videos. We also show that this ability could be further improved by\nspecifically adjusting SAM2's parameters for VCOS. The code will be available\nat https://github.com/zhoustan/SAM2-VCOS\n","authors":["Yuli Zhou","Guolei Sun","Yawei Li","Luca Benini","Ender Konukoglu"],"pdf_url":"https://arxiv.org/pdf/2409.18653v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2405.17461v2","updated":"2024-09-27T11:34:05Z","published":"2024-05-23T05:25:45Z","title":"EMR-Merging: Tuning-Free High-Performance Model Merging","summary":" The success of pretrain-finetune paradigm brings about the release of\nnumerous model weights. In this case, merging models finetuned on different\ntasks to enable a single model with multi-task capabilities is gaining\nincreasing attention for its practicability. Existing model merging methods\nusually suffer from (1) significant performance degradation or (2) requiring\ntuning by additional data or training. In this paper, we rethink and analyze\nthe existing model merging paradigm. We discover that using a single model's\nweights can hardly simulate all the models' performance. To tackle this issue,\nwe propose Elect, Mask & Rescale-Merging (EMR-Merging). We first (a) elect a\nunified model from all the model weights and then (b) generate extremely\nlightweight task-specific modulators, including masks and rescalers, to align\nthe direction and magnitude between the unified model and each specific model,\nrespectively. EMR-Merging is tuning-free, thus requiring no data availability\nor any additional training while showing impressive performance. We find that\nEMR-Merging shows outstanding performance compared to existing merging methods\nunder different classical and newly-established settings, including merging\ndifferent numbers of vision models (up to 30), NLP models, PEFT models, and\nmulti-modal models.\n","authors":["Chenyu Huang","Peng Ye","Tao Chen","Tong He","Xiangyu Yue","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2405.17461v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.13978v2","updated":"2024-09-27T11:28:18Z","published":"2024-09-21T02:01:55Z","title":"FracGM: A Fast Fractional Programming Technique for Geman-McClure Robust\n Estimator","summary":" Robust estimation is essential in computer vision, robotics, and navigation,\naiming to minimize the impact of outlier measurements for improved accuracy. We\npresent a fast algorithm for Geman-McClure robust estimation, FracGM,\nleveraging fractional programming techniques. This solver reformulates the\noriginal non-convex fractional problem to a convex dual problem and a linear\nequation system, iteratively solving them in an alternating optimization\npattern. Compared to graduated non-convexity approaches, this strategy exhibits\na faster convergence rate and better outlier rejection capability. In addition,\nthe global optimality of the proposed solver can be guaranteed under given\nconditions. We demonstrate the proposed FracGM solver with Wahba's rotation\nproblem and 3-D point-cloud registration along with relaxation pre-processing\nand projection post-processing. Compared to state-of-the-art algorithms, when\nthe outlier rates increase from 20% to 80%, FracGM shows 53% and 88% lower\nrotation and translation increases. In real-world scenarios, FracGM achieves\nbetter results in 13 out of 18 outcomes, while having a 19.43% improvement in\nthe computation time.\n","authors":["Bang-Shien Chen","Yu-Kai Lin","Jian-Yu Chen","Chih-Wei Huang","Jann-Long Chern","Ching-Cherng Sun"],"pdf_url":"https://arxiv.org/pdf/2409.13978v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.18642v1","updated":"2024-09-27T11:20:20Z","published":"2024-09-27T11:20:20Z","title":"Enhanced Convolution Neural Network with Optimized Pooling and\n Hyperparameter Tuning for Network Intrusion Detection","summary":" Network Intrusion Detection Systems (NIDS) are essential for protecting\ncomputer networks from malicious activities, including Denial of Service (DoS),\nProbing, User-to-Root (U2R), and Remote-to-Local (R2L) attacks. Without\neffective NIDS, networks are vulnerable to significant security breaches and\ndata loss. Machine learning techniques provide a promising approach to enhance\nNIDS by automating threat detection and improving accuracy. In this research,\nwe propose an Enhanced Convolutional Neural Network (EnCNN) for NIDS and\nevaluate its performance using the KDDCUP'99 dataset. Our methodology includes\ncomprehensive data preprocessing, exploratory data analysis (EDA), and feature\nengineering. We compare EnCNN with various machine learning algorithms,\nincluding Logistic Regression, Decision Trees, Support Vector Machines (SVM),\nand ensemble methods like Random Forest, AdaBoost, and Voting Ensemble. The\nresults show that EnCNN significantly improves detection accuracy, with a\nnotable 10% increase over state-of-art approaches. This demonstrates the\neffectiveness of EnCNN in real-time network intrusion detection, offering a\nrobust solution for identifying and mitigating security threats, and enhancing\noverall network resilience.\n","authors":["Ayush Kumar Sharma","Sourav Patel","Supriya Bharat Wakchaure","Abirami S"],"pdf_url":"https://arxiv.org/pdf/2409.18642v1.pdf","comment":"7 Pages , 2 figures , 4 Tables , Conference paper"},{"id":"http://arxiv.org/abs/2409.18636v1","updated":"2024-09-27T11:07:48Z","published":"2024-09-27T11:07:48Z","title":"Unsupervised Fingerphoto Presentation Attack Detection With Diffusion\n Models","summary":" Smartphone-based contactless fingerphoto authentication has become a reliable\nalternative to traditional contact-based fingerprint biometric systems owing to\nrapid advances in smartphone camera technology. Despite its convenience,\nfingerprint authentication through fingerphotos is more vulnerable to\npresentation attacks, which has motivated recent research efforts towards\ndeveloping fingerphoto Presentation Attack Detection (PAD) techniques. However,\nprior PAD approaches utilized supervised learning methods that require labeled\ntraining data for both bona fide and attack samples. This can suffer from two\nkey issues, namely (i) generalization:the detection of novel presentation\nattack instruments (PAIs) unseen in the training data, and (ii) scalability:the\ncollection of a large dataset of attack samples using different PAIs. To\naddress these challenges, we propose a novel unsupervised approach based on a\nstate-of-the-art deep-learning-based diffusion model, the Denoising Diffusion\nProbabilistic Model (DDPM), which is trained solely on bona fide samples. The\nproposed approach detects Presentation Attacks (PA) by calculating the\nreconstruction similarity between the input and output pairs of the DDPM. We\npresent extensive experiments across three PAI datasets to test the accuracy\nand generalization capability of our approach. The results show that the\nproposed DDPM-based PAD method achieves significantly better detection error\nrates on several PAI classes compared to other baseline unsupervised\napproaches.\n","authors":["Hailin Li","Raghavendra Ramachandra","Mohamed Ragab","Soumik Mondal","Yong Kiam Tan","Khin Mi Mi Aung"],"pdf_url":"https://arxiv.org/pdf/2409.18636v1.pdf","comment":"Accepted by IJCB 2024"},{"id":"http://arxiv.org/abs/2409.10357v2","updated":"2024-09-27T10:59:21Z","published":"2024-09-16T15:06:12Z","title":"2D or not 2D: How Does the Dimensionality of Gesture Representation\n Affect 3D Co-Speech Gesture Generation?","summary":" Co-speech gestures are fundamental for communication. The advent of recent\ndeep learning techniques has facilitated the creation of lifelike, synchronous\nco-speech gestures for Embodied Conversational Agents. \"In-the-wild\" datasets,\naggregating video content from platforms like YouTube via human pose detection\ntechnologies, provide a feasible solution by offering 2D skeletal sequences\naligned with speech. Concurrent developments in lifting models enable the\nconversion of these 2D sequences into 3D gesture databases. However, it is\nimportant to note that the 3D poses estimated from the 2D extracted poses are,\nin essence, approximations of the ground-truth, which remains in the 2D domain.\nThis distinction raises questions about the impact of gesture representation\ndimensionality on the quality of generated motions - a topic that, to our\nknowledge, remains largely unexplored. Our study examines the effect of using\neither 2D or 3D joint coordinates as training data on the performance of\nspeech-to-gesture deep generative models. We employ a lifting model for\nconverting generated 2D pose sequences into 3D and assess how gestures created\ndirectly in 3D stack up against those initially generated in 2D and then\nconverted to 3D. We perform an objective evaluation using widely used metrics\nin the gesture generation field as well as a user study to qualitatively\nevaluate the different approaches.\n","authors":["Téo Guichoux","Laure Soulier","Nicolas Obin","Catherine Pelachaud"],"pdf_url":"https://arxiv.org/pdf/2409.10357v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.15111"},{"id":"http://arxiv.org/abs/2409.18628v1","updated":"2024-09-27T10:55:58Z","published":"2024-09-27T10:55:58Z","title":"Towards Integrating Epistemic Uncertainty Estimation into the\n Radiotherapy Workflow","summary":" The precision of contouring target structures and organs-at-risk (OAR) in\nradiotherapy planning is crucial for ensuring treatment efficacy and patient\nsafety. Recent advancements in deep learning (DL) have significantly improved\nOAR contouring performance, yet the reliability of these models, especially in\nthe presence of out-of-distribution (OOD) scenarios, remains a concern in\nclinical settings. This application study explores the integration of epistemic\nuncertainty estimation within the OAR contouring workflow to enable OOD\ndetection in clinically relevant scenarios, using specifically compiled data.\nFurthermore, we introduce an advanced statistical method for OOD detection to\nenhance the methodological framework of uncertainty estimation. Our empirical\nevaluation demonstrates that epistemic uncertainty estimation is effective in\nidentifying instances where model predictions are unreliable and may require an\nexpert review. Notably, our approach achieves an AUC-ROC of 0.95 for OOD\ndetection, with a specificity of 0.95 and a sensitivity of 0.92 for implant\ncases, underscoring its efficacy. This study addresses significant gaps in the\ncurrent research landscape, such as the lack of ground truth for uncertainty\nestimation and limited empirical evaluations. Additionally, it provides a\nclinically relevant application of epistemic uncertainty estimation in an\nFDA-approved and widely used clinical solution for OAR segmentation from\nVarian, a Siemens Healthineers company, highlighting its practical benefits.\n","authors":["Marvin Tom Teichmann","Manasi Datar","Lisa Kratzke","Fernando Vega","Florin C. Ghesu"],"pdf_url":"https://arxiv.org/pdf/2409.18628v1.pdf","comment":"Keywords: Epistemic Uncertainty - Out-of-Distribution Detection - CT\n Segmentation - OAR contouring - Radiotherapy"},{"id":"http://arxiv.org/abs/2409.14149v2","updated":"2024-09-27T10:32:29Z","published":"2024-09-21T13:59:50Z","title":"JVID: Joint Video-Image Diffusion for Visual-Quality and\n Temporal-Consistency in Video Generation","summary":" We introduce the Joint Video-Image Diffusion model (JVID), a novel approach\nto generating high-quality and temporally coherent videos. We achieve this by\nintegrating two diffusion models: a Latent Image Diffusion Model (LIDM) trained\non images and a Latent Video Diffusion Model (LVDM) trained on video data. Our\nmethod combines these models in the reverse diffusion process, where the LIDM\nenhances image quality and the LVDM ensures temporal consistency. This unique\ncombination allows us to effectively handle the complex spatio-temporal\ndynamics in video generation. Our results demonstrate quantitative and\nqualitative improvements in producing realistic and coherent videos.\n","authors":["Hadrien Reynaud","Matthew Baugh","Mischa Dombrowski","Sarah Cechnicka","Qingjie Meng","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2409.14149v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18614v1","updated":"2024-09-27T10:24:19Z","published":"2024-09-27T10:24:19Z","title":"Metasurface-generated large and arbitrary analog convolution kernels for\n accelerated machine vision","summary":" In the rapidly evolving field of artificial intelligence, convolutional\nneural networks are essential for tackling complex challenges such as machine\nvision and medical diagnosis. Recently, to address the challenges in processing\nspeed and power consumption of conventional digital convolution operations,\nmany optical components have been suggested to replace the digital convolution\nlayer in the neural network, accelerating various machine vision tasks.\nNonetheless, the analog nature of the optical convolution kernel has not been\nfully explored. Here, we develop a spatial frequency domain training method to\ncreate arbitrarily shaped analog convolution kernels using an optical\nmetasurface as the convolution layer, with its receptive field largely\nsurpassing digital convolution kernels. By employing spatial multiplexing, the\nmultiple parallel convolution kernels with both positive and negative weights\nare generated under the incoherent illumination condition. We experimentally\ndemonstrate a 98.59% classification accuracy on the MNIST dataset, with\nsimulations showing 92.63% and 68.67% accuracy on the Fashion-MNIST and\nCIFAR-10 datasets with additional digital layers. This work underscores the\nunique advantage of analog optical convolution, offering a promising avenue to\naccelerate machine vision tasks, especially in edge devices.\n","authors":["Ruiqi Liang","Shuai Wang","Yiying Dong","Liu Li","Ying Kuang","Bohan Zhang","Yuanmu Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03882v2","updated":"2024-09-27T10:19:55Z","published":"2024-05-06T21:57:35Z","title":"Trio-ViT: Post-Training Quantization and Acceleration for Softmax-Free\n Efficient Vision Transformer","summary":" Motivated by the huge success of Transformers in the field of natural\nlanguage processing (NLP), Vision Transformers (ViTs) have been rapidly\ndeveloped and achieved remarkable performance in various computer vision tasks.\nHowever, their huge model sizes and intensive computations hinder ViTs'\ndeployment on embedded devices, calling for effective model compression\nmethods, such as quantization. Unfortunately, due to the existence of\nhardware-unfriendly and quantization-sensitive non-linear operations,\nparticularly {Softmax}, it is non-trivial to completely quantize all operations\nin ViTs, yielding either significant accuracy drops or non-negligible hardware\ncosts. In response to challenges associated with \\textit{standard ViTs}, we\nfocus our attention towards the quantization and acceleration for\n\\textit{efficient ViTs}, which not only eliminate the troublesome Softmax but\nalso integrate linear attention with low computational complexity, and propose\nTrio-ViT accordingly. Specifically, at the algorithm level, we develop a\n{tailored post-training quantization engine} taking the unique activation\ndistributions of Softmax-free efficient ViTs into full consideration, aiming to\nboost quantization accuracy. Furthermore, at the hardware level, we build an\naccelerator dedicated to the specific Convolution-Transformer hybrid\narchitecture of efficient ViTs, thereby enhancing hardware efficiency.\nExtensive experimental results consistently prove the effectiveness of our\nTrio-ViT framework. {Particularly, we can gain up to\n$\\uparrow$$\\mathbf{3.6}\\times$, $\\uparrow$$\\mathbf{5.0}\\times$, and\n$\\uparrow$$\\mathbf{7.3}\\times$ FPS under comparable accuracy over\nstate-of-the-art ViT accelerators, as well as $\\uparrow$$\\mathbf{6.0}\\times$,\n$\\uparrow$$\\mathbf{1.5}\\times$, and $\\uparrow$$\\mathbf{2.1}\\times$ DSP\nefficiency.} Codes are available at\n\\url{https://github.com/shihuihong214/Trio-ViT}.\n","authors":["Huihong Shi","Haikuo Shao","Wendong Mao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03882v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08843v4","updated":"2024-09-27T09:56:47Z","published":"2023-11-15T10:33:20Z","title":"Personalized Video Relighting With an At-Home Light Stage","summary":" In this paper, we develop a personalized video relighting algorithm that\nproduces high-quality and temporally consistent relit videos under any pose,\nexpression, and lighting condition in real-time. Existing relighting algorithms\ntypically rely either on publicly available synthetic data, which yields poor\nrelighting results, or on actual light stage data which is difficult to\nacquire. We show that by just capturing recordings of a user watching YouTube\nvideos on a monitor we can train a personalized algorithm capable of performing\nhigh-quality relighting under any condition. Our key contribution is a novel\nimage-based neural relighting architecture that effectively separates the\nintrinsic appearance features - the geometry and reflectance of the face - from\nthe source lighting and then combines them with the target lighting to generate\na relit image. This neural architecture enables smoothing of intrinsic\nappearance features leading to temporally stable video relighting. Both\nqualitative and quantitative evaluations show that our architecture improves\nportrait image relighting quality and temporal consistency over\nstate-of-the-art approaches on both casually captured `Light Stage at Your\nDesk' (LSYD) and light-stage-captured `One Light At a Time' (OLAT) datasets.\n","authors":["Jun Myeong Choi","Max Christman","Roni Sengupta"],"pdf_url":"https://arxiv.org/pdf/2311.08843v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18592v1","updated":"2024-09-27T09:51:45Z","published":"2024-09-27T09:51:45Z","title":"From One to the Power of Many: Augmentations for Invariance to\n Multi-LiDAR Perception from Single-Sensor Datasets","summary":" Recently, LiDAR perception methods for autonomous vehicles, powered by deep\nneural networks have experienced steep growth in performance on classic\nbenchmarks, such as nuScenes and SemanticKITTI. However, there are still large\ngaps in performance when deploying models trained on such single-sensor setups\nto modern multi-sensor vehicles. In this work, we investigate if a lack of\ninvariance may be responsible for these performance gaps, and propose some\ninitial solutions in the form of application-specific data augmentations, which\ncan facilitate better transfer to multi-sensor LiDAR setups. We provide\nexperimental evidence that our proposed augmentations improve generalization\nacross LiDAR sensor setups, and investigate how these augmentations affect the\nmodels' invariance properties on simulations of different LiDAR sensor setups.\n","authors":["Marc Uecker","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2409.18592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18591v1","updated":"2024-09-27T09:51:25Z","published":"2024-09-27T09:51:25Z","title":"Off to new Shores: A Dataset & Benchmark for (near-)coastal Flood\n Inundation Forecasting","summary":" Floods are among the most common and devastating natural hazards, imposing\nimmense costs on our society and economy due to their disastrous consequences.\nRecent progress in weather prediction and spaceborne flood mapping demonstrated\nthe feasibility of anticipating extreme events and reliably detecting their\ncatastrophic effects afterwards. However, these efforts are rarely linked to\none another and there is a critical lack of datasets and benchmarks to enable\nthe direct forecasting of flood extent. To resolve this issue, we curate a\nnovel dataset enabling a timely prediction of flood extent. Furthermore, we\nprovide a representative evaluation of state-of-the-art methods, structured\ninto two benchmark tracks for forecasting flood inundation maps i) in general\nand ii) focused on coastal regions. Altogether, our dataset and benchmark\nprovide a comprehensive platform for evaluating flood forecasts, enabling\nfuture solutions for this critical challenge. Data, code & models are shared at\nhttps://github.com/Multihuntr/GFF under a CC0 license.\n","authors":["Brandon Victor","Mathilde Letard","Peter Naylor","Karim Douch","Nicolas Longépé","Zhen He","Patrick Ebel"],"pdf_url":"https://arxiv.org/pdf/2409.18591v1.pdf","comment":"Accepted at NeurIPS 2024 Datasets & Benchmarks"},{"id":"http://arxiv.org/abs/2407.20623v2","updated":"2024-09-27T09:35:05Z","published":"2024-07-30T07:59:28Z","title":"SharkTrack: an accurate, generalisable software for streamlining shark\n and ray underwater video analysis","summary":" Elasmobranchs (shark sand rays) represent a critical component of marine\necosystems. Yet, they are experiencing global population declines and effective\nmonitoring of populations is essential to their protection. Underwater\nstationary videos, such as those from Baited Remote Underwater Video Stations\n(BRUVS), are critical for understanding elasmobranch spatial ecology and\nabundance. However, processing these videos requires time-consuming manual\nanalysis that can delay conservation. To address this challenge, we developed\nSharkTrack, a semi-automatic underwater video analysis software. SharkTrack\nuses Convolutional Neural Networks (CNN) and Multi-Object Tracking to\nautomatically detect and track elasmobranchs and provides an annotation\npipeline to manually classify elasmobranch species and compute species-specific\nMaxN (ssMaxN), the standard metric of relative abundance. When tested on BRUVS\nfootage from locations unseen by the CNN model during training, SharkTrack\ncomputed ssMaxN with 89% accuracy over 207 hours of footage. The semi-automatic\nSharkTrack pipeline required two minutes of manual classification per hour of\nvideo, an estimated 95% reduction of manual analysis time compared to\ntraditional methods. Furthermore, we demonstrate SharkTrack accuracy across\ndiverse marine ecosystems and elasmobranch species, an advancement compared to\nprevious models, which were limited to specific species or locations.\nSharkTrack applications extend beyond BRUVS, facilitating the analysis of any\nunderwater stationary video. By making video analysis faster and more\naccessible, SharkTrack enables research and conservation organisations to\nmonitor elasmobranch populations more efficiently, thereby improving\nconservation efforts. To further support these goals, we provide public access\nto the SharkTrack software.\n","authors":["Filippo Varini","Joel H. Gayford","Jeremy Jenrette","Matthew J. Witt","Francesco Garzon","Francesco Ferretti","Sophie Wilday","Mark E. Bond","Michael R. Heithaus","Danielle Robinson","Devon Carter","Najee Gumbs","Vincent Webster","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2407.20623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02332v2","updated":"2024-09-27T09:21:03Z","published":"2024-04-26T06:22:43Z","title":"Efficient Exploration of Image Classifier Failures with Bayesian\n Optimization and Text-to-Image Models","summary":" Image classifiers should be used with caution in the real world. Performance\nevaluated on a validation set may not reflect performance in the real world. In\nparticular, classifiers may perform well for conditions that are frequently\nencountered during training, but poorly for other infrequent conditions. In\nthis study, we hypothesize that recent advances in text-to-image generative\nmodels make them valuable for benchmarking computer vision models such as image\nclassifiers: they can generate images conditioned by textual prompts that cause\nclassifier failures, allowing failure conditions to be described with textual\nattributes. However, their generation cost becomes an issue when a large number\nof synthetic images need to be generated, which is the case when many different\nattribute combinations need to be tested. We propose an image classifier\nbenchmarking method as an iterative process that alternates image generation,\nclassifier evaluation, and attribute selection. This method efficiently\nexplores the attributes that ultimately lead to poor behavior detection.\n","authors":["Adrien LeCoz","Houssem Ouertatani","Stéphane Herbin","Faouzi Adjed"],"pdf_url":"https://arxiv.org/pdf/2405.02332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18569v1","updated":"2024-09-27T09:12:33Z","published":"2024-09-27T09:12:33Z","title":"Cross-video Identity Correlating for Person Re-identification\n Pre-training","summary":" Recent researches have proven that pre-training on large-scale person images\nextracted from internet videos is an effective way in learning better\nrepresentations for person re-identification. However, these researches are\nmostly confined to pre-training at the instance-level or single-video\ntracklet-level. They ignore the identity-invariance in images of the same\nperson across different videos, which is a key focus in person\nre-identification. To address this issue, we propose a Cross-video\nIdentity-cOrrelating pre-traiNing (CION) framework. Defining a noise concept\nthat comprehensively considers both intra-identity consistency and\ninter-identity discrimination, CION seeks the identity correlation from\ncross-video images by modeling it as a progressive multi-level denoising\nproblem. Furthermore, an identity-guided self-distillation loss is proposed to\nimplement better large-scale pre-training by mining the identity-invariance\nwithin person images. We conduct extensive experiments to verify the\nsuperiority of our CION in terms of efficiency and performance. CION achieves\nsignificantly leading performance with even fewer training samples. For\nexample, compared with the previous state-of-the-art~\\cite{ISR}, CION with the\nsame ResNet50-IBN achieves higher mAP of 93.3\\% and 74.3\\% on Market1501 and\nMSMT17, while only utilizing 8\\% training samples. Finally, with CION\ndemonstrating superior model-agnostic ability, we contribute a model zoo named\nReIDZoo to meet diverse research and application needs in this field. It\ncontains a series of CION pre-trained models with spanning structures and\nparameters, totaling 32 models with 10 different structures, including\nGhostNet, ConvNext, RepViT, FastViT and so on. The code and models will be made\npublicly available at https://github.com/Zplusdragon/CION_ReIDZoo.\n","authors":["Jialong Zuo","Ying Nie","Hanyu Zhou","Huaxin Zhang","Haoyu Wang","Tianyu Guo","Nong Sang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2409.18569v1.pdf","comment":"NeurIPS 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2409.18565v1","updated":"2024-09-27T09:09:45Z","published":"2024-09-27T09:09:45Z","title":"Harmonizing knowledge Transfer in Neural Network with Unified\n Distillation","summary":" Knowledge distillation (KD), known for its ability to transfer knowledge from\na cumbersome network (teacher) to a lightweight one (student) without altering\nthe architecture, has been garnering increasing attention. Two primary\ncategories emerge within KD methods: feature-based, focusing on intermediate\nlayers' features, and logits-based, targeting the final layer's logits. This\npaper introduces a novel perspective by leveraging diverse knowledge sources\nwithin a unified KD framework. Specifically, we aggregate features from\nintermediate layers into a comprehensive representation, effectively gathering\nsemantic information from different stages and scales. Subsequently, we predict\nthe distribution parameters from this representation. These steps transform\nknowledge from the intermediate layers into corresponding distributive forms,\nthereby allowing for knowledge distillation through a unified distribution\nconstraint at different stages of the network, ensuring the comprehensiveness\nand coherence of knowledge transfer. Numerous experiments were conducted to\nvalidate the effectiveness of the proposed method.\n","authors":["Yaomin Huang","Zaomin Yan","Chaomin Shen","Faming Fang","Guixu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18561v1","updated":"2024-09-27T09:01:08Z","published":"2024-09-27T09:01:08Z","title":"AL-GTD: Deep Active Learning for Gaze Target Detection","summary":" Gaze target detection aims at determining the image location where a person\nis looking. While existing studies have made significant progress in this area\nby regressing accurate gaze heatmaps, these achievements have largely relied on\naccess to extensive labeled datasets, which demands substantial human labor. In\nthis paper, our goal is to reduce the reliance on the size of labeled training\ndata for gaze target detection. To achieve this, we propose AL-GTD, an\ninnovative approach that integrates supervised and self-supervised losses\nwithin a novel sample acquisition function to perform active learning (AL).\nAdditionally, it utilizes pseudo-labeling to mitigate distribution shifts\nduring the training phase. AL-GTD achieves the best of all AUC results by\nutilizing only 40-50% of the training data, in contrast to state-of-the-art\n(SOTA) gaze target detectors requiring the entire training dataset to achieve\nthe same performance. Importantly, AL-GTD quickly reaches satisfactory\nperformance with 10-20% of the training data, showing the effectiveness of our\nacquisition function, which is able to acquire the most informative samples. We\nprovide a comprehensive experimental analysis by adapting several AL methods\nfor the task. AL-GTD outperforms AL competitors, simultaneously exhibiting\nsuperior performance compared to SOTA gaze target detectors when all are\ntrained within a low-data regime. Code is available at\nhttps://github.com/francescotonini/al-gtd.\n","authors":["Francesco Tonini","Nicola Dall'Asen","Lorenzo Vaquero","Cigdem Beyan","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2409.18561v1.pdf","comment":"Accepted to ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2409.18556v1","updated":"2024-09-27T08:53:17Z","published":"2024-09-27T08:53:17Z","title":"CodeSCAN: ScreenCast ANalysis for Video Programming Tutorials","summary":" Programming tutorials in the form of coding screencasts play a crucial role\nin programming education, serving both novices and experienced developers.\nHowever, the video format of these tutorials presents a challenge due to the\ndifficulty of searching for and within videos. Addressing the absence of\nlarge-scale and diverse datasets for screencast analysis, we introduce the\nCodeSCAN dataset. It comprises 12,000 screenshots captured from the Visual\nStudio Code environment during development, featuring 24 programming languages,\n25 fonts, and over 90 distinct themes, in addition to diverse layout changes\nand realistic user interactions. Moreover, we conduct detailed quantitative and\nqualitative evaluations to benchmark the performance of Integrated Development\nEnvironment (IDE) element detection, color-to-black-and-white conversion, and\nOptical Character Recognition (OCR). We hope that our contributions facilitate\nmore research in coding screencast analysis, and we make the source code for\ncreating the dataset and the benchmark publicly available on this website.\n","authors":["Alexander Naumann","Felix Hertlein","Jacqueline Höllig","Lucas Cazzonelli","Steffen Thoma"],"pdf_url":"https://arxiv.org/pdf/2409.18556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03094v4","updated":"2024-09-27T08:50:10Z","published":"2024-02-05T15:25:32Z","title":"Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object\n Detector","summary":" This paper studies the challenging cross-domain few-shot object detection\n(CD-FSOD), aiming to develop an accurate object detector for novel domains with\nminimal labeled examples. While transformer-based open-set detectors, such as\nDE-ViT, show promise in traditional few-shot object detection, their\ngeneralization to CD-FSOD remains unclear: 1) can such open-set detection\nmethods easily generalize to CD-FSOD? 2) If not, how can models be enhanced\nwhen facing huge domain gaps? To answer the first question, we employ measures\nincluding style, inter-class variance (ICV), and indefinable boundaries (IB) to\nunderstand the domain gap. Based on these measures, we establish a new\nbenchmark named CD-FSOD to evaluate object detection methods, revealing that\nmost of the current approaches fail to generalize across domains. Technically,\nwe observe that the performance decline is associated with our proposed\nmeasures: style, ICV, and IB. Consequently, we propose several novel modules to\naddress these issues. First, the learnable instance features align initial\nfixed instances with target categories, enhancing feature distinctiveness.\nSecond, the instance reweighting module assigns higher importance to\nhigh-quality instances with slight IB. Third, the domain prompter encourages\nfeatures resilient to different styles by synthesizing imaginary domains\nwithout altering semantic contents. These techniques collectively contribute to\nthe development of the Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO),\nsignificantly improving upon the base DE-ViT. Experimental results validate the\nefficacy of our model.\n","authors":["Yuqian Fu","Yu Wang","Yixuan Pan","Lian Huai","Xingyu Qiu","Zeyu Shangguan","Tong Liu","Yanwei Fu","Luc Van Gool","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.03094v4.pdf","comment":"Accepted by ECCV2024 (project website:\n http://yuqianfu.com/CDFSOD-benchmark)"},{"id":"http://arxiv.org/abs/2409.18553v1","updated":"2024-09-27T08:45:55Z","published":"2024-09-27T08:45:55Z","title":"Efficient Noise Mitigation for Enhancing Inference Accuracy in DNNs on\n Mixed-Signal Accelerators","summary":" In this paper, we propose a framework to enhance the robustness of the neural\nmodels by mitigating the effects of process-induced and aging-related\nvariations of analog computing components on the accuracy of the analog neural\nnetworks. We model these variations as the noise affecting the precision of the\nactivations and introduce a denoising block inserted between selected layers of\na pre-trained model. We demonstrate that training the denoising block\nsignificantly increases the model's robustness against various noise levels. To\nminimize the overhead associated with adding these blocks, we present an\nexploration algorithm to identify optimal insertion points for the denoising\nblocks. Additionally, we propose a specialized architecture to efficiently\nexecute the denoising blocks, which can be integrated into mixed-signal\naccelerators. We evaluate the effectiveness of our approach using Deep Neural\nNetwork (DNN) models trained on the ImageNet and CIFAR-10 datasets. The results\nshow that on average, by accepting 2.03% parameter count overhead, the accuracy\ndrop due to the variations reduces from 31.7% to 1.15%.\n","authors":["Seyedarmin Azizi","Mohammad Erfan Sadeghi","Mehdi Kamal","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2409.18553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15564v2","updated":"2024-09-27T08:40:26Z","published":"2024-09-23T21:38:49Z","title":"CauSkelNet: Causal Representation Learning for Human Behaviour Analysis","summary":" Constrained by the lack of model interpretability and a deep understanding of\nhuman movement in traditional movement recognition machine learning methods,\nthis study introduces a novel representation learning method based on causal\ninference to better understand human joint dynamics and complex behaviors. We\npropose a two-stage framework that combines the Peter-Clark (PC) algorithm and\nKullback-Leibler (KL) divergence to identify and quantify causal relationships\nbetween joints. Our method effectively captures interactions and produces\ninterpretable, robust representations. Experiments on the EmoPain dataset show\nthat our causal GCN outperforms traditional GCNs in accuracy, F1 score, and\nrecall, especially in detecting protective behaviors. The model is also highly\ninvariant to data scale changes, enhancing its reliability in practical\napplications. Our approach advances human motion analysis and paves the way for\nmore adaptive intelligent healthcare solutions.\n","authors":["Xingrui Gu","Chuyi Jiang","Erte Wang","Zekun Wu","Qiang Cui","Leimin Tian","Lianlong Wu","Siyang Song","Chuang Yu"],"pdf_url":"https://arxiv.org/pdf/2409.15564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18543v1","updated":"2024-09-27T08:25:03Z","published":"2024-09-27T08:25:03Z","title":"Reducing Semantic Ambiguity In Domain Adaptive Semantic Segmentation Via\n Probabilistic Prototypical Pixel Contrast","summary":" Domain adaptation aims to reduce the model degradation on the target domain\ncaused by the domain shift between the source and target domains. Although\nencouraging performance has been achieved by combining cognitive learning with\nthe self-training paradigm, they suffer from ambiguous scenarios caused by\nscale, illumination, or overlapping when deploying deterministic embedding. To\naddress these issues, we propose probabilistic proto-typical pixel contrast\n(PPPC), a universal adaptation framework that models each pixel embedding as a\nprobability via multivariate Gaussian distribution to fully exploit the\nuncertainty within them, eventually improving the representation quality of the\nmodel. In addition, we derive prototypes from probability estimation posterior\nprobability estimation which helps to push the decision boundary away from the\nambiguity points. Moreover, we employ an efficient method to compute similarity\nbetween distributions, eliminating the need for sampling and\nreparameterization, thereby significantly reducing computational overhead.\nFurther, we dynamically select the ambiguous crops at the image level to\nenlarge the number of boundary points involved in contrastive learning, which\nbenefits the establishment of precise distributions for each category.\nExtensive experimentation demonstrates that PPPC not only helps to address\nambiguity at the pixel level, yielding discriminative representations but also\nachieves significant improvements in both synthetic-to-real and day-to-night\nadaptation tasks. It surpasses the previous state-of-the-art (SOTA) by +5.2%\nmIoU in the most challenging daytime-to-nighttime adaptation scenario,\nexhibiting stronger generalization on other unseen datasets. The code and\nmodels are available at\nhttps://github.com/DarlingInTheSV/Probabilistic-Prototypical-Pixel-Contrast.\n","authors":["Xiaoke Hao","Shiyu Liu","Chuanbo Feng","Ye Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.18543v1.pdf","comment":"revise"},{"id":"http://arxiv.org/abs/2409.18536v1","updated":"2024-09-27T08:15:14Z","published":"2024-09-27T08:15:14Z","title":"How Effective is Pre-training of Large Masked Autoencoders for\n Downstream Earth Observation Tasks?","summary":" Self-supervised pre-training has proven highly effective for many computer\nvision tasks, particularly when labelled data are scarce. In the context of\nEarth Observation (EO), foundation models and various other Vision Transformer\n(ViT)-based approaches have been successfully applied for transfer learning to\ndownstream tasks. However, it remains unclear under which conditions\npre-trained models offer significant advantages over training from scratch. In\nthis study, we investigate the effectiveness of pre-training ViT-based Masked\nAutoencoders (MAE) for downstream EO tasks, focusing on reconstruction,\nsegmentation, and classification. We consider two large ViT-based MAE\npre-trained models: a foundation model (Prithvi) and SatMAE. We evaluate\nPrithvi on reconstruction and segmentation-based downstream tasks, and for\nSatMAE we assess its performance on a classification downstream task. Our\nfindings suggest that pre-training is particularly beneficial when the\nfine-tuning task closely resembles the pre-training task, e.g. reconstruction.\nIn contrast, for tasks such as segmentation or classification, training from\nscratch with specific hyperparameter adjustments proved to be equally or more\neffective.\n","authors":["Jose Sosa","Mohamed Aloulou","Danila Rukhovich","Rim Sleimi","Boonyarit Changaival","Anis Kacem","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2409.18536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18533v1","updated":"2024-09-27T08:12:28Z","published":"2024-09-27T08:12:28Z","title":"Prompt-Driven Temporal Domain Adaptation for Nighttime UAV Tracking","summary":" Nighttime UAV tracking under low-illuminated scenarios has achieved great\nprogress by domain adaptation (DA). However, previous DA training-based works\nare deficient in narrowing the discrepancy of temporal contexts for UAV\ntrackers. To address the issue, this work proposes a prompt-driven temporal\ndomain adaptation training framework to fully utilize temporal contexts for\nchallenging nighttime UAV tracking, i.e., TDA. Specifically, the proposed\nframework aligns the distribution of temporal contexts from daytime and\nnighttime domains by training the temporal feature generator against the\ndiscriminator. The temporal-consistent discriminator progressively extracts\nshared domain-specific features to generate coherent domain discrimination\nresults in the time series. Additionally, to obtain high-quality training\nsamples, a prompt-driven object miner is employed to precisely locate objects\nin unannotated nighttime videos. Moreover, a new benchmark for long-term\nnighttime UAV tracking is constructed. Exhaustive evaluations on both public\nand self-constructed nighttime benchmarks demonstrate the remarkable\nperformance of the tracker trained in TDA framework, i.e., TDA-Track.\nReal-world tests at nighttime also show its practicality. The code and demo\nvideos are available at https://github.com/vision4robotics/TDA-Track.\n","authors":["Changhong Fu","Yiheng Wang","Liangliang Yao","Guangze Zheng","Haobo Zuo","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2409.18533v1.pdf","comment":"Accepted by IROS2024"},{"id":"http://arxiv.org/abs/2409.18523v1","updated":"2024-09-27T08:05:34Z","published":"2024-09-27T08:05:34Z","title":"Token Caching for Diffusion Transformer Acceleration","summary":" Diffusion transformers have gained substantial interest in diffusion\ngenerative modeling due to their outstanding performance. However, their high\ncomputational cost, arising from the quadratic computational complexity of\nattention mechanisms and multi-step inference, presents a significant\nbottleneck. To address this challenge, we propose TokenCache, a novel\npost-training acceleration method that leverages the token-based multi-block\narchitecture of transformers to reduce redundant computations among tokens\nacross inference steps. TokenCache specifically addresses three critical\nquestions in the context of diffusion transformers: (1) which tokens should be\npruned to eliminate redundancy, (2) which blocks should be targeted for\nefficient pruning, and (3) at which time steps caching should be applied to\nbalance speed and quality. In response to these challenges, TokenCache\nintroduces a Cache Predictor that assigns importance scores to tokens, enabling\nselective pruning without compromising model performance. Furthermore, we\npropose an adaptive block selection strategy to focus on blocks with minimal\nimpact on the network's output, along with a Two-Phase Round-Robin (TPRR)\nscheduling policy to optimize caching intervals throughout the denoising\nprocess. Experimental results across various models demonstrate that TokenCache\nachieves an effective trade-off between generation quality and inference speed\nfor diffusion transformers. Our code will be publicly available.\n","authors":["Jinming Lou","Wenyang Luo","Yufan Liu","Bing Li","Xinmiao Ding","Weiming Hu","Jiajiong Cao","Yuming Li","Chenguang Ma"],"pdf_url":"https://arxiv.org/pdf/2409.18523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13607v3","updated":"2024-09-27T08:00:12Z","published":"2024-06-19T14:58:49Z","title":"Ultra-High-Definition Image Restoration: New Benchmarks and A Dual\n Interaction Prior-Driven Solution","summary":" Ultra-High-Definition (UHD) image restoration has acquired remarkable\nattention due to its practical demand. In this paper, we construct UHD snow and\nrain benchmarks, named UHD-Snow and UHD-Rain, to remedy the deficiency in this\nfield. The UHD-Snow/UHD-Rain is established by simulating the physics process\nof rain/snow into consideration and each benchmark contains 3200 degraded/clear\nimage pairs of 4K resolution. Furthermore, we propose an effective UHD image\nrestoration solution by considering gradient and normal priors in model design\nthanks to these priors' spatial and detail contributions. Specifically, our\nmethod contains two branches: (a) feature fusion and reconstruction branch in\nhigh-resolution space and (b) prior feature interaction branch in\nlow-resolution space. The former learns high-resolution features and fuses\nprior-guided low-resolution features to reconstruct clear images, while the\nlatter utilizes normal and gradient priors to mine useful spatial features and\ndetail features to guide high-resolution recovery better. To better utilize\nthese priors, we introduce single prior feature interaction and dual prior\nfeature interaction, where the former respectively fuses normal and gradient\npriors with high-resolution features to enhance prior ones, while the latter\ncalculates the similarity between enhanced prior ones and further exploits dual\nguided filtering to boost the feature interaction of dual priors. We conduct\nexperiments on both new and existing public datasets and demonstrate the\nstate-of-the-art performance of our method on UHD image low-light enhancement,\ndehazing, deblurring, desonwing, and deraining. The source codes and benchmarks\nare available at \\url{https://github.com/wlydlut/UHDDIP}.\n","authors":["Liyan Wang","Cong Wang","Jinshan Pan","Xiaofeng Liu","Weixiang Zhou","Xiaoran Sun","Wei Wang","Zhixun Su"],"pdf_url":"https://arxiv.org/pdf/2406.13607v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18506v1","updated":"2024-09-27T07:44:07Z","published":"2024-09-27T07:44:07Z","title":"Med-IC: Fusing a Single Layer Involution with Convolutions for Enhanced\n Medical Image Classification and Segmentation","summary":" The majority of medical images, especially those that resemble cells, have\nsimilar characteristics. These images, which occur in a variety of shapes,\noften show abnormalities in the organ or cell region. The convolution operation\npossesses a restricted capability to extract visual patterns across several\nspatial regions of an image. The involution process, which is the inverse\noperation of convolution, complements this inherent lack of spatial information\nextraction present in convolutions. In this study, we investigate how applying\na single layer of involution prior to a convolutional neural network (CNN)\narchitecture can significantly improve classification and segmentation\nperformance, with a comparatively negligible amount of weight parameters. The\nstudy additionally shows how excessive use of involution layers might result in\ninaccurate predictions in a particular type of medical image. According to our\nfindings from experiments, the strategy of adding only a single involution\nlayer before a CNN-based model outperforms most of the previous works.\n","authors":["Md. Farhadul Islam","Sarah Zabeen","Meem Arafat Manab","Mohammad Rakibul Hasan Mahin","Joyanta Jyoti Mondal","Md. Tanzim Reza","Md Zahidul Hasan","Munima Haque","Farig Sadeque","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2409.18506v1.pdf","comment":"13 pages, 5 figures, 4 tables, preprint submitted to an Elsevier\n journal"},{"id":"http://arxiv.org/abs/2409.18497v1","updated":"2024-09-27T07:30:12Z","published":"2024-09-27T07:30:12Z","title":"Neural Video Representation for Redundancy Reduction and Consistency\n Preservation","summary":" Implicit neural representations (INRs) embed various signals into networks.\nThey have gained attention in recent years because of their versatility in\nhandling diverse signal types. For videos, INRs achieve video compression by\nembedding video signals into networks and compressing them. Conventional\nmethods use an index that expresses the time of the frame or the features\nextracted from the frame as inputs to the network. The latter method provides\ngreater expressive capability as the input is specific to each video. However,\nthe features extracted from frames often contain redundancy, which contradicts\nthe purpose of video compression. Moreover, since frame time information is not\nexplicitly provided to the network, learning the relationships between frames\nis challenging. To address these issues, we aim to reduce feature redundancy by\nextracting features based on the high-frequency components of the frames. In\naddition, we use feature differences between adjacent frames in order for the\nnetwork to learn frame relationships smoothly. We propose a video\nrepresentation method that uses the high-frequency components of frames and the\ndifferences in features between adjacent frames. The experimental results show\nthat our method outperforms the existing HNeRV method in 90 percent of the\nvideos.\n","authors":["Taiga Hayami","Takahiro Shindo","Shunsuke Akamatsu","Hiroshi Watanabe"],"pdf_url":"https://arxiv.org/pdf/2409.18497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15256v4","updated":"2024-09-27T07:16:23Z","published":"2024-04-23T17:42:45Z","title":"TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and\n Proprioception Estimation","summary":" Legged navigation is typically examined within open-world, off-road, and\nchallenging environments. In these scenarios, estimating external disturbances\nrequires a complex synthesis of multi-modal information. This underlines a\nmajor limitation in existing works that primarily focus on avoiding obstacles.\nIn this work, we propose TOP-Nav, a novel legged navigation framework that\nintegrates a comprehensive path planner with Terrain awareness, Obstacle\navoidance and close-loop Proprioception. TOP-Nav underscores the synergies\nbetween vision and proprioception in both path and motion planning. Within the\npath planner, we present and integrate a terrain estimator that enables the\nrobot to select waypoints on terrains with higher traversability while\neffectively avoiding obstacles. In the motion planning level, we not only\nimplement a locomotion controller to track the navigation commands, but also\nconstruct a proprioception advisor to provide motion evaluations for the path\nplanner. Based on the close-loop motion feedback, we make online corrections\nfor the vision-based terrain and obstacle estimations. Consequently, TOP-Nav\nachieves open-world navigation that the robot can handle terrains or\ndisturbances beyond the distribution of prior knowledge and overcomes\nconstraints imposed by visual conditions. Building upon extensive experiments\nconducted in both simulation and real-world environments, TOP-Nav demonstrates\nsuperior performance in open-world navigation compared to existing methods.\n","authors":["Junli Ren","Yikai Liu","Yingru Dai","Junfeng Long","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.15256v4.pdf","comment":"Published on CoRL 2024"},{"id":"http://arxiv.org/abs/2409.05088v2","updated":"2024-09-27T06:49:39Z","published":"2024-09-08T13:14:03Z","title":"Transformer with Leveraged Masked Autoencoder for video-based Pain\n Assessment","summary":" Accurate pain assessment is crucial in healthcare for effective diagnosis and\ntreatment; however, traditional methods relying on self-reporting are\ninadequate for populations unable to communicate their pain. Cutting-edge AI is\npromising for supporting clinicians in pain recognition using facial video\ndata. In this paper, we enhance pain recognition by employing facial video\nanalysis within a Transformer-based deep learning model. By combining a\npowerful Masked Autoencoder with a Transformers-based classifier, our model\neffectively captures pain level indicators through both expressions and\nmicro-expressions. We conducted our experiment on the AI4Pain dataset, which\nproduced promising results that pave the way for innovative healthcare\nsolutions that are both comprehensive and objective.\n","authors":["Minh-Duc Nguyen","Hyung-Jeong Yang","Soo-Hyung Kim","Ji-Eun Shin","Seung-Won Kim"],"pdf_url":"https://arxiv.org/pdf/2409.05088v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18478v1","updated":"2024-09-27T06:37:47Z","published":"2024-09-27T06:37:47Z","title":"Temporal2Seq: A Unified Framework for Temporal Video Understanding Tasks","summary":" With the development of video understanding, there is a proliferation of\ntasks for clip-level temporal video analysis, including temporal action\ndetection (TAD), temporal action segmentation (TAS), and generic event boundary\ndetection (GEBD). While task-specific video understanding models have exhibited\noutstanding performance in each task, there remains a dearth of a unified\nframework capable of simultaneously addressing multiple tasks, which is a\npromising direction for the next generation of AI. To this end, in this paper,\nwe propose a single unified framework, coined as Temporal2Seq, to formulate the\noutput of these temporal video understanding tasks as a sequence of discrete\ntokens. With this unified token representation, Temporal2Seq can train a\ngeneralist model within a single architecture on different video understanding\ntasks. In the absence of multi-task learning (MTL) benchmarks, we compile a\ncomprehensive co-training dataset by borrowing the datasets from TAD, TAS, and\nGEBD tasks. We evaluate our Temporal2Seq generalist model on the corresponding\ntest sets of three tasks, demonstrating that Temporal2Seq can produce\nreasonable results on various tasks and achieve advantages compared with\nsingle-task training on this framework. We also investigate the generalization\nperformance of our generalist model on new datasets from different tasks, which\nyields superior performance to the specific model.\n","authors":["Min Yang","Zichen Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14206v2","updated":"2024-09-27T06:37:00Z","published":"2024-08-26T12:09:38Z","title":"Lemon and Orange Disease Classification using CNN-Extracted Features and\n Machine Learning Classifier","summary":" Lemons and oranges, both are the most economically significant citrus fruits\nglobally. The production of lemons and oranges is severely affected due to\ndiseases in its growth stages. Fruit quality has degraded due to the presence\nof flaws. Thus, it is necessary to diagnose the disease accurately so that we\ncan avoid major loss of lemons and oranges. To improve citrus farming, we\nproposed a disease classification approach for lemons and oranges. This\napproach would enable early disease detection and intervention, reduce yield\nlosses, and optimize resource allocation. For the initial modeling of disease\nclassification, the research uses innovative deep learning architectures such\nas VGG16, VGG19 and ResNet50. In addition, for achieving better accuracy, the\nbasic machine learning algorithms used for classification problems include\nRandom Forest, Naive Bayes, K-Nearest Neighbors (KNN) and Logistic Regression.\nThe lemon and orange fruits diseases are classified more accurately (95.0% for\nlemon and 99.69% for orange) by the model. The model's base features were\nextracted from the ResNet50 pre-trained model and the diseases are classified\nby the Logistic Regression which beats the performance given by VGG16 and VGG19\nfor other classifiers. Experimental outcomes show that the proposed model also\noutperforms existing models in which most of them classified the diseases using\nthe Softmax classifier without using any individual classifiers.\n","authors":["Khandoker Nosiba Arifin","Sayma Akter Rupa","Md Musfique Anwar","Israt Jahan"],"pdf_url":"https://arxiv.org/pdf/2408.14206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18476v1","updated":"2024-09-27T06:33:35Z","published":"2024-09-27T06:33:35Z","title":"Underwater Image Enhancement with Physical-based Denoising Diffusion\n Implicit Models","summary":" Underwater vision is crucial for autonomous underwater vehicles (AUVs), and\nenhancing degraded underwater images in real-time on a resource-constrained AUV\nis a key challenge due to factors like light absorption and scattering, or the\nsufficient model computational complexity to resolve such factors. Traditional\nimage enhancement techniques lack adaptability to varying underwater\nconditions, while learning-based methods, particularly those using\nconvolutional neural networks (CNNs) and generative adversarial networks\n(GANs), offer more robust solutions but face limitations such as inadequate\nenhancement, unstable training, or mode collapse. Denoising diffusion\nprobabilistic models (DDPMs) have emerged as a state-of-the-art approach in\nimage-to-image tasks but require intensive computational complexity to achieve\nthe desired underwater image enhancement (UIE) using the recent UW-DDPM\nsolution. To address these challenges, this paper introduces UW-DiffPhys, a\nnovel physical-based and diffusion-based UIE approach. UW-DiffPhys combines\nlight-computation physical-based UIE network components with a denoising U-Net\nto replace the computationally intensive distribution transformation U-Net in\nthe existing UW-DDPM framework, reducing complexity while maintaining\nperformance. Additionally, the Denoising Diffusion Implicit Model (DDIM) is\nemployed to accelerate the inference process through non-Markovian sampling.\nExperimental results demonstrate that UW-DiffPhys achieved a substantial\nreduction in computational complexity and inference time compared to UW-DDPM,\nwith competitive performance in key metrics such as PSNR, SSIM, UCIQE, and an\nimprovement in the overall underwater image quality UIQM metric. The\nimplementation code can be found at the following repository:\nhttps://github.com/bachzz/UW-DiffPhys\n","authors":["Nguyen Gia Bach","Chanh Minh Tran","Eiji Kamioka","Phan Xuan Tan"],"pdf_url":"https://arxiv.org/pdf/2409.18476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15898v3","updated":"2024-09-27T06:26:36Z","published":"2024-09-24T09:17:08Z","title":"FedRepOpt: Gradient Re-parametrized Optimizers in Federated Learning","summary":" Federated Learning (FL) has emerged as a privacy-preserving method for\ntraining machine learning models in a distributed manner on edge devices.\nHowever, on-device models face inherent computational power and memory\nlimitations, potentially resulting in constrained gradient updates. As the\nmodel's size increases, the frequency of gradient updates on edge devices\ndecreases, ultimately leading to suboptimal training outcomes during any\nparticular FL round. This limits the feasibility of deploying advanced and\nlarge-scale models on edge devices, hindering the potential for performance\nenhancements. To address this issue, we propose FedRepOpt, a gradient\nre-parameterized optimizer for FL. The gradient re-parameterized method allows\ntraining a simple local model with a similar performance as a complex model by\nmodifying the optimizer's gradients according to a set of model-specific\nhyperparameters obtained from the complex models. In this work, we focus on\nVGG-style and Ghost-style models in the FL environment. Extensive experiments\ndemonstrate that models using FedRepOpt obtain a significant boost in\nperformance of 16.7% and 11.4% compared to the RepGhost-style and RepVGG-style\nnetworks, while also demonstrating a faster convergence time of 11.7% and 57.4%\ncompared to their complex structure.\n","authors":["Kin Wai Lau","Yasar Abbas Ur Rehman","Pedro Porto Buarque de Gusmão","Lai-Man Po","Lan Ma","Yuyang Xie"],"pdf_url":"https://arxiv.org/pdf/2409.15898v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11247v2","updated":"2024-09-27T06:22:50Z","published":"2024-03-17T15:41:35Z","title":"Compact 3D Gaussian Splatting For Dense Visual SLAM","summary":" Recent work has shown that 3D Gaussian-based SLAM enables high-quality\nreconstruction, accurate pose estimation, and real-time rendering of scenes.\nHowever, these approaches are built on a tremendous number of redundant 3D\nGaussian ellipsoids, leading to high memory and storage costs, and slow\ntraining speed. To address the limitation, we propose a compact 3D Gaussian\nSplatting SLAM system that reduces the number and the parameter size of\nGaussian ellipsoids. A sliding window-based masking strategy is first proposed\nto reduce the redundant ellipsoids. Then we observe that the covariance matrix\n(geometry) of most 3D Gaussian ellipsoids are extremely similar, which\nmotivates a novel geometry codebook to compress 3D Gaussian geometric\nattributes, i.e., the parameters. Robust and accurate pose estimation is\nachieved by a global bundle adjustment method with reprojection loss. Extensive\nexperiments demonstrate that our method achieves faster training and rendering\nspeed while maintaining the state-of-the-art (SOTA) quality of the scene\nrepresentation.\n","authors":["Tianchen Deng","Yaohui Chen","Leyan Zhang","Jianfei Yang","Shenghai Yuan","Jiuming Liu","Danwei Wang","Hesheng Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02003v4","updated":"2024-09-27T06:10:01Z","published":"2024-02-03T03:13:50Z","title":"GenFace: A Large-Scale Fine-Grained Face Forgery Benchmark and Cross\n Appearance-Edge Learning","summary":" The rapid advancement of photorealistic generators has reached a critical\njuncture where the discrepancy between authentic and manipulated images is\nincreasingly indistinguishable. Thus, benchmarking and advancing techniques\ndetecting digital manipulation become an urgent issue. Although there have been\na number of publicly available face forgery datasets, the forgery faces are\nmostly generated using GAN-based synthesis technology, which does not involve\nthe most recent technologies like diffusion. The diversity and quality of\nimages generated by diffusion models have been significantly improved and thus\na much more challenging face forgery dataset shall be used to evaluate SOTA\nforgery detection literature. In this paper, we propose a large-scale, diverse,\nand fine-grained high-fidelity dataset, namely GenFace, to facilitate the\nadvancement of deepfake detection, which contains a large number of forgery\nfaces generated by advanced generators such as the diffusion-based model and\nmore detailed labels about the manipulation approaches and adopted generators.\nIn addition to evaluating SOTA approaches on our benchmark, we design an\ninnovative cross appearance-edge learning (CAEL) detector to capture\nmulti-grained appearance and edge global representations, and detect\ndiscriminative and general forgery traces. Moreover, we devise an\nappearance-edge cross-attention (AECA) module to explore the various\nintegrations across two domains. Extensive experiment results and\nvisualizations show that our detection model outperforms the state of the arts\non different settings like cross-generator, cross-forgery, and cross-dataset\nevaluations. Code and datasets will be available at\n\\url{https://github.com/Jenine-321/GenFace\n","authors":["Yaning Zhang","Zitong Yu","Tianyi Wang","Xiaobin Huang","Linlin Shen","Zan Gao","Jianfeng Ren"],"pdf_url":"https://arxiv.org/pdf/2402.02003v4.pdf","comment":"Accepted by IEEE Transactions on Information Forensics and Security"},{"id":"http://arxiv.org/abs/2407.03885v2","updated":"2024-09-27T05:58:13Z","published":"2024-07-04T12:23:39Z","title":"Perception-Guided Quality Metric of 3D Point Clouds Using Hybrid\n Strategy","summary":" Full-reference point cloud quality assessment (FR-PCQA) aims to infer the\nquality of distorted point clouds with available references. Most of the\nexisting FR-PCQA metrics ignore the fact that the human visual system (HVS)\ndynamically tackles visual information according to different distortion levels\n(i.e., distortion detection for high-quality samples and appearance perception\nfor low-quality samples) and measure point cloud quality using unified\nfeatures. To bridge the gap, in this paper, we propose a perception-guided\nhybrid metric (PHM) that adaptively leverages two visual strategies with\nrespect to distortion degree to predict point cloud quality: to measure visible\ndifference in high-quality samples, PHM takes into account the masking effect\nand employs texture complexity as an effective compensatory factor for absolute\ndifference; on the other hand, PHM leverages spectral graph theory to evaluate\nappearance degradation in low-quality samples. Variations in geometric signals\non graphs and changes in the spectral graph wavelet coefficients are utilized\nto characterize geometry and texture appearance degradation, respectively.\nFinally, the results obtained from the two components are combined in a\nnon-linear method to produce an overall quality score of the tested point\ncloud. The results of the experiment on five independent databases show that\nPHM achieves state-of-the-art (SOTA) performance and offers significant\nperformance improvement in multiple distortion environments. The code is\npublicly available at https://github.com/zhangyujie-1998/PHM.\n","authors":["Yujie Zhang","Qi Yang","Yiling Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.03885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18461v1","updated":"2024-09-27T05:49:48Z","published":"2024-09-27T05:49:48Z","title":"Towards Diverse Device Heterogeneous Federated Learning via Task\n Arithmetic Knowledge Integration","summary":" Federated Learning has emerged as a promising paradigm for collaborative\nmachine learning, while preserving user data privacy. Despite its potential,\nstandard FL lacks support for diverse heterogeneous device prototypes, which\nvary significantly in model and dataset sizes -- from small IoT devices to\nlarge workstations. This limitation is only partially addressed by existing\nknowledge distillation techniques, which often fail to transfer knowledge\neffectively across a broad spectrum of device prototypes with varied\ncapabilities. This failure primarily stems from two issues: the dilution of\ninformative logits from more capable devices by those from less capable ones,\nand the use of a single integrated logits as the distillation target across all\ndevices, which neglects their individual learning capacities and and the unique\ncontributions of each. To address these challenges, we introduce TAKFL, a novel\nKD-based framework that treats the knowledge transfer from each device\nprototype's ensemble as a separate task, independently distilling each to\npreserve its unique contributions and avoid dilution. TAKFL also incorporates a\nKD-based self-regularization technique to mitigate the issues related to the\nnoisy and unsupervised ensemble distillation process. To integrate the\nseparately distilled knowledge, we introduce an adaptive task arithmetic\nknowledge integration process, allowing each student model to customize the\nknowledge integration for optimal performance. Additionally, we present\ntheoretical results demonstrating the effectiveness of task arithmetic in\ntransferring knowledge across heterogeneous devices with varying capacities.\nComprehensive evaluations of our method across both CV and NLP tasks\ndemonstrate that TAKFL achieves SOTA results in a variety of datasets and\nsettings, significantly outperforming existing KD-based methods. Code is\nreleased at https://github.com/MMorafah/TAKFL\n","authors":["Mahdi Morafah","Vyacheslav Kungurtsev","Hojin Chang","Chen Chen","Bill Lin"],"pdf_url":"https://arxiv.org/pdf/2409.18461v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2109.06590v4","updated":"2024-09-27T05:44:00Z","published":"2021-09-14T11:23:48Z","title":"High-Fidelity GAN Inversion for Image Attribute Editing","summary":" We present a novel high-fidelity generative adversarial network (GAN)\ninversion framework that enables attribute editing with image-specific details\nwell-preserved (e.g., background, appearance, and illumination). We first\nanalyze the challenges of high-fidelity GAN inversion from the perspective of\nlossy data compression. With a low bit-rate latent code, previous works have\ndifficulties in preserving high-fidelity details in reconstructed and edited\nimages. Increasing the size of a latent code can improve the accuracy of GAN\ninversion but at the cost of inferior editability. To improve image fidelity\nwithout compromising editability, we propose a distortion consultation approach\nthat employs a distortion map as a reference for high-fidelity reconstruction.\nIn the distortion consultation inversion (DCI), the distortion map is first\nprojected to a high-rate latent map, which then complements the basic low-rate\nlatent code with more details via consultation fusion. To achieve high-fidelity\nediting, we propose an adaptive distortion alignment (ADA) module with a\nself-supervised training scheme, which bridges the gap between the edited and\ninversion images. Extensive experiments in the face and car domains show a\nclear improvement in both inversion and editing quality.\n","authors":["Tengfei Wang","Yong Zhang","Yanbo Fan","Jue Wang","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2109.06590v4.pdf","comment":"CVPR 2022; Project Page is at https://tengfei-wang.github.io/HFGI/"},{"id":"http://arxiv.org/abs/2409.18459v1","updated":"2024-09-27T05:43:22Z","published":"2024-09-27T05:43:22Z","title":"FoodMLLM-JP: Leveraging Multimodal Large Language Models for Japanese\n Recipe Generation","summary":" Research on food image understanding using recipe data has been a\nlong-standing focus due to the diversity and complexity of the data. Moreover,\nfood is inextricably linked to people's lives, making it a vital research area\nfor practical applications such as dietary management. Recent advancements in\nMultimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities, not only in their vast knowledge but also in their ability to\nhandle languages naturally. While English is predominantly used, they can also\nsupport multiple languages including Japanese. This suggests that MLLMs are\nexpected to significantly improve performance in food image understanding\ntasks. We fine-tuned open MLLMs LLaVA-1.5 and Phi-3 Vision on a Japanese recipe\ndataset and benchmarked their performance against the closed model GPT-4o. We\nthen evaluated the content of generated recipes, including ingredients and\ncooking procedures, using 5,000 evaluation samples that comprehensively cover\nJapanese food culture. Our evaluation demonstrates that the open models trained\non recipe data outperform GPT-4o, the current state-of-the-art model, in\ningredient generation. Our model achieved F1 score of 0.531, surpassing\nGPT-4o's F1 score of 0.481, indicating a higher level of accuracy. Furthermore,\nour model exhibited comparable performance to GPT-4o in generating cooking\nprocedure text.\n","authors":["Yuki Imajuku","Yoko Yamakata","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2409.18459v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18458v1","updated":"2024-09-27T05:37:42Z","published":"2024-09-27T05:37:42Z","title":"Enhancing Crime Scene Investigations through Virtual Reality and Deep\n Learning Techniques","summary":" The analysis of a crime scene is a pivotal activity in forensic\ninvestigations. Crime Scene Investigators and forensic science practitioners\nrely on best practices, standard operating procedures, and critical thinking,\nto produce rigorous scientific reports to document the scenes of interest and\nmeet the quality standards expected in the courts. However, crime scene\nexamination is a complex and multifaceted task often performed in environments\nsusceptible to deterioration, contamination, and alteration, despite the use of\ncontact-free and non-destructive methods of analysis. In this context, the\ndocumentation of the sites, and the identification and isolation of traces of\nevidential value remain challenging endeavours. In this paper, we propose a\nphotogrammetric reconstruction of the crime scene for inspection in virtual\nreality (VR) and focus on fully automatic object recognition with deep learning\n(DL) algorithms through a client-server architecture. A pre-trained Faster-RCNN\nmodel was chosen as the best method that can best categorize relevant objects\nat the scene, selected by experts in the VR environment. These operations can\nconsiderably improve and accelerate crime scene analysis and help the forensic\nexpert in extracting measurements and analysing in detail the objects under\nanalysis. Experimental results on a simulated crime scene have shown that the\nproposed method can be effective in finding and recognizing objects with\npotential evidentiary value, enabling timely analyses of crime scenes,\nparticularly those with health and safety risks (e.g. fires, explosions,\nchemicals, etc.), while minimizing subjective bias and contamination of the\nscene.\n","authors":["Antonino Zappalà","Luca Guarnera","Vincenzo Rinaldi","Salvatore Livatino","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2409.18458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18457v1","updated":"2024-09-27T05:31:33Z","published":"2024-09-27T05:31:33Z","title":"DynaWeightPnP: Toward global real-time 3D-2D solver in PnP without\n correspondences","summary":" This paper addresses a special Perspective-n-Point (PnP) problem: estimating\nthe optimal pose to align 3D and 2D shapes in real-time without\ncorrespondences, termed as correspondence-free PnP. While several studies have\nfocused on 3D and 2D shape registration, achieving both real-time and accurate\nperformance remains challenging. This study specifically targets the 3D-2D\ngeometric shape registration tasks, applying the recently developed Reproducing\nKernel Hilbert Space (RKHS) to address the \"big-to-small\" issue. An iterative\nreweighted least squares method is employed to solve the RKHS-based formulation\nefficiently. Moreover, our work identifies a unique and interesting\nobservability issue in correspondence-free PnP: the numerical ambiguity between\nrotation and translation. To address this, we proposed DynaWeightPnP,\nintroducing a dynamic weighting sub-problem and an alternative searching\nalgorithm designed to enhance pose estimation and alignment accuracy.\nExperiments were conducted on a typical case, that is, a 3D-2D vascular\ncenterline registration task within Endovascular Image-Guided Interventions\n(EIGIs). Results demonstrated that the proposed algorithm achieves registration\nprocessing rates of 60 Hz (without post-refinement) and 31 Hz (with\npost-refinement) on modern single-core CPUs, with competitive accuracy\ncomparable to existing methods. These results underscore the suitability of\nDynaWeightPnP for future robot navigation tasks like EIGIs.\n","authors":["Jingwei Song","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2409.18457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17779v2","updated":"2024-09-27T04:39:02Z","published":"2024-07-25T05:18:18Z","title":"DAC: 2D-3D Retrieval with Noisy Labels via Divide-and-Conquer Alignment\n and Correction","summary":" With the recent burst of 2D and 3D data, cross-modal retrieval has attracted\nincreasing attention recently. However, manual labeling by non-experts will\ninevitably introduce corrupted annotations given ambiguous 2D/3D content.\nThough previous works have addressed this issue by designing a naive division\nstrategy with hand-crafted thresholds, their performance generally exhibits\ngreat sensitivity to the threshold value. Besides, they fail to fully utilize\nthe valuable supervisory signals within each divided subset. To tackle this\nproblem, we propose a Divide-and-conquer 2D-3D cross-modal Alignment and\nCorrection framework (DAC), which comprises Multimodal Dynamic Division (MDD)\nand Adaptive Alignment and Correction (AAC). Specifically, the former performs\naccurate sample division by adaptive credibility modeling for each sample based\non the compensation information within multimodal loss distribution. Then in\nAAC, samples in distinct subsets are exploited with different alignment\nstrategies to fully enhance the semantic compactness and meanwhile alleviate\nover-fitting to noisy labels, where a self-correction strategy is introduced to\nimprove the quality of representation. Moreover. To evaluate the effectiveness\nin real-world scenarios, we introduce a challenging noisy benchmark, namely\nObjaverse-N200, which comprises 200k-level samples annotated with 1156\nrealistic noisy labels. Extensive experiments on both traditional and the newly\nproposed benchmarks demonstrate the generality and superiority of our DAC,\nwhere DAC outperforms state-of-the-art models by a large margin. (i.e., with\n+5.9% gain on ModelNet40 and +5.8% on Objaverse-N200).\n","authors":["Chaofan Gan","Yuanpeng Tu","Yuxi Li","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17779v2.pdf","comment":"accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2409.18442v1","updated":"2024-09-27T04:38:14Z","published":"2024-09-27T04:38:14Z","title":"Gradient-free Decoder Inversion in Latent Diffusion Models","summary":" In latent diffusion models (LDMs), denoising diffusion process efficiently\ntakes place on latent space whose dimension is lower than that of pixel space.\nDecoder is typically used to transform the representation in latent space to\nthat in pixel space. While a decoder is assumed to have an encoder as an\naccurate inverse, exact encoder-decoder pair rarely exists in practice even\nthough applications often require precise inversion of decoder. Prior works for\ndecoder inversion in LDMs employed gradient descent inspired by inversions of\ngenerative adversarial networks. However, gradient-based methods require larger\nGPU memory and longer computation time for larger latent space. For example,\nrecent video LDMs can generate more than 16 frames, but GPUs with 24 GB memory\ncan only perform gradient-based decoder inversion for 4 frames. Here, we\npropose an efficient gradient-free decoder inversion for LDMs, which can be\napplied to diverse latent models. Theoretical convergence property of our\nproposed inversion has been investigated not only for the forward step method,\nbut also for the inertial Krasnoselskii-Mann (KM) iterations under mild\nassumption on cocoercivity that is satisfied by recent LDMs. Our proposed\ngradient-free method with Adam optimizer and learning rate scheduling\nsignificantly reduced computation time and memory usage over prior\ngradient-based methods and enabled efficient computation in applications such\nas noise-space watermarking while achieving comparable error levels.\n","authors":["Seongmin Hong","Suh Yoon Jeon","Kyeonghyun Lee","Ernest K. Ryu","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2409.18442v1.pdf","comment":"19 pages, Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.10571v3","updated":"2024-09-27T04:04:24Z","published":"2024-08-20T06:17:56Z","title":"Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models","summary":" Diffusion models have revolutionized customized text-to-image generation,\nallowing for efficient synthesis of photos from personal data with textual\ndescriptions. However, these advancements bring forth risks including privacy\nbreaches and unauthorized replication of artworks. Previous researches\nprimarily center around using prompt-specific methods to generate adversarial\nexamples to protect personal images, yet the effectiveness of existing methods\nis hindered by constrained adaptability to different prompts. In this paper, we\nintroduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for\ncustomized diffusion models. PAP first models the prompt distribution using a\nLaplace Approximation, and then produces prompt-agnostic perturbations by\nmaximizing a disturbance expectation based on the modeled distribution. This\napproach effectively tackles the prompt-agnostic attacks, leading to improved\ndefense stability. Extensive experiments in face privacy and artistic style\nprotection, demonstrate the superior generalization of PAP in comparison to\nexisting techniques. Our project page is available at\nhttps://github.com/vancyland/Prompt-Agnostic-Adversarial-Perturbation-for-Customized-Diffusion-Models.github.io.\n","authors":["Cong Wan","Yuhang He","Xiang Song","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10571v3.pdf","comment":"Accepted by NIPS 2024"},{"id":"http://arxiv.org/abs/2409.18431v1","updated":"2024-09-27T03:44:07Z","published":"2024-09-27T03:44:07Z","title":"Search3D: Hierarchical Open-Vocabulary 3D Segmentation","summary":" Open-vocabulary 3D segmentation enables the exploration of 3D spaces using\nfree-form text descriptions. Existing methods for open-vocabulary 3D instance\nsegmentation primarily focus on identifying object-level instances in a scene.\nHowever, they face challenges when it comes to understanding more fine-grained\nscene entities such as object parts, or regions described by generic\nattributes. In this work, we introduce Search3D, an approach that builds a\nhierarchical open-vocabulary 3D scene representation, enabling the search for\nentities at varying levels of granularity: fine-grained object parts, entire\nobjects, or regions described by attributes like materials. Our method aims to\nexpand the capabilities of open vocabulary instance-level 3D segmentation by\nshifting towards a more flexible open-vocabulary 3D search setting less\nanchored to explicit object-centric queries, compared to prior work. To ensure\na systematic evaluation, we also contribute a scene-scale open-vocabulary 3D\npart segmentation benchmark based on MultiScan, along with a set of\nopen-vocabulary fine-grained part annotations on ScanNet++. We verify the\neffectiveness of Search3D across several tasks, demonstrating that our approach\noutperforms baselines in scene-scale open-vocabulary 3D part segmentation,\nwhile maintaining strong performance in segmenting 3D objects and materials.\n","authors":["Ayca Takmaz","Alexandros Delitzas","Robert W. Sumner","Francis Engelmann","Johanna Wald","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2409.18431v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2406.18151v2","updated":"2024-09-27T03:36:47Z","published":"2024-06-26T08:04:42Z","title":"SynRS3D: A Synthetic Dataset for Global 3D Semantic Understanding from\n Monocular Remote Sensing Imagery","summary":" Global semantic 3D understanding from single-view high-resolution remote\nsensing (RS) imagery is crucial for Earth Observation (EO). However, this task\nfaces significant challenges due to the high costs of annotations and data\ncollection, as well as geographically restricted data availability. To address\nthese challenges, synthetic data offer a promising solution by being easily\naccessible and thus enabling the provision of large and diverse datasets. We\ndevelop a specialized synthetic data generation pipeline for EO and introduce\nSynRS3D, the largest synthetic RS 3D dataset. SynRS3D comprises 69,667\nhigh-resolution optical images that cover six different city styles worldwide\nand feature eight land cover types, precise height information, and building\nchange masks. To further enhance its utility, we develop a novel multi-task\nunsupervised domain adaptation (UDA) method, RS3DAda, coupled with our\nsynthetic dataset, which facilitates the RS-specific transition from synthetic\nto real scenarios for land cover mapping and height estimation tasks,\nultimately enabling global monocular 3D semantic understanding based on\nsynthetic data. Extensive experiments on various real-world datasets\ndemonstrate the adaptability and effectiveness of our synthetic dataset and\nproposed RS3DAda method. SynRS3D and related codes will be available.\n","authors":["Jian Song","Hongruixuan Chen","Weihao Xuan","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2406.18151v2.pdf","comment":"Accepted at NeurIPS 2024 as a Spotlight"},{"id":"http://arxiv.org/abs/2409.18419v1","updated":"2024-09-27T03:17:35Z","published":"2024-09-27T03:17:35Z","title":"Robust Network Learning via Inverse Scale Variational Sparsification","summary":" While neural networks have made significant strides in many AI tasks, they\nremain vulnerable to a range of noise types, including natural corruptions,\nadversarial noise, and low-resolution artifacts. Many existing approaches focus\non enhancing robustness against specific noise types, limiting their\nadaptability to others. Previous studies have addressed general robustness by\nadopting a spectral perspective, which tends to blur crucial features like\ntexture and object contours. Our proposed solution, however, introduces an\ninverse scale variational sparsification framework within a time-continuous\ninverse scale space formulation. This framework progressively learns\nfiner-scale features by discerning variational differences between pixels,\nultimately preserving only large-scale features in the smoothed image. Unlike\nfrequency-based methods, our approach not only removes noise by smoothing\nsmall-scale features where corruptions often occur but also retains\nhigh-contrast details such as textures and object contours. Moreover, our\nframework offers simplicity and efficiency in implementation. By integrating\nthis algorithm into neural network training, we guide the model to prioritize\nlearning large-scale features. We show the efficacy of our approach through\nenhanced robustness against various noise types.\n","authors":["Zhiling Zhou","Zirui Liu","Chengming Xu","Yanwei Fu","Xinwei Sun"],"pdf_url":"https://arxiv.org/pdf/2409.18419v1.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18418v1","updated":"2024-09-27T03:17:01Z","published":"2024-09-27T03:17:01Z","title":"A3: Active Adversarial Alignment for Source-Free Domain Adaptation","summary":" Unsupervised domain adaptation (UDA) aims to transfer knowledge from a\nlabeled source domain to an unlabeled target domain. Recent works have focused\non source-free UDA, where only target data is available. This is challenging as\nmodels rely on noisy pseudo-labels and struggle with distribution shifts. We\npropose Active Adversarial Alignment (A3), a novel framework combining\nself-supervised learning, adversarial training, and active learning for robust\nsource-free UDA. A3 actively samples informative and diverse data using an\nacquisition function for training. It adapts models via adversarial losses and\nconsistency regularization, aligning distributions without source data access.\nA3 advances source-free UDA through its synergistic integration of active and\nadversarial learning for effective domain alignment and noise reduction.\n","authors":["Chrisantus Eze","Christopher Crick"],"pdf_url":"https://arxiv.org/pdf/2409.18418v1.pdf","comment":"Accepted at ICMLA 2024"},{"id":"http://arxiv.org/abs/2309.07322v3","updated":"2024-09-27T03:14:30Z","published":"2023-09-13T21:21:50Z","title":"$\\texttt{NePhi}$: Neural Deformation Fields for Approximately\n Diffeomorphic Medical Image Registration","summary":" This work proposes NePhi, a generalizable neural deformation model which\nresults in approximately diffeomorphic transformations. In contrast to the\npredominant voxel-based transformation fields used in learning-based\nregistration approaches, NePhi represents deformations functionally, leading to\ngreat flexibility within the design space of memory consumption during training\nand inference, inference time, registration accuracy, as well as transformation\nregularity. Specifically, NePhi 1) requires less memory compared to voxel-based\nlearning approaches, 2) improves inference speed by predicting latent codes,\ncompared to current existing neural deformation based registration approaches\nthat \\emph{only} rely on optimization, 3) improves accuracy via instance\noptimization, and 4) shows excellent deformation regularity which is highly\ndesirable for medical image registration. We demonstrate the performance of\nNePhi on a 2D synthetic dataset as well as for real 3D medical image datasets\n(e.g., lungs and brains). Our results show that NePhi can match the accuracy of\nvoxel-based representations in a single-resolution registration setting. For\nmulti-resolution registration, our method matches the accuracy of current SOTA\nlearning-based registration approaches with instance optimization while\nreducing memory requirements by a factor of five. Our code is available at\nhttps://github.com/uncbiag/NePhi.\n","authors":["Lin Tian","Hastings Greer","Raúl San José Estépar","Roni Sengupta","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2309.07322v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.15176v2","updated":"2024-09-27T03:05:52Z","published":"2024-09-23T16:28:41Z","title":"SpikeGS: Learning 3D Gaussian Fields from Continuous Spike Stream","summary":" A spike camera is a specialized high-speed visual sensor that offers\nadvantages such as high temporal resolution and high dynamic range compared to\nconventional frame cameras. These features provide the camera with significant\nadvantages in many computer vision tasks. However, the tasks of 3D\nreconstruction and novel view synthesis based on spike cameras remain\nunderdeveloped. Although there are existing methods for learning neural\nradiance fields from spike stream, they either lack robustness in extremely\nnoisy, low-quality lighting conditions or suffer from high computational\ncomplexity due to the deep fully connected neural networks and ray marching\nrendering strategies used in neural radiance fields, making it difficult to\nrecover fine texture details. In contrast, the latest advancements in 3DGS have\nachieved high-quality real-time rendering by optimizing the point cloud\nrepresentation into Gaussian ellipsoids. Building on this, we introduce\nSpikeGS, the first method to learn 3D Gaussian fields solely from spike stream.\nWe designed a differentiable spike stream rendering framework based on 3DGS,\nincorporating noise embedding and spiking neurons. By leveraging the multi-view\nconsistency of 3DGS and the tile-based multi-threaded parallel rendering\nmechanism, we achieved high-quality real-time rendering results. Additionally,\nwe introduced a spike rendering loss function that generalizes under varying\nillumination conditions. Our method can reconstruct view synthesis results with\nfine texture details from a continuous spike stream captured by a moving spike\ncamera, while demonstrating high robustness in extremely noisy low-light\nscenarios. Experimental results on both real and synthetic datasets demonstrate\nthat our method surpasses existing approaches in terms of rendering quality and\nspeed. Our code will be available at https://github.com/520jz/SpikeGS.\n","authors":["Jinze Yu","Xi Peng","Zhengda Lu","Laurent Kneip","Yiqun Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15176v2.pdf","comment":"Accepted by ACCV 2024. Project page: https://github.com/520jz/SpikeGS"},{"id":"http://arxiv.org/abs/2409.18408v1","updated":"2024-09-27T02:54:24Z","published":"2024-09-27T02:54:24Z","title":"Query matching for spatio-temporal action detection with query-based\n object detector","summary":" In this paper, we propose a method that extends the query-based object\ndetection model, DETR, to spatio-temporal action detection, which requires\nmaintaining temporal consistency in videos. Our proposed method applies DETR to\neach frame and uses feature shift to incorporate temporal information. However,\nDETR's object queries in each frame may correspond to different objects, making\na simple feature shift ineffective. To overcome this issue, we propose query\nmatching across different frames, ensuring that queries for the same object are\nmatched and used for the feature shift. Experimental results show that\nperformance on the JHMDB21 dataset improves significantly when query features\nare shifted using the proposed query matching.\n","authors":["Shimon Hori","Kazuki Omi","Toru Tamaki"],"pdf_url":"https://arxiv.org/pdf/2409.18408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10484v2","updated":"2024-09-27T02:47:55Z","published":"2024-06-15T03:28:52Z","title":"Beyond Raw Videos: Understanding Edited Videos with Large Multimodal\n Model","summary":" The emerging video LMMs (Large Multimodal Models) have achieved significant\nimprovements on generic video understanding in the form of VQA (Visual Question\nAnswering), where the raw videos are captured by cameras. However, a large\nportion of videos in real-world applications are edited videos, \\textit{e.g.},\nusers usually cut and add effects/modifications to the raw video before\npublishing it on social media platforms. The edited videos usually have high\nview counts but they are not covered in existing benchmarks of video LMMs,\n\\textit{i.e.}, ActivityNet-QA, or VideoChatGPT benchmark. In this paper, we\nleverage the edited videos on a popular short video platform, \\textit{i.e.},\nTikTok, and build a video VQA benchmark (named EditVid-QA) covering four\ntypical editing categories, i.e., effect, funny, meme, and game. Funny and meme\nvideos benchmark nuanced understanding and high-level reasoning, while effect\nand game evaluate the understanding capability of artificial design. Most of\nthe open-source video LMMs perform poorly on the EditVid-QA benchmark,\nindicating a huge domain gap between edited short videos on social media and\nregular raw videos. To improve the generalization ability of LMMs, we collect a\ntraining set for the proposed benchmark based on both Panda-70M/WebVid raw\nvideos and small-scale TikTok/CapCut edited videos, which boosts the\nperformance on the proposed EditVid-QA benchmark, indicating the effectiveness\nof high-quality training data. We also identified a serious issue in the\nexisting evaluation protocol using the GPT-3.5 judge, namely a \"sorry\" attack,\nwhere a sorry-style naive answer can achieve an extremely high rating from the\nGPT judge, e.g., over 4.3 for correctness score on VideoChatGPT evaluation\nprotocol. To avoid the \"sorry\" attacks, we evaluate results with GPT-4 judge\nand keyword filtering. The dataset is released at\nhttps://github.com/XenonLamb/EditVid-QA.\n","authors":["Lu Xu","Sijie Zhu","Chunyuan Li","Chia-Wen Kuo","Fan Chen","Xinyao Wang","Guang Chen","Dawei Du","Ye Yuan","Longyin Wen"],"pdf_url":"https://arxiv.org/pdf/2406.10484v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03954v7","updated":"2024-09-27T02:43:48Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v7.pdf","comment":"Published at Robotics: Science and Systems (RSS) 2024. Videos, code,\n and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2409.18401v1","updated":"2024-09-27T02:32:42Z","published":"2024-09-27T02:32:42Z","title":"GenesisTex2: Stable, Consistent and High-Quality Text-to-Texture\n Generation","summary":" Large-scale text-guided image diffusion models have shown astonishing results\nin text-to-image (T2I) generation. However, applying these models to synthesize\ntextures for 3D geometries remains challenging due to the domain gap between 2D\nimages and textures on a 3D surface. Early works that used a\nprojecting-and-inpainting approach managed to preserve generation diversity but\noften resulted in noticeable artifacts and style inconsistencies. While recent\nmethods have attempted to address these inconsistencies, they often introduce\nother issues, such as blurring, over-saturation, or over-smoothing. To overcome\nthese challenges, we propose a novel text-to-texture synthesis framework that\nleverages pretrained diffusion models. We first introduce a local attention\nreweighing mechanism in the self-attention layers to guide the model in\nconcentrating on spatial-correlated patches across different views, thereby\nenhancing local details while preserving cross-view consistency. Additionally,\nwe propose a novel latent space merge pipeline, which further ensures\nconsistency across different viewpoints without sacrificing too much diversity.\nOur method significantly outperforms existing state-of-the-art techniques\nregarding texture consistency and visual quality, while delivering results much\nfaster than distillation-based methods. Importantly, our framework does not\nrequire additional training or fine-tuning, making it highly adaptable to a\nwide range of models available on public platforms.\n","authors":["Jiawei Lu","Yingpeng Zhang","Zengjun Zhao","He Wang","Kun Zhou","Tianjia Shao"],"pdf_url":"https://arxiv.org/pdf/2409.18401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03958v2","updated":"2024-09-27T02:13:42Z","published":"2024-05-07T02:45:28Z","title":"Simple Drop-in LoRA Conditioning on Attention Layers Will Improve Your\n Diffusion Model","summary":" Current state-of-the-art diffusion models employ U-Net architectures\ncontaining convolutional and (qkv) self-attention layers. The U-Net processes\nimages while being conditioned on the time embedding input for each sampling\nstep and the class or caption embedding input corresponding to the desired\nconditional generation. Such conditioning involves scale-and-shift operations\nto the convolutional layers but does not directly affect the attention layers.\nWhile these standard architectural choices are certainly effective, not\nconditioning the attention layers feels arbitrary and potentially suboptimal.\nIn this work, we show that simply adding LoRA conditioning to the attention\nlayers without changing or tuning the other parts of the U-Net architecture\nimproves the image generation quality. For example, a drop-in addition of LoRA\nconditioning to EDM diffusion model yields FID scores of 1.91/1.75 for\nunconditional and class-conditional CIFAR-10 generation, improving upon the\nbaseline of 1.97/1.79.\n","authors":["Joo Young Choi","Jaesung R. Park","Inkyu Park","Jaewoong Cho","Albert No","Ernest K. Ryu"],"pdf_url":"https://arxiv.org/pdf/2405.03958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10727v4","updated":"2024-09-27T01:40:17Z","published":"2023-04-21T03:45:59Z","title":"RoCOCO: Robustness Benchmark of MS-COCO to Stress-test Image-Text\n Matching Models","summary":" With the extensive use of vision-language models in various downstream tasks,\nevaluating their robustness is crucial. In this paper, we propose a benchmark\nfor assessing the robustness of vision-language models. We believe that a\nrobust model should properly understand both linguistic and visual semantics\nand be resilient to explicit variations. In pursuit of this goal, we create new\nvariants of texts and images in the MS-COCO test set and re-evaluate the\nstate-of-the-art (SOTA) models with the new data. Specifically, we alter the\nmeaning of text by replacing a word, and generate visually altered images that\nmaintain some visual context while introducing noticeable pixel changes through\nimage mixing techniques.Our evaluations on the proposed benchmark reveal\nsubstantial performance degradation in many SOTA models (e.g., Image-to-Text\nRecall@1: 81.9\\% $\\rightarrow$ 48.4\\% in BLIP, 66.1\\% $\\rightarrow$ 37.6\\% in\nVSE$\\infty$), with the models often favoring the altered texts/images over the\noriginal ones. This indicates the current vision-language models struggle with\nsubtle changes and often fail to understand the overall context of texts and\nimages. Based on these findings, we propose semantic contrastive loss and\nvisual contrastive loss to learn more robust embedding. Datasets and code are\navailable at {\\url{https://github.com/pseulki/rococo}}.\n","authors":["Seulki Park","Daeho Um","Hajung Yoon","Sanghyuk Chun","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2304.10727v4.pdf","comment":"Accepted to ECCV Synthetic Data for Computer Vision Workshop (Oral)"},{"id":"http://arxiv.org/abs/2409.18372v1","updated":"2024-09-27T01:16:15Z","published":"2024-09-27T01:16:15Z","title":"You Only Speak Once to See","summary":" Grounding objects in images using visual cues is a well-established approach\nin computer vision, yet the potential of audio as a modality for object\nrecognition and grounding remains underexplored. We introduce YOSS, \"You Only\nSpeak Once to See,\" to leverage audio for grounding objects in visual scenes,\ntermed Audio Grounding. By integrating pre-trained audio models with visual\nmodels using contrastive learning and multi-modal alignment, our approach\ncaptures speech commands or descriptions and maps them directly to\ncorresponding objects within images. Experimental results indicate that audio\nguidance can be effectively applied to object grounding, suggesting that\nincorporating audio guidance may enhance the precision and robustness of\ncurrent object grounding methods and improve the performance of robotic systems\nand computer vision applications. This finding opens new possibilities for\nadvanced object recognition, scene understanding, and the development of more\nintuitive and capable robotic systems.\n","authors":["Wenhao Yang","Jianguo Wei","Wenhuan Lu","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2409.18372v1.pdf","comment":"7 pages, 4 figures, submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2402.01188v3","updated":"2024-09-27T01:13:29Z","published":"2024-02-02T07:17:39Z","title":"Segment Any Change","summary":" Visual foundation models have achieved remarkable results in zero-shot image\nclassification and segmentation, but zero-shot change detection remains an open\nproblem. In this paper, we propose the segment any change models (AnyChange), a\nnew type of change detection model that supports zero-shot prediction and\ngeneralization on unseen change types and data distributions. AnyChange is\nbuilt on the segment anything model (SAM) via our training-free adaptation\nmethod, bitemporal latent matching. By revealing and exploiting intra-image and\ninter-image semantic similarities in SAM's latent space, bitemporal latent\nmatching endows SAM with zero-shot change detection capabilities in a\ntraining-free way. We also propose a point query mechanism to enable\nAnyChange's zero-shot object-centric change detection capability. We perform\nextensive experiments to confirm the effectiveness of AnyChange for zero-shot\nchange detection. AnyChange sets a new record on the SECOND benchmark for\nunsupervised change detection, exceeding the previous SOTA by up to 4.4% F$_1$\nscore, and achieving comparable accuracy with negligible manual annotations (1\npixel per image) for supervised change detection.\n","authors":["Zhuo Zheng","Yanfei Zhong","Liangpei Zhang","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2402.01188v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18364v1","updated":"2024-09-27T00:49:08Z","published":"2024-09-27T00:49:08Z","title":"Multi-hypotheses Conditioned Point Cloud Diffusion for 3D Human\n Reconstruction from Occluded Images","summary":" 3D human shape reconstruction under severe occlusion due to human-object or\nhuman-human interaction is a challenging problem. Parametric models i.e.,\nSMPL(-X), which are based on the statistics across human shapes, can represent\nwhole human body shapes but are limited to minimally-clothed human shapes.\nImplicit-function-based methods extract features from the parametric models to\nemploy prior knowledge of human bodies and can capture geometric details such\nas clothing and hair. However, they often struggle to handle misaligned\nparametric models and inpaint occluded regions given a single RGB image. In\nthis work, we propose a novel pipeline, MHCDIFF, Multi-hypotheses Conditioned\nPoint Cloud Diffusion, composed of point cloud diffusion conditioned on\nprobabilistic distributions for pixel-aligned detailed 3D human reconstruction\nunder occlusion. Compared to previous implicit-function-based methods, the\npoint cloud diffusion model can capture the global consistent features to\ngenerate the occluded regions, and the denoising process corrects the\nmisaligned SMPL meshes. The core of MHCDIFF is extracting local features from\nmultiple hypothesized SMPL(-X) meshes and aggregating the set of features to\ncondition the diffusion model. In the experiments on CAPE and MultiHuman\ndatasets, the proposed method outperforms various SOTA methods based on SMPL,\nimplicit functions, point cloud diffusion, and their combined, under synthetic\nand real occlusions.\n","authors":["Donghwan Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18364v1.pdf","comment":"17 pages, 7 figures, accepted NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18355v1","updated":"2024-09-27T00:22:02Z","published":"2024-09-27T00:22:02Z","title":"SinoSynth: A Physics-based Domain Randomization Approach for\n Generalizable CBCT Image Enhancement","summary":" Cone Beam Computed Tomography (CBCT) finds diverse applications in medicine.\nEnsuring high image quality in CBCT scans is essential for accurate diagnosis\nand treatment delivery. Yet, the susceptibility of CBCT images to noise and\nartifacts undermines both their usefulness and reliability. Existing methods\ntypically address CBCT artifacts through image-to-image translation approaches.\nThese methods, however, are limited by the artifact types present in the\ntraining data, which may not cover the complete spectrum of CBCT degradations\nstemming from variations in imaging protocols. Gathering additional data to\nencompass all possible scenarios can often pose a challenge. To address this,\nwe present SinoSynth, a physics-based degradation model that simulates various\nCBCT-specific artifacts to generate a diverse set of synthetic CBCT images from\nhigh-quality CT images without requiring pre-aligned data. Through extensive\nexperiments, we demonstrate that several different generative networks trained\non our synthesized data achieve remarkable results on heterogeneous\nmulti-institutional datasets, outperforming even the same networks trained on\nactual data. We further show that our degradation model conveniently provides\nan avenue to enforce anatomical constraints in conditional generative models,\nyielding high-quality and structure-preserving synthetic CT images.\n","authors":["Yunkui Pang","Yilin Liu","Xu Chen","Pew-Thian Yap","Jun Lian"],"pdf_url":"https://arxiv.org/pdf/2409.18355v1.pdf","comment":"MICCAI 2024"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2409.15816v2","updated":"2024-09-27T08:07:55Z","published":"2024-09-24T07:27:00Z","title":"Diffusion Models for Intelligent Transportation Systems: A Survey","summary":" Intelligent Transportation Systems (ITS) are vital in modern traffic\nmanagement and optimization, significantly enhancing traffic efficiency and\nsafety. Recently, diffusion models have emerged as transformative tools for\naddressing complex challenges within ITS. In this paper, we present a\ncomprehensive survey of diffusion models for ITS, covering both theoretical and\npractical aspects. First, we introduce the theoretical foundations of diffusion\nmodels and their key variants, including conditional diffusion models and\nlatent diffusion models, highlighting their suitability for modeling complex,\nmulti-modal traffic data and enabling controllable generation. Second, we\noutline the primary challenges in ITS and the corresponding advantages of\ndiffusion models, providing readers with a deeper understanding of the\nintersection between ITS and diffusion models. Third, we offer a\nmulti-perspective investigation of current applications of diffusion models in\nITS domains, including autonomous driving, traffic simulation, trajectory\nprediction, and traffic safety. Finally, we discuss state-of-the-art diffusion\nmodel techniques and highlight key ITS research directions that warrant further\ninvestigation. Through this structured overview, we aim to provide researchers\nwith a comprehensive understanding of diffusion models for ITS, thereby\nadvancing their future applications in the transportation domain.\n","authors":["Mingxing Peng","Kehua Chen","Xusen Guo","Qiming Zhang","Hongliang Lu","Hui Zhong","Di Chen","Meixin Zhu","Hai Yang"],"pdf_url":"https://arxiv.org/pdf/2409.15816v2.pdf","comment":"7 figures"},{"id":"http://arxiv.org/abs/2409.14738v2","updated":"2024-09-27T07:13:22Z","published":"2024-09-23T06:34:06Z","title":"Enabling On-Chip High-Frequency Adaptive Linear Optimal Control via\n Linearized Gaussian Process","summary":" Unpredictable and complex aerodynamic effects pose significant challenges to\nachieving precise flight control, such as the downwash effect from upper\nvehicles to lower ones. Conventional methods often struggle to accurately model\nthese interactions, leading to controllers that require large safety margins\nbetween vehicles. Moreover, the controller on real drones usually requires\nhigh-frequency and has limited on-chip computation, making the adaptive control\ndesign more difficult to implement. To address these challenges, we incorporate\nGaussian process (GP) to model the adaptive external aerodynamics with linear\nmodel predictive control. The GP is linearized to enable real-time\nhigh-frequency solutions. Moreover, to handle the error caused by\nlinearization, we integrate end-to-end Bayesian optimization during sample\ncollection stages to improve the control performance. Experimental results on\nboth simulations and real quadrotors show that we can achieve real-time\nsolvable computation speed with acceptable tracking errors.\n","authors":["Yuan Gao","Yinyi Lai","Jun Wang","Yini Fang"],"pdf_url":"https://arxiv.org/pdf/2409.14738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18937v1","updated":"2024-09-27T17:37:42Z","published":"2024-09-27T17:37:42Z","title":"Robust Deep Reinforcement Learning for Volt-VAR Optimization in Active\n Distribution System under Uncertainty","summary":" The deep reinforcement learning (DRL) based Volt-VAR optimization (VVO)\nmethods have been widely studied for active distribution networks (ADNs).\nHowever, most of them lack safety guarantees in terms of power injection\nuncertainties due to the increase in distributed energy resources (DERs) and\nload demand, such as electric vehicles. This article proposes a robust deep\nreinforcement learning (RDRL) framework for VVO via a robust deep deterministic\npolicy gradient (DDPG) algorithm. This algorithm can effectively manage hybrid\naction spaces, considering control devices like capacitors, voltage regulators,\nand smart inverters. Additionally, it is designed to handle uncertainties by\nquantifying uncertainty sets with conformal prediction and modeling\nuncertainties as adversarial attacks to guarantee safe exploration across\naction spaces. Numerical results on three IEEE test cases demonstrate the\nsample efficiency and safety of the proposed robust DDPG against uncertainties\ncompared to the benchmark algorithms.\n","authors":["Zhengrong Chen","Siyao Cai","A. P. Sakis Meliopoulos"],"pdf_url":"https://arxiv.org/pdf/2409.18937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18867v1","updated":"2024-09-27T16:03:59Z","published":"2024-09-27T16:03:59Z","title":"Robust and efficient data-driven predictive control","summary":" We propose a robust and efficient data-driven predictive control (eDDPC)\nscheme which is more sample efficient (requires less offline data) compared to\nexisting schemes, and is also computationally efficient. This is done by\nleveraging an alternative data-based representation of the trajectories of\nlinear time-invariant (LTI) systems. The proposed scheme relies only on using\n(short and potentially irregularly measured) noisy input-output data, the\namount of which is independent of the prediction horizon. To account for\nmeasurement noise, we provide a novel result that quantifies the uncertainty\nbetween the true (unknown) restricted behavior of the system and the estimated\none from noisy data. Furthermore, we show that the robust eDDPC scheme is\nrecursively feasible and that the resulting closed-loop system is practically\nstable. Finally, we compare the performance of this scheme to existing ones on\na case study of a four tank system.\n","authors":["Mohammad Alsalti","Manuel Barkey","Victor G. Lopez","Matthias A. Müller"],"pdf_url":"https://arxiv.org/pdf/2409.18867v1.pdf","comment":"17 pages, 2 figures, submitted for Automatica"},{"id":"http://arxiv.org/abs/2409.18862v1","updated":"2024-09-27T15:57:52Z","published":"2024-09-27T15:57:52Z","title":"Safe Decentralized Multi-Agent Control using Black-Box Predictors,\n Conformal Decision Policies, and Control Barrier Functions","summary":" We address the challenge of safe control in decentralized multi-agent robotic\nsettings, where agents use uncertain black-box models to predict other agents'\ntrajectories. We use the recently proposed conformal decision theory to adapt\nthe restrictiveness of control barrier functions-based safety constraints based\non observed prediction errors. We use these constraints to synthesize\ncontrollers that balance between the objectives of safety and task\naccomplishment, despite the prediction errors. We provide an upper bound on the\naverage over time of the value of a monotonic function of the difference\nbetween the safety constraint based on the predicted trajectories and the\nconstraint based on the ground truth ones. We validate our theory through\nexperimental results showing the performance of our controllers when navigating\na robot in the multi-agent scenes in the Stanford Drone Dataset.\n","authors":["Sacha Huriot","Hussein Sibai"],"pdf_url":"https://arxiv.org/pdf/2409.18862v1.pdf","comment":"6 pages, 1 figure, submitted for ICRA 2025"},{"id":"http://arxiv.org/abs/2409.18806v1","updated":"2024-09-27T15:00:15Z","published":"2024-09-27T15:00:15Z","title":"Path Following Model Predictive Control of a Coupled Autonomous\n Underwater Vehicle","summary":" The operation of an autonomous underwater vehicle (AUV) faces challenges in\nfollowing predetermined waypoints due to coupled motions under environmental\ndisturbances. To address this, a 3D path following guidance and control system\nis developed in this work based on the line-of-sight (LOS) guidance method.\nConventionally, the 3D path following problem is transformed into heading and\ndepth control problems, assuming that the motion of the vehicle is decoupled in\nhorizontal and depth coordinates. The proposed control system design avoids\nthis simplifying assumption by transforming the problem into a 3D position and\norientation tracking problem. This design is achieved by computing a 2D\nhorizontal coordinate based on the desired heading and then computing a\ncorresponding LOS depth coordinate. A model predictive controller (MPC) is then\nimplemented using the 3D LOS coordinate and the computed orientation vector.\nThe MPC obtains a robust control by solving a minimax optimisation problem\nconsidering the effects of unknown ocean disturbances. The effectiveness of the\nproposed guidance and control system is demonstrated through the simulation of\na prototype AUV system. Numerical results show that the AUV can follow\npredetermined waypoints in the presence of time-varying disturbances, and the\nsystem is steered at a constant surge speed that is proportional to the radius\nof the circle of acceptance used to implement the guidance system.\n","authors":["Isah A. Jimoh","Hong Yue"],"pdf_url":"https://arxiv.org/pdf/2409.18806v1.pdf","comment":"6 pages, 4 figures, Presented at the IFAC CAMS 2024, Virginia, USA"},{"id":"http://arxiv.org/abs/2409.18796v1","updated":"2024-09-27T14:50:36Z","published":"2024-09-27T14:50:36Z","title":"Hierarchical Federated ADMM","summary":" In this paper, we depart from the widely-used gradient descent-based\nhierarchical federated learning (FL) algorithms to develop a novel hierarchical\nFL framework based on the alternating direction method of multipliers (ADMM).\nWithin this framework, we propose two novel FL algorithms, which both use ADMM\nin the top layer: one that employs ADMM in the lower layer and another that\nuses the conventional gradient descent-based approach. The proposed framework\nenhances privacy, and experiments demonstrate the superiority of the proposed\nalgorithms compared to the conventional algorithms in terms of learning\nconvergence and accuracy. Additionally, gradient descent on the lower layer\nperforms well even if the number of local steps is very limited, while ADMM on\nboth layers lead to better performance otherwise.\n","authors":["Seyed Mohammad Azimi-Abarghouyi","Nicola Bastianello","Karl H. Johansson","Viktoria Fodor"],"pdf_url":"https://arxiv.org/pdf/2409.18796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18787v1","updated":"2024-09-27T14:35:38Z","published":"2024-09-27T14:35:38Z","title":"Asymptotic tracking control of dynamic reference over homomorphically\n encrypted data with finite modulus","summary":" This paper considers a tracking control problem, in which the dynamic\ncontroller is encrypted with an additively homomorphic encryption scheme and\nthe output of a process tracks a dynamic reference asymptotically. Our paper is\nmotivated by the following problem: When dealing with both asymptotic tracking\nand dynamic reference, we find that the control input is generally subject to\noverflow issues under a finite modulus, though the dynamic controller consists\nof only integer coefficients. First, we provide a new controller design method\nsuch that the coefficients of the tracking controller can be transformed into\nintegers leveraging the zooming-in factor of dynamic quantization.\n By the Cayley-Hamilton theorem, we represent the control input as linear\ncombination of the previous control inputs. Leveraging the property above, we\ndesign an algorithm on the actuator side such that it can restore the control\ninput from the lower bits under a finite modulus. A lower bound of the modulus\nis also provided.\n As an extension of the first result, we further solve the problem of\nunbounded internal state taking place in the actuator. In particular, the\nactuator can restore the correct control input under the same modulus.\n A simulation example is provided to verify the control schemes proposed in\nour paper.\n","authors":["Shuai Feng","Junsoo Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18768v1","updated":"2024-09-27T14:12:49Z","published":"2024-09-27T14:12:49Z","title":"Learning from Demonstration with Implicit Nonlinear Dynamics Models","summary":" Learning from Demonstration (LfD) is a useful paradigm for training policies\nthat solve tasks involving complex motions. In practice, the successful\napplication of LfD requires overcoming error accumulation during policy\nexecution, i.e. the problem of drift due to errors compounding over time and\nthe consequent out-of-distribution behaviours. Existing works seek to address\nthis problem through scaling data collection, correcting policy errors with a\nhuman-in-the-loop, temporally ensembling policy predictions or through learning\nthe parameters of a dynamical system model. In this work, we propose and\nvalidate an alternative approach to overcoming this issue. Inspired by\nreservoir computing, we develop a novel neural network layer that includes a\nfixed nonlinear dynamical system with tunable dynamical properties. We validate\nthe efficacy of our neural network layer on the task of reproducing human\nhandwriting motions using the LASA Human Handwriting Dataset. Through empirical\nexperiments we demonstrate that incorporating our layer into existing neural\nnetwork architectures addresses the issue of compounding errors in LfD.\nFurthermore, we perform a comparative evaluation against existing approaches\nincluding a temporal ensemble of policy predictions and an Echo State Networks\n(ESNs) implementation. We find that our approach yields greater policy\nprecision and robustness on the handwriting task while also generalising to\nmultiple dynamics regimes and maintaining competitive latency scores.\n","authors":["Peter David Fagan","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.18768v1.pdf","comment":"21 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.18766v1","updated":"2024-09-27T14:07:56Z","published":"2024-09-27T14:07:56Z","title":"Dual Pricing to Prioritize Renewable Energy and Consumer Preferences in\n Electricity Markets","summary":" Electricity markets currently fail to incorporate preferences of buyers,\ntreating polluting and renewable energy sources as having equal social benefit\nunder a system of uniform clearing prices. Meanwhile, renewable energy is prone\nto curtailment due to transmission constraints, forcing grid operators to\nreduce or shut down renewable energy production despite its availability and\nneed. This paper proposes a ``dual pricing mechanism\" which allows buyers to\nbid both their willingness to pay for electricity, and additionally, their\npreference for green energy. Designed for use in deregulated electricity\nmarkets, this mechanism prioritizes the dispatch of more renewable energy\nsources according to consumer preferences. Traditional uniform clearing prices,\nwhich treat all energy sources equally, do not reflect the growing share of\ngreen energy in the power grid and the environmental values of consumers. By\nallowing load-serving entities to bid their willingness to pay for renewable\nenergy directly into the clearing market, our proposed framework generates\ndistinct pricing signals for green and ``black\" electricity.\n","authors":["Emilie Jong","Samuel Chevalier","Spyros Chatzivasileiadis","Shie Mannor"],"pdf_url":"https://arxiv.org/pdf/2409.18766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18755v1","updated":"2024-09-27T13:47:17Z","published":"2024-09-27T13:47:17Z","title":"Transparency evaluation for the Kinematic Design of the Harnesses\n through Human-Exoskeleton Interaction Modeling","summary":" Lower Limb Exoskeletons (LLEs) are wearable robots that provide mechanical\npower to the user. Human-exoskeleton (HE) connections must preserve the user's\nnatural behavior during the interaction, avoiding undesired forces. Therefore,\nnumerous works focus on their minimization. Given the inherent complications of\nrepeatedly prototyping and experimentally testing a device, modeling the\nexoskeleton and its physical interaction with the user emerges as a valuable\napproach for assessing the design effects. This paper proposes a novel method\nto compare different exoskeleton configurations with a flexible simulation\ntool. This approach contemplates simulating the dynamics of the device,\nincluding its interaction with the wearer, to evaluate multiple connection\nmechanism designs along with the kinematics and actuation of the LLE. This\nevaluation is based on the minimization of the interaction wrenches through an\noptimization process that includes the impedance parameters at the interfaces\nas optimization variables and the similarity of the LLE's joint variables\ntrajectories with the motion of the wearer's articulations. Exploratory tests\nare conducted using the Wearable Walker LLE in different configurations and\nmeasuring the interaction forces. Experimental data are then compared to the\noptimization outcomes, proving that the proposed method provides contact wrench\nestimations consistent with the collected measurements and previous outcomes\nfrom the literature. Copyright 2024 IEEE. Personal use of this material is\npermitted. Permission from IEEE must be obtained for all other uses, in any\ncurrent or future media, including reprinting/republishing this material for\nadvertising or promotional purposes, creating new collective works, for resale\nor redistribution to servers or lists, or reuse of any copyrighted component of\nthis work in other works.\n","authors":["Riccardo Bezzini","Carlo Alberto Avizzano","Francesco Porcini","Alessandro Filippeschi"],"pdf_url":"https://arxiv.org/pdf/2409.18755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18742v1","updated":"2024-09-27T13:33:19Z","published":"2024-09-27T13:33:19Z","title":"A History-Guided Regional Partitioning Evolutionary Optimization for\n Solving the Flexible Job Shop Problem with Limited Multi-load Automated\n Guided Vehicles","summary":" In a flexible job shop environment, using Automated Guided Vehicles (AGVs) to\ntransport jobs and process materials is an important way to promote the\nintelligence of the workshop. Compared with single-load AGVs, multi-load AGVs\ncan improve AGV utilization, reduce path conflicts, etc. Therefore, this study\nproposes a history-guided regional partitioning algorithm (HRPEO) for the\nflexible job shop scheduling problem with limited multi-load AGVs (FJSPMA).\nFirst, the encoding and decoding rules are designed according to the\ncharacteristics of multi-load AGVs, and then the initialization rule based on\nthe branch and bound method is used to generate the initial population. Second,\nto prevent the algorithm from falling into a local optimum, the algorithm\nadopts a regional partitioning strategy. This strategy divides the solution\nspace into multiple regions and measures the potential of the regions. After\nthat, cluster the regions into multiple clusters in each iteration, and selects\nindividuals for evolutionary search based on the set of clusters. Third, a\nlocal search strategy is designed to improve the exploitation ability of the\nalgorithm, which uses a greedy approach to optimize machines selection and\ntransportation sequence according to the characteristics of FJSPMA. Finally, a\nlarge number of experiments are carried out on the benchmarks to test the\nperformance of the algorithm. Compared with multiple advanced algorithms, the\nresults show that the HRPEO has a better advantage in solving FJSPMA.\n","authors":["Feige Liu","Chao Lu","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2409.18742v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2409.18734v1","updated":"2024-09-27T13:24:33Z","published":"2024-09-27T13:24:33Z","title":"On Adaptive Frequency Sampling for Data-driven MOR Applied to Antenna\n Responses","summary":" Frequency domain sweeps of array antennas are well-known to be\ntime-intensive, and different surrogate models have been used to improve the\nperformance. Data-driven model order reduction algorithms, such as the Loewner\nframework and vector fitting, can be integrated with these adaptive error\nestimates, in an iterative algorithm, to reduce the number of full-wave\nsimulations required to accurately capture the requested frequency behavior of\nmultiport array antennas. In this work, we propose two novel adaptive methods\nexploiting a block matrix function which is a key part of the Loewner framework\ngenerating system approach. The first algorithm leverages an inherent matrix\nparameter freedom in the block matrix function to identify frequency points\nwith large errors, whereas the second utilizes the condition number of the\nblock matrix function. Both methods effectively provide frequency domain error\nestimates, essential for improved performance. Numerical experiments on\nmultiport array antenna S-parameters demonstrate the effectiveness of our\nproposed algorithms within the Loewner framework.\n","authors":["Lucas Åkerstedt","Darwin Blanco","B. L. G. Jonsson"],"pdf_url":"https://arxiv.org/pdf/2409.18734v1.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.18681v1","updated":"2024-09-27T12:11:47Z","published":"2024-09-27T12:11:47Z","title":"Pseudometrics for scalable data-driven comparisons of nonlinear\n dynamical systems","summary":" Novel solutions for pseudometrics quantifying deviation from topological\nconjugacy between dynamical systems are presented. Deviation from conjugacy is\nquantified in a Pareto optimal sense that accounts for spectral properties of\nKoopman operators as well as trajectory geometry. Theoretical justification is\nprovided for computing such pseudometrics in Koopman eigenfunction space rather\nthan observable space. Furthermore, it is shown deriving the pseudometrics from\nunitary transformations is sufficient to recover a value of zero if two systems\nare topologically conjugate. Therefore the pseudometrics for quantifying\ndeviation from conjugacy are based on analytical solutions for unitary\ntransformations in Koopman eigenfunction space. Finally, geometric\nconsiderations for the Pareto optimality problem associated with deviation from\nconjugacy are used to develop pseudometrics that account for all possible\nsolutions given just two Pareto points based on analytical solutions.\n","authors":["Bryan Glaz"],"pdf_url":"https://arxiv.org/pdf/2409.18681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15614v2","updated":"2024-09-27T11:46:37Z","published":"2024-07-22T13:20:47Z","title":"Experimenting with Adaptive Bitrate Algorithms for Virtual Reality\n Streaming over Wi-Fi","summary":" Interactive Virtual Reality (VR) streaming over Wi-Fi networks encounters\nsignificant challenges due to bandwidth fluctuations caused by channel\ncontention and user mobility. Adaptive BitRate (ABR) algorithms dynamically\nadjust the video encoding bitrate based on the available network capacity,\naiming to maximize image quality while mitigating congestion and preserving the\nuser's Quality of Experience (QoE). In this paper, we experiment with ABR\nalgorithms for VR streaming using Air Light VR (ALVR), an open-source VR\nstreaming solution. We extend ALVR with a comprehensive set of metrics that\nprovide a robust characterization of the network's state, enabling more\ninformed bitrate adjustments. To demonstrate the utility of these performance\nindicators, we develop and test the Network-aware Step-wise ABR algorithm for\nVR streaming (NeSt-VR). Results validate the accuracy of the newly implemented\nnetwork performance metrics and demonstrate NeSt-VR's video bitrate adaptation\ncapabilities.\n","authors":["Ferran Maura","Miguel Casasnovas","Boris Bellalta"],"pdf_url":"https://arxiv.org/pdf/2407.15614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18657v1","updated":"2024-09-27T11:42:15Z","published":"2024-09-27T11:42:15Z","title":"Impact of number of elements on the directivity of planar array of\n monopole antenna","summary":" This research investigates how the number of elements affects the monopole\nantenna's planar array's directivity. This study also takes into account the\nantenna's effect on the whole field it radiates. The monopole antennas are\narranged in a planar configuration with all the components in their proper\nlocations using the Hadamard matrix approach. Each matrix's directivities and\narray factors were calculated, and a MATLAB tool was used to simulate the\nradiation pattern. A range of elements from 4 X 4 to 50 X 50 planar layouts\nwere taken into consideration during the investigation. Increasing the number\nof elements improves the directivity. Increasing the number of elements in the\nplanar array resulted in a great improvement in directivity, as seen by the\ncomputed and simulated results. Consequently, by increasing the antenna's\ndirectivity, a greater number of elements influences the overall field emitted.\n","authors":["S. E. Akpo","O. U. Omini","G. A. Tawo"],"pdf_url":"https://arxiv.org/pdf/2409.18657v1.pdf","comment":"8 pages, 19 Figures, article"},{"id":"http://arxiv.org/abs/2409.18641v1","updated":"2024-09-27T11:19:41Z","published":"2024-09-27T11:19:41Z","title":"Pseudo-kinematic trajectory control of tracked vehicles","summary":" Tracked vehicles are used in complex scenarios, where motion planning and\nnavigation can be very complex. They have complex dynamics, with many\nparameters that are difficult to identify and that change significantly based\non the operating conditions. We propose a simple pseudo-kinematic model, where\nthe intricate dynamic effects underlying the vehicle's motion are captured in a\nsmall set of velocity-dependent parameters. This choice enables the development\nof a Lyapunov-based trajectory controller with guaranteed performance and small\ncomputation time. We demonstrate the correctness of our approach with both\nsimulation and experimental data.\n","authors":["Michele Focchi","Daniele Fontanelli","Luigi Palopoli"],"pdf_url":"https://arxiv.org/pdf/2409.18641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18589v1","updated":"2024-09-27T09:47:10Z","published":"2024-09-27T09:47:10Z","title":"Towards Event-Triggered NMPC for Efficient 6G Communications:\n Experimental Results and Open Problems","summary":" Networked control systems enable real-time control and coordination of\ndistributed systems, leveraging the low latency, high reliability, and massive\nconnectivity offered by 5G and future 6G networks. Applications include\nautonomous vehicles, robotics, industrial automation, and smart grids. Despite\nnetworked control algorithms admitting nominal stability guarantees even in the\npresence of delays and packet dropouts, their practical performance still\nheavily depends on the specific characteristics and conditions of the\nunderlying network. To achieve the desired performance while efficiently using\ncommunication resources, co-design of control and communication is pivotal.\nAlthough periodic schemes, where communication instances are fixed, can provide\nreliable control performance, unnecessary transmissions, when updates are not\nneeded, result in inefficient usage of network resources. In this paper, we\ninvestigate the potential for co-design of model predictive control and network\ncommunication. To this end, we design and implement an event-triggered\nnonlinear model predictive controller for stabilizing a Furuta pendulum\ncommunicating over a tailored open radio access network 6G research platform.\nWe analyze the control performance as well as network utilization under varying\nchannel conditions and event-triggering criteria. Our results show that the\nevent-triggered control scheme achieves similar performance to periodic control\nwith reduced communication demand.\n","authors":["Jens Püttschneider","Julian Golembiewski","Niklas A. Wagner","Christian Wietfeld","Timm Faulwasser"],"pdf_url":"https://arxiv.org/pdf/2409.18589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18586v1","updated":"2024-09-27T09:45:21Z","published":"2024-09-27T09:45:21Z","title":"Analysis of Truncated Singular Value Decomposition for Koopman\n Operator-Based Lane Change Model","summary":" Understanding and modeling complex dynamic systems is crucial for enhancing\nvehicle performance and safety, especially in the context of autonomous\ndriving. Recently, popular methods such as Koopman operators and their\napproximators, known as Extended Dynamic Mode Decomposition (EDMD), have\nemerged for their effectiveness in transforming strongly nonlinear system\nbehavior into linear representations. This allows them to be integrated with\nconventional linear controllers. To achieve this, Singular Value Decomposition\n(SVD), specifically truncated SVD, is employed to approximate Koopman operators\nfrom extensive datasets efficiently. This study evaluates different basis\nfunctions used in EDMD and ranks for truncated SVD for representing lane change\nbehavior models, aiming to balance computational efficiency with information\nloss. The findings, however, suggest that the technique of truncated SVD does\nnot necessarily achieve substantial reductions in computational training time\nand results in significant information loss.\n","authors":["Chinnawut Nantabut"],"pdf_url":"https://arxiv.org/pdf/2409.18586v1.pdf","comment":"Submitted to the 21st International Conference on Informatics in\n Control, Automation and Robotics (ICINCO 2024)"},{"id":"http://arxiv.org/abs/2409.18585v1","updated":"2024-09-27T09:43:33Z","published":"2024-09-27T09:43:33Z","title":"Unscented Transform-based Pure Pursuit Path-Tracking Algorithm under\n Uncertainty","summary":" Automated driving has become more and more popular due to its potential to\neliminate road accidents by taking over driving tasks from humans. One of the\nremaining challenges is to follow a planned path autonomously, especially when\nuncertainties in self-localizing or understanding the surroundings can\ninfluence the decisions made by autonomous vehicles, such as calculating how\nmuch they need to steer to minimize tracking errors. In this paper, a modified\ngeometric pure pursuit path-tracking algorithm is proposed, taking into\nconsideration such uncertainties using the unscented transform. The algorithm\nis tested through simulations for typical road geometries, such as straight and\ncircular lines.\n","authors":["Chinnawut Nantabut"],"pdf_url":"https://arxiv.org/pdf/2409.18585v1.pdf","comment":"Submitted to the 21st International Conference on Informatics in\n Control, Automation and Robotics (ICINCO 2024)"},{"id":"http://arxiv.org/abs/2409.18549v1","updated":"2024-09-27T08:37:02Z","published":"2024-09-27T08:37:02Z","title":"CaΣoS: A nonlinear sum-of-squares optimization suite","summary":" We present Ca{\\Sigma}oS, the first MATLAB software specifically designed for\nnonlinear sum-of-squares optimization. A symbolic polynomial algebra system\nallows to formulate parametrized sum-of-squares optimization problems and\nfacilitates their fast, repeated evaluations. To that extent, we make use of\nCasADi's symbolic framework and realize concepts of monomial sparsity, linear\noperators (including duals), and functions between polynomials. Ca{\\Sigma}oS\ncurrently provides interfaces to the conic solvers SeDuMi, Mosek, and SCS as\nwell as methods to solve quasiconvex optimization problems (via bisection) and\nnonconvex optimization problems (via sequential convexification). Numerical\nexamples for benchmark problems including region-of-attraction and reachable\nset estimation for nonlinear dynamic systems demonstrate significant\nimprovements in computation time compared to existing toolboxes.. Ca{\\Sigma}oS\nis available open-source at https://github.com/ ifr-acso/casos.\n","authors":["Torbjørn Cunis","Jan Olucak"],"pdf_url":"https://arxiv.org/pdf/2409.18549v1.pdf","comment":"Submitted to 2025 American Control Conference"},{"id":"http://arxiv.org/abs/2404.16712v2","updated":"2024-09-27T08:28:35Z","published":"2024-04-25T16:22:12Z","title":"Distributed Model Predictive Control for Piecewise Affine Systems Based\n on Switching ADMM","summary":" This paper presents a novel approach for distributed model predictive control\n(MPC) for piecewise affine (PWA) systems. Existing approaches rely on solving\nmixed-integer optimization problems, requiring significant computation power or\ntime. We propose a distributed MPC scheme that requires solving only convex\noptimization problems. The key contribution is a novel method, based on the\nalternating direction method of multipliers, for solving the non-convex optimal\ncontrol problem that arises due to the PWA dynamics. We present a distributed\nMPC scheme, leveraging this method, that explicitly accounts for the coupling\nbetween subsystems by reaching agreement on the values of coupled states.\nStability and recursive feasibility are shown under additional assumptions on\nthe underlying system. Two numerical examples are provided, in which the\nproposed controller is shown to significantly improve the CPU time and\nclosed-loop performance over existing state-of-the-art approaches.\n","authors":["Samuel Mallick","Azita Dabiri","Bart De Schutter"],"pdf_url":"https://arxiv.org/pdf/2404.16712v2.pdf","comment":"15 pages, 9 figures, submitted to IEEE Transactions on Automatic\n Control, code available at\n https://github.com/SamuelMallick/stable-dmpc-pwa/tree/paper_2024 and\n https://github.com/SamuelMallick/hybrid-vehicle-platoon/tree/paper-2024"},{"id":"http://arxiv.org/abs/2409.18524v1","updated":"2024-09-27T08:05:56Z","published":"2024-09-27T08:05:56Z","title":"Adaptive Knowledge-based Multi-Objective Evolutionary Algorithm for\n Hybrid Flow Shop Scheduling Problems with Multiple Parallel Batch Processing\n Stages","summary":" Parallel batch processing machines have extensive applications in the\nsemiconductor manufacturing process. However, the problem models in previous\nstudies regard parallel batch processing as a fixed processing stage in the\nmachining process. This study generalizes the problem model, in which users can\narbitrarily set certain stages as parallel batch processing stages according to\ntheir needs. A Hybrid Flow Shop Scheduling Problem with Parallel Batch\nProcessing Machines (PBHFSP) is solved in this paper. Furthermore, an Adaptive\nKnowledge-based Multi-Objective Evolutionary Algorithm (AMOEA/D) is designed to\nsimultaneously optimize both makespan and Total Energy Consumption (TEC).\nFirstly, a hybrid initialization strategy with heuristic rules based on\nknowledge of PBHFSP is proposed to generate promising solutions. Secondly, the\ndisjunctive graph model has been established based on the knowledge to find the\ncritical-path of PBHFS. Then, a critical-path based neighborhood search is\nproposed to enhance the exploitation ability of AMOEA/D. Moreover, the search\ntime is adaptively adjusted based on learning experience from Q-learning and\nDecay Law. Afterward, to enhance the exploration capability of the algorithm,\nAMOEA/D designs an improved population updating strategy with a weight vector\nupdating strategy. These strategies rematch individuals with weight vectors,\nthereby maintaining the diversity of the population. Finally, the proposed\nalgorithm is compared with state-of-the-art algorithms. The experimental\nresults show that the AMOEA/D is superior to the comparison algorithms in\nsolving the PBHFSP.\n","authors":["Feige Liu","Xin Li","Chao Lu","Wenying Gong"],"pdf_url":"https://arxiv.org/pdf/2409.18524v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2405.00871v2","updated":"2024-09-27T07:18:39Z","published":"2024-05-01T21:11:29Z","title":"Learning to Boost the Performance of Stable Nonlinear Systems","summary":" The growing scale and complexity of safety-critical control systems\nunderscore the need to evolve current control architectures aiming for the\nunparalleled performances achievable through state-of-the-art optimization and\nmachine learning algorithms. However, maintaining closed-loop stability while\nboosting the performance of nonlinear control systems using data-driven and\ndeep-learning approaches stands as an important unsolved challenge. In this\npaper, we tackle the performance-boosting problem with closed-loop stability\nguarantees. Specifically, we establish a synergy between the Internal Model\nControl (IMC) principle for nonlinear systems and state-of-the-art\nunconstrained optimization approaches for learning stable dynamics. Our methods\nenable learning over arbitrarily deep neural network classes of\nperformance-boosting controllers for stable nonlinear systems; crucially, we\nguarantee L_p closed-loop stability even if optimization is halted prematurely,\nand even when the ground-truth dynamics are unknown, with vanishing\nconservatism in the class of stabilizing policies as the model uncertainty is\nreduced to zero. We discuss the implementation details of the proposed control\nschemes, including distributed ones, along with the corresponding optimization\nprocedures, demonstrating the potential of freely shaping the cost functions\nthrough several numerical experiments.\n","authors":["Luca Furieri","Clara Lucía Galimberti","Giancarlo Ferrari-Trecate"],"pdf_url":"https://arxiv.org/pdf/2405.00871v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15256v4","updated":"2024-09-27T07:16:23Z","published":"2024-04-23T17:42:45Z","title":"TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and\n Proprioception Estimation","summary":" Legged navigation is typically examined within open-world, off-road, and\nchallenging environments. In these scenarios, estimating external disturbances\nrequires a complex synthesis of multi-modal information. This underlines a\nmajor limitation in existing works that primarily focus on avoiding obstacles.\nIn this work, we propose TOP-Nav, a novel legged navigation framework that\nintegrates a comprehensive path planner with Terrain awareness, Obstacle\navoidance and close-loop Proprioception. TOP-Nav underscores the synergies\nbetween vision and proprioception in both path and motion planning. Within the\npath planner, we present and integrate a terrain estimator that enables the\nrobot to select waypoints on terrains with higher traversability while\neffectively avoiding obstacles. In the motion planning level, we not only\nimplement a locomotion controller to track the navigation commands, but also\nconstruct a proprioception advisor to provide motion evaluations for the path\nplanner. Based on the close-loop motion feedback, we make online corrections\nfor the vision-based terrain and obstacle estimations. Consequently, TOP-Nav\nachieves open-world navigation that the robot can handle terrains or\ndisturbances beyond the distribution of prior knowledge and overcomes\nconstraints imposed by visual conditions. Building upon extensive experiments\nconducted in both simulation and real-world environments, TOP-Nav demonstrates\nsuperior performance in open-world navigation compared to existing methods.\n","authors":["Junli Ren","Yikai Liu","Yingru Dai","Junfeng Long","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.15256v4.pdf","comment":"Published on CoRL 2024"},{"id":"http://arxiv.org/abs/2408.07841v2","updated":"2024-09-27T07:02:12Z","published":"2024-08-14T22:43:52Z","title":"SustainDC -- Benchmarking for Sustainable Data Center Control","summary":" Machine learning has driven an exponential increase in computational demand,\nleading to massive data centers that consume significant amounts of energy and\ncontribute to climate change. This makes sustainable data center control a\npriority. In this paper, we introduce SustainDC, a set of Python environments\nfor benchmarking multi-agent reinforcement learning (MARL) algorithms for data\ncenters (DC). SustainDC supports custom DC configurations and tasks such as\nworkload scheduling, cooling optimization, and auxiliary battery management,\nwith multiple agents managing these operations while accounting for the effects\nof each other. We evaluate various MARL algorithms on SustainDC, showing their\nperformance across diverse DC designs, locations, weather conditions, grid\ncarbon intensity, and workload requirements. Our results highlight significant\nopportunities for improvement of data center operations using MARL algorithms.\nGiven the increasing use of DC due to AI, SustainDC provides a crucial platform\nfor the development and benchmarking of advanced algorithms essential for\nachieving sustainable computing and addressing other heterogeneous real-world\nchallenges.\n","authors":["Avisek Naug","Antonio Guillen","Ricardo Luna","Vineet Gundecha","Desik Rengarajan","Sahand Ghorbanpour","Sajad Mousavi","Ashwin Ramesh Babu","Dejan Markovikj","Lekhapriya D Kashyap","Soumyendu Sarkar"],"pdf_url":"https://arxiv.org/pdf/2408.07841v2.pdf","comment":"Under review at Advances in Neural Information Processing Systems\n 2024 (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2311.13824v2","updated":"2024-09-27T06:13:36Z","published":"2023-11-23T06:36:57Z","title":"Constraint-Guided Online Data Selection for Scalable Data-Driven Safety\n Filters in Uncertain Robotic Systems","summary":" As the use of autonomous robots expands in tasks that are complex and\nchallenging to model, the demand for robust data-driven control methods that\ncan certify safety and stability in uncertain conditions is increasing.\nHowever, the practical implementation of these methods often faces scalability\nissues due to the growing amount of data points with system complexity, and a\nsignificant reliance on high-quality training data. In response to these\nchallenges, this study presents a scalable data-driven controller that\nefficiently identifies and infers from the most informative data points for\nimplementing data-driven safety filters. Our approach is grounded in the\nintegration of a model-based certificate function-based method and Gaussian\nProcess (GP) regression, reinforced by a novel online data selection algorithm\nthat reduces time complexity from quadratic to linear relative to dataset size.\nEmpirical evidence, gathered from successful real-world cart-pole swing-up\nexperiments and simulated locomotion of a five-link bipedal robot, demonstrates\nthe efficacy of our approach. Our findings reveal that our efficient online\ndata selection algorithm, which strategically selects key data points, enhances\nthe practicality and efficiency of data-driven certifying filters in complex\nrobotic systems, significantly mitigating scalability concerns inherent in\nnonparametric learning-based control methods.\n","authors":["Jason J. Choi","Fernando Castañeda","Wonsuhk Jung","Bike Zhang","Claire J. Tomlin","Koushil Sreenath"],"pdf_url":"https://arxiv.org/pdf/2311.13824v2.pdf","comment":"The first three authors contributed equally to the work. This work\n has been submitted to the IEEE for possible publication. Copyright may be\n transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2409.18382v1","updated":"2024-09-27T01:48:16Z","published":"2024-09-27T01:48:16Z","title":"CurricuLLM: Automatic Task Curricula Design for Learning Complex Robot\n Skills using Large Language Models","summary":" Curriculum learning is a training mechanism in reinforcement learning (RL)\nthat facilitates the achievement of complex policies by progressively\nincreasing the task difficulty during training. However, designing effective\ncurricula for a specific task often requires extensive domain knowledge and\nhuman intervention, which limits its applicability across various domains. Our\ncore idea is that large language models (LLMs), with their extensive training\non diverse language data and ability to encapsulate world knowledge, present\nsignificant potential for efficiently breaking down tasks and decomposing\nskills across various robotics environments. Additionally, the demonstrated\nsuccess of LLMs in translating natural language into executable code for RL\nagents strengthens their role in generating task curricula. In this work, we\npropose CurricuLLM, which leverages the high-level planning and programming\ncapabilities of LLMs for curriculum design, thereby enhancing the efficient\nlearning of complex target tasks. CurricuLLM consists of: (Step 1) Generating\nsequence of subtasks that aid target task learning in natural language form,\n(Step 2) Translating natural language description of subtasks in executable\ntask code, including the reward code and goal distribution code, and (Step 3)\nEvaluating trained policies based on trajectory rollout and subtask\ndescription. We evaluate CurricuLLM in various robotics simulation\nenvironments, ranging from manipulation, navigation, and locomotion, to show\nthat CurricuLLM can aid learning complex robot control tasks. In addition, we\nvalidate humanoid locomotion policy learned through CurricuLLM in real-world.\nThe code is provided in https://github.com/labicon/CurricuLLM\n","authors":["Kanghyun Ryu","Qiayuan Liao","Zhongyu Li","Koushil Sreenath","Negar Mehr"],"pdf_url":"https://arxiv.org/pdf/2409.18382v1.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2404.13298v2","updated":"2024-09-27T01:23:51Z","published":"2024-04-20T07:04:46Z","title":"MARec: Metadata Alignment for cold-start Recommendation","summary":" For many recommender systems, the primary data source is a historical record\nof user clicks. The associated click matrix is often very sparse, as the number\nof users x products can be far larger than the number of clicks. Such sparsity\nis accentuated in cold-start settings, which makes the efficient use of\nmetadata information of paramount importance. In this work, we propose a simple\napproach to address cold-start recommendations by leveraging content metadata,\nMetadata Alignment for cold-start Recommendation. We show that this approach\ncan readily augment existing matrix factorization and autoencoder approaches,\nenabling a smooth transition to top performing algorithms in warmer set-ups.\nOur experimental results indicate three separate contributions: first, we show\nthat our proposed framework largely beats SOTA results on 4 cold-start datasets\nwith different sparsity and scale characteristics, with gains ranging from\n+8.4% to +53.8% on reported ranking metrics; second, we provide an ablation\nstudy on the utility of semantic features, and proves the additional gain\nobtained by leveraging such features ranges between +46.8% and +105.5%; and\nthird, our approach is by construction highly competitive in warm set-ups, and\nwe propose a closed-form solution outperformed by SOTA results by only 0.8% on\naverage.\n","authors":["Julien Monteil","Volodymyr Vaskovych","Wentao Lu","Anirban Majumder","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2404.13298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01193v2","updated":"2024-09-27T01:23:02Z","published":"2024-08-02T11:22:57Z","title":"On Game Based Distributed Decision Approach for Multi-agent Optimal\n Coverage Problem with Application to Constellations Reconfiguration","summary":" This paper focuses on the optimal coverage problem (OCP) for multi-agent\nsystems with decentralized optimization. A game based distributed decision\napproach for the the multi-agent OCP is proposed. The equivalence between the\nequilibrium of the game and the extreme value of the global performance\nobjective is strictly proved. Then, a distributed algorithm only using local\ninformation to obtain the global near-optimal coverage is developed, and its\nconvergence is proved. Finally, the proposed method is applied to maximize the\ncovering time of a satellite constellation for a target. The simulation results\nunder different scenarios show our method costs much less computation time\nunder some level index than traditional centralized optimization.\n","authors":["Zixin Feng","Wenchao Xue","Yifen Mu","Ming Wei","Bin Meng","Wei Cui"],"pdf_url":"https://arxiv.org/pdf/2408.01193v2.pdf","comment":"11 pages,11 figures"},{"id":"http://arxiv.org/abs/2407.00289v2","updated":"2024-09-27T01:15:18Z","published":"2024-06-29T02:58:44Z","title":"Personalised Outfit Recommendation via History-aware Transformers","summary":" We present the history-aware transformer (HAT), a transformer-based model\nthat uses shoppers' purchase history to personalise outfit predictions. The aim\nof this work is to recommend outfits that are internally coherent while\nmatching an individual shopper's style and taste. To achieve this, we stack two\ntransformer models, one that produces outfit representations and another one\nthat processes the history of purchased outfits for a given shopper. We use\nthese models to score an outfit's compatibility in the context of a shopper's\npreferences as inferred from their previous purchases. During training, the\nmodel learns to discriminate between purchased and random outfits using 3\nlosses: the focal loss for outfit compatibility typically used in the\nliterature, a contrastive loss to bring closer learned outfit embeddings from a\nshopper's history, and an adaptive margin loss to facilitate learning from weak\nnegatives. Together, these losses enable the model to make personalised\nrecommendations based on a shopper's purchase history.\n Our experiments on the IQON3000 and Polyvore datasets show that HAT\noutperforms strong baselines on the outfit Compatibility Prediction (CP) and\nthe Fill In The Blank (FITB) tasks. The model improves AUC for the CP hard task\nby 15.7% (IQON3000) and 19.4% (Polyvore) compared to previous SOTA results. It\nfurther improves accuracy on the FITB hard task by 6.5% and 9.7%, respectively.\nWe provide ablation studies on the personalisation, constrastive loss, and\nadaptive margin loss that highlight the importance of these modelling choices.\n","authors":["Myong Chol Jung","Julien Monteil","Philip Schulz","Volodymyr Vaskovych"],"pdf_url":"https://arxiv.org/pdf/2407.00289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18361v1","updated":"2024-09-27T00:35:21Z","published":"2024-09-27T00:35:21Z","title":"iWalker: Imperative Visual Planning for Walking Humanoid Robot","summary":" Humanoid robots, with the potential to perform a broad range of tasks in\nenvironments designed for humans, have been deemed crucial for the basis of\ngeneral AI agents. When talking about planning and controlling, although\ntraditional models and task-specific methods have been extensively studied over\nthe past few decades, they are inadequate for achieving the flexibility and\nversatility needed for general autonomy. Learning approaches, especially\nreinforcement learning, are powerful and popular nowadays, but they are\ninherently \"blind\" during training, relying heavily on trials in simulation\nwithout proper guidance from physical principles or underlying dynamics. In\nresponse, we propose a novel end-to-end pipeline that seamlessly integrates\nperception, planning, and model-based control for humanoid robot walking. We\nrefer to our method as iWalker, which is driven by imperative learning (IL), a\nself-supervising neuro-symbolic learning framework. This enables the robot to\nlearn from arbitrary unlabeled data, significantly improving its adaptability\nand generalization capabilities. In experiments, iWalker demonstrates\neffectiveness in both simulated and real-world environments, representing a\nsignificant advancement toward versatile and autonomous humanoid robots.\n","authors":["Xiao Lin","Yuhao Huang","Taimeng Fu","Xiaobin Xiong","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18361v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.17985v2","updated":"2024-09-27T02:08:56Z","published":"2024-09-26T15:55:59Z","title":"Hypergame Theory for Decentralized Resource Allocation in Multi-user\n Semantic Communications","summary":" Semantic communications (SC) is an emerging communication paradigm in which\nwireless devices can send only relevant information from a source of data while\nrelying on computing resources to regenerate missing data points. However, the\ndesign of a multi-user SC system becomes more challenging because of the\ncomputing and communication overhead required for coordination. Existing\nsolutions for learning the semantic language and performing resource allocation\noften fail to capture the computing and communication tradeoffs involved in\nmultiuser SC. To address this gap, a novel framework for decentralized\ncomputing and communication resource allocation in multiuser SC systems is\nproposed. The challenge of efficiently allocating communication and computing\nresources (for reasoning) in a decentralized manner to maximize the quality of\ntask experience for the end users is addressed through the application of\nStackelberg hyper game theory. Leveraging the concept of second-level hyper\ngames, novel analytical formulations are developed to model misperceptions of\nthe users about each other's communication and control strategies. Further,\nequilibrium analysis of the learned resource allocation protocols examines the\nconvergence of the computing and communication strategies to a local\nStackelberg equilibria, considering misperceptions. Simulation results show\nthat the proposed Stackelberg hyper game results in efficient usage of\ncommunication and computing resources while maintaining a high quality of\nexperience for the users compared to state-of-the-art that does not account for\nthe misperceptions.\n","authors":["Christo Kurisummoottil Thomas","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2409.17985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17763v2","updated":"2024-09-27T06:50:21Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50% of papers do not\nassess performance variability at all. Moreover, only one (0.5%) paper reported\nconfidence intervals (CIs) for model performance. (2) To address the reporting\nbottleneck, we show that the unreported standard deviation (SD) in segmentation\npapers can be approximated by a second-order polynomial function of the mean\nDice similarity coefficient (DSC). Based on external validation data from 56\nprevious MICCAI challenges, we demonstrate that this approximation can\naccurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v2.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17745v2","updated":"2024-09-27T08:19:29Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v2.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17699v2","updated":"2024-09-27T10:16:37Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17545v2","updated":"2024-09-27T06:48:08Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keep the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v2.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.18964v1","updated":"2024-09-27T17:59:57Z","published":"2024-09-27T17:59:57Z","title":"PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation","summary":" We present PhysGen, a novel image-to-video generation method that converts a\nsingle image and an input condition (e.g., force and torque applied to an\nobject in the image) to produce a realistic, physically plausible, and\ntemporally consistent video. Our key insight is to integrate model-based\nphysical simulation with a data-driven video generation process, enabling\nplausible image-space dynamics. At the heart of our system are three core\ncomponents: (i) an image understanding module that effectively captures the\ngeometry, materials, and physical parameters of the image; (ii) an image-space\ndynamics simulation model that utilizes rigid-body physics and inferred\nparameters to simulate realistic behaviors; and (iii) an image-based rendering\nand refinement module that leverages generative video diffusion to produce\nrealistic video footage featuring the simulated motion. The resulting videos\nare realistic in both physics and appearance and are even precisely\ncontrollable, showcasing superior results over existing data-driven\nimage-to-video generation works through quantitative comparison and\ncomprehensive user study. PhysGen's resulting videos can be used for various\ndownstream applications, such as turning an image into a realistic animation or\nallowing users to interact with the image and create various dynamics. Project\npage: https://stevenlsw.github.io/physgen/\n","authors":["Shaowei Liu","Zhongzheng Ren","Saurabh Gupta","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18964v1.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://stevenlsw.github.io/physgen/"},{"id":"http://arxiv.org/abs/2409.18962v1","updated":"2024-09-27T17:59:50Z","published":"2024-09-27T17:59:50Z","title":"Exploring Token Pruning in Vision State Space Models","summary":" State Space Models (SSMs) have the advantage of keeping linear computational\ncomplexity compared to attention modules in transformers, and have been applied\nto vision tasks as a new type of powerful vision foundation model. Inspired by\nthe observations that the final prediction in vision transformers (ViTs) is\nonly based on a subset of most informative tokens, we take the novel step of\nenhancing the efficiency of SSM-based vision models through token-based\npruning. However, direct applications of existing token pruning techniques\ndesigned for ViTs fail to deliver good performance, even with extensive\nfine-tuning. To address this issue, we revisit the unique computational\ncharacteristics of SSMs and discover that naive application disrupts the\nsequential token positions. This insight motivates us to design a novel and\ngeneral token pruning method specifically for SSM-based vision models. We first\nintroduce a pruning-aware hidden state alignment method to stabilize the\nneighborhood of remaining tokens for performance enhancement. Besides, based on\nour detailed analysis, we propose a token importance evaluation method adapted\nfor SSM models, to guide the token pruning. With efficient implementation and\npractical acceleration methods, our method brings actual speedup. Extensive\nexperiments demonstrate that our approach can achieve significant computation\nreduction with minimal impact on performance across different tasks. Notably,\nwe achieve 81.7\\% accuracy on ImageNet with a 41.6\\% reduction in the FLOPs for\npruned PlainMamba-L3. Furthermore, our work provides deeper insights into\nunderstanding the behavior of SSM-based vision models for future research.\n","authors":["Zheng Zhan","Zhenglun Kong","Yifan Gong","Yushu Wu","Zichong Meng","Hangyu Zheng","Xuan Shen","Stratis Ioannidis","Wei Niu","Pu Zhao","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18962v1.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2409.18959v1","updated":"2024-09-27T17:59:10Z","published":"2024-09-27T17:59:10Z","title":"$O(d/T)$ Convergence Theory for Diffusion Probabilistic Models under\n Minimal Assumptions","summary":" Score-based diffusion models, which generate new data by learning to reverse\na diffusion process that perturbs data from the target distribution into noise,\nhave achieved remarkable success across various generative tasks. Despite their\nsuperior empirical performance, existing theoretical guarantees are often\nconstrained by stringent assumptions or suboptimal convergence rates. In this\npaper, we establish a fast convergence theory for a popular SDE-based sampler\nunder minimal assumptions. Our analysis shows that, provided\n$\\ell_{2}$-accurate estimates of the score functions, the total variation\ndistance between the target and generated distributions is upper bounded by\n$O(d/T)$ (ignoring logarithmic factors), where $d$ is the data dimensionality\nand $T$ is the number of steps. This result holds for any target distribution\nwith finite first-order moment. To our knowledge, this improves upon existing\nconvergence theory for both the SDE-based sampler and another ODE-based\nsampler, while imposing minimal assumptions on the target data distribution and\nscore estimates. This is achieved through a novel set of analytical tools that\nprovides a fine-grained characterization of how the error propagates at each\nstep of the reverse process.\n","authors":["Gen Li","Yuling Yan"],"pdf_url":"https://arxiv.org/pdf/2409.18959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18957v1","updated":"2024-09-27T17:58:50Z","published":"2024-09-27T17:58:50Z","title":"LML: Language Model Learning a Dataset for Data-Augmented Prediction","summary":" This paper introduces a new approach to using Large Language Models (LLMs)\nfor classification tasks, which are typically handled using Machine Learning\n(ML) models. Unlike ML models that rely heavily on data cleaning and feature\nengineering, this method streamlines the process using LLMs. This paper\nproposes a new concept called \"Language Model Learning (LML)\" powered by a new\nmethod called \"Data-Augmented Prediction (DAP)\". The classification is\nperformed by LLMs using a method similar to humans manually exploring and\nunderstanding the data and deciding classifications using data as a reference.\nTraining data is summarized and evaluated to determine the features that lead\nto the classification of each label the most. In the process of DAP, the system\nuses the data summary to automatically create a query, which is used to\nretrieve relevant rows from the dataset. A classification is generated by the\nLLM using data summary and relevant rows, ensuring satisfactory accuracy even\nwith complex data. Usage of data summary and similar data in DAP ensures\ncontext-aware decision-making. The proposed method uses the words \"Act as an\nExplainable Machine Learning Model\" in the prompt to enhance the\ninterpretability of the predictions by allowing users to review the logic\nbehind each prediction. In some test cases, the system scored an accuracy above\n90%, proving the effectiveness of the system and its potential to outperform\nconventional ML models in various scenarios. The code is available at\nhttps://github.com/Pro-GenAI/LML-DAP\n","authors":["Praneeth Vadlapati"],"pdf_url":"https://arxiv.org/pdf/2409.18957v1.pdf","comment":"First version"},{"id":"http://arxiv.org/abs/2409.18952v1","updated":"2024-09-27T17:52:34Z","published":"2024-09-27T17:52:34Z","title":"RepairBench: Leaderboard of Frontier Models for Program Repair","summary":" AI-driven program repair uses AI models to repair buggy software by producing\npatches. Rapid advancements in AI surely impact state-of-the-art performance of\nprogram repair. Yet, grasping this progress requires frequent and standardized\nevaluations. We propose RepairBench, a novel leaderboard for AI-driven program\nrepair. The key characteristics of RepairBench are: 1) it is execution-based:\nall patches are compiled and executed against a test suite, 2) it assesses\nfrontier models in a frequent and standardized way. RepairBench leverages two\nhigh-quality benchmarks, Defects4J and GitBug-Java, to evaluate frontier models\nagainst real-world program repair tasks. We publicly release the evaluation\nframework of RepairBench. We will update the leaderboard as new frontier models\nare released.\n","authors":["André Silva","Martin Monperrus"],"pdf_url":"https://arxiv.org/pdf/2409.18952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18951v1","updated":"2024-09-27T17:52:08Z","published":"2024-09-27T17:52:08Z","title":"Spectral Wavelet Dropout: Regularization in the Wavelet Domain","summary":" Regularization techniques help prevent overfitting and therefore improve the\nability of convolutional neural networks (CNNs) to generalize. One reason for\noverfitting is the complex co-adaptations among different parts of the network,\nwhich make the CNN dependent on their joint response rather than encouraging\neach part to learn a useful feature representation independently. Frequency\ndomain manipulation is a powerful strategy for modifying data that has temporal\nand spatial coherence by utilizing frequency decomposition. This work\nintroduces Spectral Wavelet Dropout (SWD), a novel regularization method that\nincludes two variants: 1D-SWD and 2D-SWD. These variants improve CNN\ngeneralization by randomly dropping detailed frequency bands in the discrete\nwavelet decomposition of feature maps. Our approach distinguishes itself from\nthe pre-existing Spectral \"Fourier\" Dropout (2D-SFD), which eliminates\ncoefficients in the Fourier domain. Notably, SWD requires only a single\nhyperparameter, unlike the two required by SFD. We also extend the literature\nby implementing a one-dimensional version of Spectral \"Fourier\" Dropout\n(1D-SFD), setting the stage for a comprehensive comparison. Our evaluation\nshows that both 1D and 2D SWD variants have competitive performance on\nCIFAR-10/100 benchmarks relative to both 1D-SFD and 2D-SFD. Specifically,\n1D-SWD has a significantly lower computational complexity compared to\n1D/2D-SFD. In the Pascal VOC Object Detection benchmark, SWD variants surpass\n1D-SFD and 2D-SFD in performance and demonstrate lower computational complexity\nduring training.\n","authors":["Rinor Cakaj","Jens Mehnert","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18951v1.pdf","comment":"Accepted by The International Conference on Machine Learning and\n Applications (ICMLA) 2024"},{"id":"http://arxiv.org/abs/2409.18946v1","updated":"2024-09-27T17:46:05Z","published":"2024-09-27T17:46:05Z","title":"Unconditional stability of a recurrent neural circuit implementing\n divisive normalization","summary":" Stability in recurrent neural models poses a significant challenge,\nparticularly in developing biologically plausible neurodynamical models that\ncan be seamlessly trained. Traditional cortical circuit models are notoriously\ndifficult to train due to expansive nonlinearities in the dynamical system,\nleading to an optimization problem with nonlinear stability constraints that\nare difficult to impose. Conversely, recurrent neural networks (RNNs) excel in\ntasks involving sequential data but lack biological plausibility and\ninterpretability. In this work, we address these challenges by linking dynamic\ndivisive normalization (DN) to the stability of ORGaNICs, a biologically\nplausible recurrent cortical circuit model that dynamically achieves DN and has\nbeen shown to simulate a wide range of neurophysiological phenomena. By using\nthe indirect method of Lyapunov, we prove the remarkable property of\nunconditional local stability for an arbitrary-dimensional ORGaNICs circuit\nwhen the recurrent weight matrix is the identity. We thus connect ORGaNICs to a\nsystem of coupled damped harmonic oscillators, which enables us to derive the\ncircuit's energy function, providing a normative principle of what the circuit,\nand individual neurons, aim to accomplish. Further, for a generic recurrent\nweight matrix, we prove the stability of the 2D model and demonstrate\nempirically that stability holds in higher dimensions. Finally, we show that\nORGaNICs can be trained by backpropagation through time without gradient\nclipping/scaling, thanks to its intrinsic stability property and adaptive time\nconstants, which address the problems of exploding, vanishing, and oscillating\ngradients. By evaluating the model's performance on RNN benchmarks, we find\nthat ORGaNICs outperform alternative neurodynamical models on static image\nclassification tasks and perform comparably to LSTMs on sequential tasks.\n","authors":["Shivang Rawat","David J. Heeger","Stefano Martiniani"],"pdf_url":"https://arxiv.org/pdf/2409.18946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04284v3","updated":"2024-09-27T17:29:24Z","published":"2022-08-08T17:24:04Z","title":"On Rademacher Complexity-based Generalization Bounds for Deep Learning","summary":" We show that the Rademacher complexity-based approach can generate\nnon-vacuous generalisation bounds on Convolutional Neural Networks (CNNs) for\nclassifying a small number of classes of images. The development of new\nTalagrand's contraction lemmas for high-dimensional mappings between function\nspaces and CNNs for general Lipschitz activation functions is a key technical\ncontribution. Our results show that the Rademacher complexity does not depend\non the network length for CNNs with some special types of activation functions\nsuch as ReLU, Leaky ReLU, Parametric Rectifier Linear Unit, Sigmoid, and Tanh.\n","authors":["Lan V. Truong"],"pdf_url":"https://arxiv.org/pdf/2208.04284v3.pdf","comment":"Extra experiments provided"},{"id":"http://arxiv.org/abs/2403.16877v2","updated":"2024-09-27T17:14:26Z","published":"2024-03-25T15:42:09Z","title":"Proprioception Is All You Need: Terrain Classification for Boreal\n Forests","summary":" Recent works in field robotics highlighted the importance of resiliency\nagainst different types of terrains. Boreal forests, in particular, are home to\nmany mobility-impeding terrains that should be considered for off-road\nautonomous navigation. Also, being one of the largest land biomes on Earth,\nboreal forests are an area where autonomous vehicles are expected to become\nincreasingly common. In this paper, we address this issue by introducing\nBorealTC, a publicly available dataset for proprioceptive-based terrain\nclassification (TC). Recorded with a Husky A200, our dataset contains 116 min\nof Inertial Measurement Unit (IMU), motor current, and wheel odometry data,\nfocusing on typical boreal forest terrains, notably snow, ice, and silty loam.\nCombining our dataset with another dataset from the state-of-the-art, we\nevaluate both a Convolutional Neural Network (CNN) and the novel state space\nmodel (SSM)-based Mamba architecture on a TC task. Interestingly, we show that\nwhile CNN outperforms Mamba on each separate dataset, Mamba achieves greater\naccuracy when trained on a combination of both. In addition, we demonstrate\nthat Mamba's learning capacity is greater than a CNN for increasing amounts of\ndata. We show that the combination of two TC datasets yields a latent space\nthat can be interpreted with the properties of the terrains. We also discuss\nthe implications of merging datasets on classification. Our source code and\ndataset are publicly available online:\nhttps://github.com/norlab-ulaval/BorealTC.\n","authors":["Damien LaRocque","William Guimont-Martin","David-Alexandre Duclos","Philippe Giguère","François Pomerleau"],"pdf_url":"https://arxiv.org/pdf/2403.16877v2.pdf","comment":"Accepted to the 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2409.18915v1","updated":"2024-09-27T17:00:32Z","published":"2024-09-27T17:00:32Z","title":"A-FedPD: Aligning Dual-Drift is All Federated Primal-Dual Learning Needs","summary":" As a popular paradigm for juggling data privacy and collaborative training,\nfederated learning (FL) is flourishing to distributively process the large\nscale of heterogeneous datasets on edged clients. Due to bandwidth limitations\nand security considerations, it ingeniously splits the original problem into\nmultiple subproblems to be solved in parallel, which empowers primal dual\nsolutions to great application values in FL. In this paper, we review the\nrecent development of classical federated primal dual methods and point out a\nserious common defect of such methods in non-convex scenarios, which we say is\na \"dual drift\" caused by dual hysteresis of those longstanding inactive clients\nunder partial participation training. To further address this problem, we\npropose a novel Aligned Federated Primal Dual (A-FedPD) method, which\nconstructs virtual dual updates to align global consensus and local dual\nvariables for those protracted unparticipated local clients. Meanwhile, we\nprovide a comprehensive analysis of the optimization and generalization\nefficiency for the A-FedPD method on smooth non-convex objectives, which\nconfirms its high efficiency and practicality. Extensive experiments are\nconducted on several classical FL setups to validate the effectiveness of our\nproposed method.\n","authors":["Yan Sun","Li Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2409.18915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09299v2","updated":"2024-09-27T16:54:33Z","published":"2024-02-14T16:41:35Z","title":"Trained Without My Consent: Detecting Code Inclusion In Language Models\n Trained on Code","summary":" Code auditing ensures that the developed code adheres to standards,\nregulations, and copyright protection by verifying that it does not contain\ncode from protected sources. The recent advent of Large Language Models (LLMs)\nas coding assistants in the software development process poses new challenges\nfor code auditing. The dataset for training these models is mainly collected\nfrom publicly available sources. This raises the issue of intellectual property\ninfringement as developers' codes are already included in the dataset.\nTherefore, auditing code developed using LLMs is challenging, as it is\ndifficult to reliably assert if an LLM used during development has been trained\non specific copyrighted codes, given that we do not have access to the training\ndatasets of these models. Given the non-disclosure of the training datasets,\ntraditional approaches such as code clone detection are insufficient for\nasserting copyright infringement. To address this challenge, we propose a new\napproach, TraWiC; a model-agnostic and interpretable method based on membership\ninference for detecting code inclusion in an LLM's training dataset. We extract\nsyntactic and semantic identifiers unique to each program to train a classifier\nfor detecting code inclusion. In our experiments, we observe that TraWiC is\ncapable of detecting 83.87% of codes that were used to train an LLM. In\ncomparison, the prevalent clone detection tool NiCad is only capable of\ndetecting 47.64%. In addition to its remarkable performance, TraWiC has low\nresource overhead in contrast to pair-wise clone detection that is conducted\nduring the auditing process of tools like CodeWhisperer reference tracker,\nacross thousands of code snippets.\n","authors":["Vahid Majdinasab","Amin Nikanjam","Foutse Khomh"],"pdf_url":"https://arxiv.org/pdf/2402.09299v2.pdf","comment":"Accepted for publication in TOSEM (ACM Transactions on Software\n Engineering and Methodology)"},{"id":"http://arxiv.org/abs/2409.18909v1","updated":"2024-09-27T16:46:02Z","published":"2024-09-27T16:46:02Z","title":"Best Arm Identification with Minimal Regret","summary":" Motivated by real-world applications that necessitate responsible\nexperimentation, we introduce the problem of best arm identification (BAI) with\nminimal regret. This innovative variant of the multi-armed bandit problem\nelegantly amalgamates two of its most ubiquitous objectives: regret\nminimization and BAI. More precisely, the agent's goal is to identify the best\narm with a prescribed confidence level $\\delta$, while minimizing the\ncumulative regret up to the stopping time. Focusing on single-parameter\nexponential families of distributions, we leverage information-theoretic\ntechniques to establish an instance-dependent lower bound on the expected\ncumulative regret. Moreover, we present an intriguing impossibility result that\nunderscores the tension between cumulative regret and sample complexity in\nfixed-confidence BAI. Complementarily, we design and analyze the Double KL-UCB\nalgorithm, which achieves asymptotic optimality as the confidence level tends\nto zero. Notably, this algorithm employs two distinct confidence bounds to\nguide arm selection in a randomized manner. Our findings elucidate a fresh\nperspective on the inherent connections between regret minimization and BAI.\n","authors":["Junwen Yang","Vincent Y. F. Tan","Tianyuan Jin"],"pdf_url":"https://arxiv.org/pdf/2409.18909v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.18907v1","updated":"2024-09-27T16:45:35Z","published":"2024-09-27T16:45:35Z","title":"In-depth Analysis of Privacy Threats in Federated Learning for Medical\n Data","summary":" Federated learning is emerging as a promising machine learning technique in\nthe medical field for analyzing medical images, as it is considered an\neffective method to safeguard sensitive patient data and comply with privacy\nregulations. However, recent studies have revealed that the default settings of\nfederated learning may inadvertently expose private training data to privacy\nattacks. Thus, the intensity of such privacy risks and potential mitigation\nstrategies in the medical domain remain unclear. In this paper, we make three\noriginal contributions to privacy risk analysis and mitigation in federated\nlearning for medical data. First, we propose a holistic framework, MedPFL, for\nanalyzing privacy risks in processing medical data in the federated learning\nenvironment and developing effective mitigation strategies for protecting\nprivacy. Second, through our empirical analysis, we demonstrate the severe\nprivacy risks in federated learning to process medical images, where\nadversaries can accurately reconstruct private medical images by performing\nprivacy attacks. Third, we illustrate that the prevalent defense mechanism of\nadding random noises may not always be effective in protecting medical images\nagainst privacy attacks in federated learning, which poses unique and pressing\nchallenges related to protecting the privacy of medical data. Furthermore, the\npaper discusses several unique research questions related to the privacy\nprotection of medical data in the federated learning environment. We conduct\nextensive experiments on several benchmark medical image datasets to analyze\nand mitigate the privacy risks associated with federated learning for medical\ndata.\n","authors":["Badhan Chandra Das","M. Hadi Amini","Yanzhao Wu"],"pdf_url":"https://arxiv.org/pdf/2409.18907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18905v1","updated":"2024-09-27T16:44:43Z","published":"2024-09-27T16:44:43Z","title":"Probabilistic Analysis of Least Squares, Orthogonal Projection, and QR\n Factorization Algorithms Subject to Gaussian Noise","summary":" In this paper, we extend the work of Liesen et al. (2002), which analyzes how\nthe condition number of an orthonormal matrix Q changes when a column is added\n([Q, c]), particularly focusing on the perpendicularity of c to the span of Q.\nTheir result, presented in Theorem 2.3 of Liesen et al. (2002), assumes exact\narithmetic and orthonormality of Q, which is a strong assumption when applying\nthese results to numerical methods such as QR factorization algorithms. In our\nwork, we address this gap by deriving bounds on the condition number increase\nfor a matrix B without assuming perfect orthonormality, even when a column is\nnot perfectly orthogonal to the span of B. This framework allows us to analyze\nQR factorization methods where orthogonalization is imperfect and subject to\nGaussian noise. We also provide results on the performance of orthogonal\nprojection and least squares under Gaussian noise, further supporting the\ndevelopment of this theory.\n","authors":["Ali Lotfi","Julien Langou","Mohammad Meysami"],"pdf_url":"https://arxiv.org/pdf/2409.18905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18895v1","updated":"2024-09-27T16:32:57Z","published":"2024-09-27T16:32:57Z","title":"Multi-Source Hard and Soft Information Fusion Approach for Accurate\n Cryptocurrency Price Movement Prediction","summary":" One of the most important challenges in the financial and cryptocurrency\nfield is accurately predicting cryptocurrency price trends. Leveraging\nartificial intelligence (AI) is beneficial in addressing this challenge.\nCryptocurrency markets, marked by substantial growth and volatility, attract\ninvestors and scholars keen on deciphering and forecasting cryptocurrency price\nmovements. The vast and diverse array of data available for such predictions\nincreases the complexity of the task. In our study, we introduce a novel\napproach termed hard and soft information fusion (HSIF) to enhance the accuracy\nof cryptocurrency price movement forecasts. The hard information component of\nour approach encompasses historical price records alongside technical\nindicators. Complementing this, the soft data component extracts from X\n(formerly Twitter), encompassing news headlines and tweets about the\ncryptocurrency. To use this data, we use the Bidirectional Encoder\nRepresentations from Transformers (BERT)-based sentiment analysis method,\nfinancial BERT (FinBERT), which performs best. Finally, our model feeds on the\ninformation set including processed hard and soft data. We employ the\nbidirectional long short-term memory (BiLSTM) model because processing\ninformation in both forward and backward directions can capture long-term\ndependencies in sequential information. Our empirical findings emphasize the\nsuperiority of the HSIF approach over models dependent on single-source data by\ntesting on Bitcoin-related data. By fusing hard and soft information on Bitcoin\ndataset, our model has about 96.8\\% accuracy in predicting price movement.\nIncorporating information enables our model to grasp the influence of social\nsentiment on price fluctuations, thereby supplementing the technical\nanalysis-based predictions derived from hard information.\n","authors":["Saeed Mohammadi Dashtaki","Mehdi Hosseini Chagahi","Behzad Moshiri","Md. Jalil Piran"],"pdf_url":"https://arxiv.org/pdf/2409.18895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18893v1","updated":"2024-09-27T16:31:31Z","published":"2024-09-27T16:31:31Z","title":"HM3: Hierarchical Multi-Objective Model Merging for Pretrained Models","summary":" Model merging is a technique that combines multiple large pretrained models\ninto a single model with enhanced performance and broader task adaptability. It\nhas gained popularity in large pretrained model development due to its ability\nto bypass the need for original training data and further training processes.\nHowever, most existing model merging approaches focus solely on exploring the\nparameter space, merging models with identical architectures. Merging within\nthe architecture space, despite its potential, remains in its early stages due\nto the vast search space and the challenges of layer compatibility. This paper\nmarks a significant advance toward more flexible and comprehensive model\nmerging techniques by modeling the architecture-space merging process as a\nreinforcement learning task. We train policy and value networks using offline\nsampling of weight vectors, which are then employed for the online optimization\nof merging strategies. Moreover, a multi-objective optimization paradigm is\nintroduced to accommodate users' diverse task preferences, learning the Pareto\nfront of optimal models to offer customized merging suggestions. Experimental\nresults across multiple tasks, including text translation, mathematical\nreasoning, and code generation, validate the effectiveness and superiority of\nthe proposed framework in model merging. The code will be made publicly\navailable after the review process.\n","authors":["Yu Zhou","Xingyu Wu","Jibin Wu","Liang Feng","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2409.18893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15657v3","updated":"2024-09-27T16:24:50Z","published":"2024-09-24T01:40:24Z","title":"M$^2$PT: Multimodal Prompt Tuning for Zero-shot Instruction Learning","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable performance\nacross a wide range of domains, with increasing emphasis on enhancing their\nzero-shot generalization capabilities for unseen tasks across various\nmodalities. Instruction tuning has emerged as an effective strategy for\nachieving zero-shot generalization by finetuning pretrained models on diverse\nmultimodal tasks. As the scale of MLLMs continues to grow, parameter-efficient\nfinetuning becomes increasingly critical. However, most existing\nparameter-efficient approaches focus only on single modalities and often\noverlook the multimodal characteristics during finetuning. In this work, we\nintroduce a novel Multimodal Prompt Tuning (M$^2$PT) approach for efficient\ninstruction tuning of MLLMs. M$^2$PT effectively integrates visual and textual\nprompts into the vision encoder and language processor respectively during\nfinetuning, facilitating the extraction and alignment of features across\nmodalities. Empirical results on various multimodal evaluation datasets\ndemonstrate the superior performance of our approach compared to several\nstate-of-the-art baselines. A comprehensive set of ablation studies validates\nthe effectiveness of our prompt design and the efficiency of our approach.\n","authors":["Taowen Wang","Yiyang Liu","James Chenhao Liang","junhan zhao","Yiming Cui","Yuning Mao","Shaoliang Nie","Jiahao Liu","Fuli Feng","Zenglin Xu","Cheng Han","Lifu Huang","Qifan Wang","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2409.15657v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18885v1","updated":"2024-09-27T16:20:51Z","published":"2024-09-27T16:20:51Z","title":"HR-Extreme: A High-Resolution Dataset for Extreme Weather Forecasting","summary":" The application of large deep learning models in weather forecasting has led\nto significant advancements in the field, including higher-resolution\nforecasting and extended prediction periods exemplified by models such as Pangu\nand Fuxi. Despite these successes, previous research has largely been\ncharacterized by the neglect of extreme weather events, and the availability of\ndatasets specifically curated for such events remains limited. Given the\ncritical importance of accurately forecasting extreme weather, this study\nintroduces a comprehensive dataset that incorporates high-resolution extreme\nweather cases derived from the High-Resolution Rapid Refresh (HRRR) data, a\n3-km real-time dataset provided by NOAA. We also evaluate the current\nstate-of-the-art deep learning models and Numerical Weather Prediction (NWP)\nsystems on HR-Extreme, and provide a improved baseline deep learning model\ncalled HR-Heim which has superior performance on both general loss and\nHR-Extreme compared to others. Our results reveal that the errors of extreme\nweather cases are significantly larger than overall forecast error,\nhighlighting them as an crucial source of loss in weather prediction. These\nfindings underscore the necessity for future research to focus on improving the\naccuracy of extreme weather forecasts to enhance their practical utility.\n","authors":["Nian Ran","Peng Xiao","Yue Wang","Wesley Shi","Jianxin Lin","Qi Meng","Richard Allmendinger"],"pdf_url":"https://arxiv.org/pdf/2409.18885v1.pdf","comment":"10 pages, under review"},{"id":"http://arxiv.org/abs/2409.18874v1","updated":"2024-09-27T16:10:11Z","published":"2024-09-27T16:10:11Z","title":"CESNET-TimeSeries24: Time Series Dataset for Network Traffic Anomaly\n Detection and Forecasting","summary":" Anomaly detection in network traffic is crucial for maintaining the security\nof computer networks and identifying malicious activities. One of the primary\napproaches to anomaly detection are methods based on forecasting. Nevertheless,\nextensive real-world network datasets for forecasting and anomaly detection\ntechniques are missing, potentially causing performance overestimation of\nanomaly detection algorithms. This manuscript addresses this gap by introducing\na dataset comprising time series data of network entities' behavior, collected\nfrom the CESNET3 network. The dataset was created from 40 weeks of network\ntraffic of 275 thousand active IP addresses. The ISP origin of the presented\ndata ensures a high level of variability among network entities, which forms a\nunique and authentic challenge for forecasting and anomaly detection models. It\nprovides valuable insights into the practical deployment of forecast-based\nanomaly detection approaches.\n","authors":["Josef Koumar","Karel Hynek","Tomáš Čejka","Pavel Šiška"],"pdf_url":"https://arxiv.org/pdf/2409.18874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18872v1","updated":"2024-09-27T16:08:52Z","published":"2024-09-27T16:08:52Z","title":"Simulating Dynamic Tumor Contrast Enhancement in Breast MRI using\n Conditional Generative Adversarial Networks","summary":" This paper presents a method for virtual contrast enhancement in breast MRI,\noffering a promising non-invasive alternative to traditional contrast\nagent-based DCE-MRI acquisition. Using a conditional generative adversarial\nnetwork, we predict DCE-MRI images, including jointly-generated sequences of\nmultiple corresponding DCE-MRI timepoints, from non-contrast-enhanced MRIs,\nenabling tumor localization and characterization without the associated health\nrisks. Furthermore, we qualitatively and quantitatively evaluate the synthetic\nDCE-MRI images, proposing a multi-metric Scaled Aggregate Measure (SAMe),\nassessing their utility in a tumor segmentation downstream task, and conclude\nwith an analysis of the temporal patterns in multi-sequence DCE-MRI generation.\nOur approach demonstrates promising results in generating realistic and useful\nDCE-MRI sequences, highlighting the potential of virtual contrast enhancement\nfor improving breast cancer diagnosis and treatment, particularly for patients\nwhere contrast agent administration is contraindicated.\n","authors":["Richard Osuala","Smriti Joshi","Apostolia Tsirikoglou","Lidia Garrucho","Walter H. L. Pinaya","Daniel M. Lang","Julia A. Schnabel","Oliver Diaz","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2409.18872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03291v2","updated":"2024-09-27T16:04:40Z","published":"2024-09-05T06:55:13Z","title":"LLM Detectors Still Fall Short of Real World: Case of LLM-Generated\n Short News-Like Posts","summary":" With the emergence of widely available powerful LLMs, disinformation\ngenerated by large Language Models (LLMs) has become a major concern.\nHistorically, LLM detectors have been touted as a solution, but their\neffectiveness in the real world is still to be proven. In this paper, we focus\non an important setting in information operations -- short news-like posts\ngenerated by moderately sophisticated attackers.\n We demonstrate that existing LLM detectors, whether zero-shot or\npurpose-trained, are not ready for real-world use in that setting. All tested\nzero-shot detectors perform inconsistently with prior benchmarks and are highly\nvulnerable to sampling temperature increase, a trivial attack absent from\nrecent benchmarks. A purpose-trained detector generalizing across LLMs and\nunseen attacks can be developed, but it fails to generalize to new\nhuman-written texts.\n We argue that the former indicates domain-specific benchmarking is needed,\nwhile the latter suggests a trade-off between the adversarial evasion\nresilience and overfitting to the reference human text, with both needing\nevaluation in benchmarks and currently absent. We believe this suggests a\nre-consideration of current LLM detector benchmarking approaches and provides a\ndynamically extensible benchmark to allow it\n(https://github.com/Reliable-Information-Lab-HEVS/benchmark_llm_texts_detection).\n","authors":["Henrique Da Silva Gameiro","Andrei Kucharavy","Ljiljana Dolamic"],"pdf_url":"https://arxiv.org/pdf/2409.03291v2.pdf","comment":"20 pages, 7 tables, 13 figures, under consideration for EMNLP"},{"id":"http://arxiv.org/abs/2409.18868v1","updated":"2024-09-27T16:04:06Z","published":"2024-09-27T16:04:06Z","title":"Individuation in Neural Models with and without Visual Grounding","summary":" We show differences between a language-and-vision model CLIP and two\ntext-only models - FastText and SBERT - when it comes to the encoding of\nindividuation information. We study latent representations that CLIP provides\nfor substrates, granular aggregates, and various numbers of objects. We\ndemonstrate that CLIP embeddings capture quantitative differences in\nindividuation better than models trained on text-only data. Moreover, the\nindividuation hierarchy we deduce from the CLIP embeddings agrees with the\nhierarchies proposed in linguistics and cognitive science.\n","authors":["Alexey Tikhonov","Lisa Bylinina","Ivan P. Yamshchikov"],"pdf_url":"https://arxiv.org/pdf/2409.18868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18865v1","updated":"2024-09-27T16:02:12Z","published":"2024-09-27T16:02:12Z","title":"Positional Encoder Graph Quantile Neural Networks for Geographic Data","summary":" Positional Encoder Graph Neural Networks (PE-GNNs) are a leading approach for\nmodeling continuous spatial data. However, they often fail to produce\ncalibrated predictive distributions, limiting their effectiveness for\nuncertainty quantification. We introduce the Positional Encoder Graph Quantile\nNeural Network (PE-GQNN), a novel method that integrates PE-GNNs, Quantile\nNeural Networks, and recalibration techniques in a fully nonparametric\nframework, requiring minimal assumptions about the predictive distributions. We\npropose a new network architecture that, when combined with a quantile-based\nloss function, yields accurate and reliable probabilistic models without\nincreasing computational complexity. Our approach provides a flexible, robust\nframework for conditional density estimation, applicable beyond spatial data\ncontexts. We further introduce a structured method for incorporating a KNN\npredictor into the model while avoiding data leakage through the GNN layer\noperation. Experiments on benchmark datasets demonstrate that PE-GQNN\nsignificantly outperforms existing state-of-the-art methods in both predictive\naccuracy and uncertainty quantification.\n","authors":["William E. R. de Amorim","Scott A. Sisson","T. Rodrigues","David J. Nott","Guilherme S. Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2409.18865v1.pdf","comment":"17 main text pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18859v1","updated":"2024-09-27T15:54:49Z","published":"2024-09-27T15:54:49Z","title":"Challenges of Generating Structurally Diverse Graphs","summary":" For many graph-related problems, it can be essential to have a set of\nstructurally diverse graphs. For instance, such graphs can be used for testing\ngraph algorithms or their neural approximations. However, to the best of our\nknowledge, the problem of generating structurally diverse graphs has not been\nexplored in the literature. In this paper, we fill this gap. First, we discuss\nhow to define diversity for a set of graphs, why this task is non-trivial, and\nhow one can choose a proper diversity measure. Then, for a given diversity\nmeasure, we propose and compare several algorithms optimizing it: we consider\napproaches based on standard random graph models, local graph optimization,\ngenetic algorithms, and neural generative models. We show that it is possible\nto significantly improve diversity over basic random graph generators.\nAdditionally, our analysis of generated graphs allows us to better understand\nthe properties of graph distances: depending on which diversity measure is used\nfor optimization, the obtained graphs may possess very different structural\nproperties which gives insights about the sensitivity of the graph distance\nunderlying the diversity measure.\n","authors":["Fedor Velikonivtsev","Mikhail Mironov","Liudmila Prokhorenkova"],"pdf_url":"https://arxiv.org/pdf/2409.18859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18850v1","updated":"2024-09-27T15:48:39Z","published":"2024-09-27T15:48:39Z","title":"Two Sparse Matrices are Better than One: Sparsifying Neural Networks\n with Double Sparse Factorization","summary":" Neural networks are often challenging to work with due to their large size\nand complexity. To address this, various methods aim to reduce model size by\nsparsifying or decomposing weight matrices, such as magnitude pruning and\nlow-rank or block-diagonal factorization. In this work, we present Double\nSparse Factorization (DSF), where we factorize each weight matrix into two\nsparse matrices. Although solving this problem exactly is computationally\ninfeasible, we propose an efficient heuristic based on alternating minimization\nvia ADMM that achieves state-of-the-art results, enabling unprecedented\nsparsification of neural networks. For instance, in a one-shot pruning setting,\nour method can reduce the size of the LLaMA2-13B model by 50% while maintaining\nbetter performance than the dense LLaMA2-7B model. We also compare favorably\nwith Optimal Brain Compression, the state-of-the-art layer-wise pruning\napproach for convolutional neural networks. Furthermore, accuracy improvements\nof our method persist even after further model fine-tuning.\n Code available at: https://github.com/usamec/double_sparse.\n","authors":["Vladimír Boža","Vladimír Macko"],"pdf_url":"https://arxiv.org/pdf/2409.18850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13550v2","updated":"2024-09-27T15:41:33Z","published":"2024-09-20T14:49:21Z","title":"A preliminary study on continual learning in computer vision using\n Kolmogorov-Arnold Networks","summary":" Deep learning has long been dominated by multi-layer perceptrons (MLPs),\nwhich have demonstrated superiority over other optimizable models in various\ndomains. Recently, a new alternative to MLPs has emerged - Kolmogorov-Arnold\nNetworks (KAN)- which are based on a fundamentally different mathematical\nframework. According to their authors, KANs address several major issues in\nMLPs, such as catastrophic forgetting in continual learning scenarios. However,\nthis claim has only been supported by results from a regression task on a toy\n1D dataset. In this paper, we extend the investigation by evaluating the\nperformance of KANs in continual learning tasks within computer vision,\nspecifically using the MNIST datasets. To this end, we conduct a structured\nanalysis of the behavior of MLPs and two KAN-based models in a\nclass-incremental learning scenario, ensuring that the architectures involved\nhave the same number of trainable parameters. Our results demonstrate that an\nefficient version of KAN outperforms both traditional MLPs and the original KAN\nimplementation. We further analyze the influence of hyperparameters in MLPs and\nKANs, as well as the impact of certain trainable parameters in KANs, such as\nbias and scale weights. Additionally, we provide a preliminary investigation of\nrecent KAN-based convolutional networks and compare their performance with that\nof traditional convolutional neural networks. Our codes can be found at\nhttps://github.com/MrPio/KAN-Continual_Learning_tests.\n","authors":["Alessandro Cacciatore","Valerio Morelli","Federica Paganica","Emanuele Frontoni","Lucia Migliorelli","Daniele Berardini"],"pdf_url":"https://arxiv.org/pdf/2409.13550v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18842v1","updated":"2024-09-27T15:36:24Z","published":"2024-09-27T15:36:24Z","title":"Classical Statistical (In-Sample) Intuitions Don't Generalize Well: A\n Note on Bias-Variance Tradeoffs, Overfitting and Moving from Fixed to Random\n Designs","summary":" The sudden appearance of modern machine learning (ML) phenomena like double\ndescent and benign overfitting may leave many classically trained statisticians\nfeeling uneasy -- these phenomena appear to go against the very core of\nstatistical intuitions conveyed in any introductory class on learning from\ndata. The historical lack of earlier observation of such phenomena is usually\nattributed to today's reliance on more complex ML methods,\noverparameterization, interpolation and/or higher data dimensionality. In this\nnote, we show that there is another reason why we observe behaviors today that\nappear at odds with intuitions taught in classical statistics textbooks, which\nis much simpler to understand yet rarely discussed explicitly. In particular,\nmany intuitions originate in fixed design settings, in which in-sample\nprediction error (under resampling of noisy outcomes) is of interest, while\nmodern ML evaluates its predictions in terms of generalization error, i.e.\nout-of-sample prediction error in random designs. Here, we highlight that this\nsimple move from fixed to random designs has (perhaps surprisingly)\nfar-reaching consequences on textbook intuitions relating to the bias-variance\ntradeoff, and comment on the resulting (im)possibility of observing double\ndescent and benign overfitting in fixed versus random designs.\n","authors":["Alicia Curth"],"pdf_url":"https://arxiv.org/pdf/2409.18842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03920v3","updated":"2024-09-27T15:31:24Z","published":"2024-06-06T10:02:49Z","title":"Towards Physically Consistent Deep Learning For Climate Model\n Parameterizations","summary":" Climate models play a critical role in understanding and projecting climate\nchange. Due to their complexity, their horizontal resolution of about 40-100 km\nremains too coarse to resolve processes such as clouds and convection, which\nneed to be approximated via parameterizations. These parameterizations are a\nmajor source of systematic errors and large uncertainties in climate\nprojections. Deep learning (DL)-based parameterizations, trained on data from\ncomputationally expensive short, high-resolution simulations, have shown great\npromise for improving climate models in that regard. However, their lack of\ninterpretability and tendency to learn spurious non-physical correlations\nresult in reduced trust in the climate simulation. We propose an efficient\nsupervised learning framework for DL-based parameterizations that leads to\nphysically consistent models with improved interpretability and negligible\ncomputational overhead compared to standard supervised training. First, key\nfeatures determining the target physical processes are uncovered. Subsequently,\nthe neural network is fine-tuned using only those relevant features. We show\nempirically that our method robustly identifies a small subset of the inputs as\nactual physical drivers, therefore removing spurious non-physical\nrelationships. This results in by design physically consistent and\ninterpretable neural networks while maintaining the predictive performance of\nunconstrained black-box DL-based parameterizations.\n","authors":["Birgit Kühbacher","Fernando Iglesias-Suarez","Niki Kilbertus","Veronika Eyring"],"pdf_url":"https://arxiv.org/pdf/2406.03920v3.pdf","comment":"Accepted at ICMLA 2024"},{"id":"http://arxiv.org/abs/2402.03646v4","updated":"2024-09-27T15:30:40Z","published":"2024-02-06T02:45:13Z","title":"Lens: A Foundation Model for Network Traffic","summary":" Network traffic refers to the amount of data being sent and received over the\ninternet or any system that connects computers. Analyzing and understanding\nnetwork traffic is vital for improving network security and management.\nHowever, the analysis of network traffic is challenging due to the diverse\nnature of data packets, which often feature heterogeneous headers and encrypted\npayloads lacking semantics. To capture the latent semantics of traffic, a few\nstudies have adopted pre-training techniques based on the Transformer encoder\nor decoder to learn the representations from massive traffic data. However,\nthese methods typically excel in traffic understanding (classification) or\ntraffic generation tasks. To address this issue, we develop Lens, a foundation\nmodel for network traffic that leverages the T5 architecture to learn the\npre-trained representations from large-scale unlabeled data. Harnessing the\nstrength of the encoder-decoder framework, which captures the global\ninformation while preserving the generative ability, our model can better learn\nthe representations from raw data. To further enhance pre-training\neffectiveness, we design a novel loss that combines three distinct tasks:\nMasked Span Prediction (MSP), Packet Order Prediction (POP), and Homologous\nTraffic Prediction (HTP). Evaluation results across various benchmark datasets\ndemonstrate that the proposed Lens outperforms the baselines in most downstream\ntasks related to both traffic understanding and generation. Notably, it also\nrequires much less labeled data for fine-tuning compared to current methods.\n","authors":["Qineng Wang","Chen Qian","Xiaochang Li","Ziyu Yao","Gang Zhou","Huajie Shao"],"pdf_url":"https://arxiv.org/pdf/2402.03646v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18836v1","updated":"2024-09-27T15:29:32Z","published":"2024-09-27T15:29:32Z","title":"Constructing Confidence Intervals for 'the' Generalization Error -- a\n Comprehensive Benchmark Study","summary":" When assessing the quality of prediction models in machine learning,\nconfidence intervals (CIs) for the generalization error, which measures\npredictive performance, are a crucial tool. Luckily, there exist many methods\nfor computing such CIs and new promising approaches are continuously being\nproposed. Typically, these methods combine various resampling procedures, most\npopular among them cross-validation and bootstrapping, with different variance\nestimation techniques. Unfortunately, however, there is currently no consensus\non when any of these combinations may be most reliably employed and how they\ngenerally compare. In this work, we conduct the first large-scale study\ncomparing CIs for the generalization error - empirically evaluating 13\ndifferent methods on a total of 18 tabular regression and classification\nproblems, using four different inducers and a total of eight loss functions. We\ngive an overview of the methodological foundations and inherent challenges of\nconstructing CIs for the generalization error and provide a concise review of\nall 13 methods in a unified framework. Finally, the CI methods are evaluated in\nterms of their relative coverage frequency, width, and runtime. Based on these\nfindings, we are able to identify a subset of methods that we would recommend.\nWe also publish the datasets as a benchmarking suite on OpenML and our code on\nGitHub to serve as a basis for further studies.\n","authors":["Hannah Schulz-Kümpel","Sebastian Fischer","Thomas Nagler","Anne-Laure Boulesteix","Bernd Bischl","Roman Hornung"],"pdf_url":"https://arxiv.org/pdf/2409.18836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18832v1","updated":"2024-09-27T15:27:04Z","published":"2024-09-27T15:27:04Z","title":"Classification and regression of trajectories rendered as images via 2D\n Convolutional Neural Networks","summary":" Trajectories can be regarded as time-series of coordinates, typically arising\nfrom motile objects. Methods for trajectory classification are particularly\nimportant to detect different movement patterns, while methods for regression\nto compute motility metrics and forecasting. Recent advances in computer vision\nhave facilitated the processing of trajectories rendered as images via\nartificial neural networks with 2d convolutional layers (CNNs). This approach\nleverages the capability of CNNs to learn spatial hierarchies of features from\nimages, necessary to recognize complex shapes. Moreover, it overcomes the\nlimitation of other machine learning methods that require input trajectories\nwith a fixed number of points. However, rendering trajectories as images can\nintroduce poorly investigated artifacts such as information loss due to the\nplotting of coordinates on a discrete grid, and spectral changes due to line\nthickness and aliasing. In this study, we investigate the effectiveness of CNNs\nfor solving classification and regression problems from synthetic trajectories\nthat have been rendered as images using different modalities. The parameters\nconsidered in this study include line thickness, image resolution, usage of\nmotion history (color-coding of the temporal component) and anti-aliasing.\nResults highlight the importance of choosing an appropriate image resolution\naccording to model depth and motion history in applications where movement\ndirection is critical.\n","authors":["Mariaclaudia Nicolai","Raffaella Fiamma Cabini","Diego Ulisse Pizzagalli"],"pdf_url":"https://arxiv.org/pdf/2409.18832v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.14857v3","updated":"2024-09-27T15:26:59Z","published":"2023-09-26T11:35:25Z","title":"Cluster Exploration using Informative Manifold Projections","summary":" Dimensionality reduction (DR) is one of the key tools for the visual\nexploration of high-dimensional data and uncovering its cluster structure in\ntwo- or three-dimensional spaces. The vast majority of DR methods in the\nliterature do not take into account any prior knowledge a practitioner may have\nregarding the dataset under consideration. We propose a novel method to\ngenerate informative embeddings which not only factor out the structure\nassociated with different kinds of prior knowledge but also aim to reveal any\nremaining underlying structure. To achieve this, we employ a linear combination\nof two objectives: firstly, contrastive PCA that discounts the structure\nassociated with the prior information, and secondly, kurtosis projection\npursuit which ensures meaningful data separation in the obtained embeddings. We\nformulate this task as a manifold optimization problem and validate it\nempirically across a variety of datasets considering three distinct types of\nprior knowledge. Lastly, we provide an automated framework to perform iterative\nvisual exploration of high-dimensional data.\n","authors":["Stavros Gerolymatos","Xenophon Evangelopoulos","Vladimir Gusev","John Y. Goulermas"],"pdf_url":"https://arxiv.org/pdf/2309.14857v3.pdf","comment":"This paper has been accepted in the 27th European Conference on\n Artificial Intelligence (ECAI) 2024"},{"id":"http://arxiv.org/abs/2307.13127v2","updated":"2024-09-27T15:24:00Z","published":"2023-07-24T21:03:25Z","title":"A Differentially Private Weighted Empirical Risk Minimization Procedure\n and its Application to Outcome Weighted Learning","summary":" It is common practice to use data containing personal information to build\npredictive models in the framework of empirical risk minimization (ERM). While\nthese models can be highly accurate in prediction, sharing the results from\nthese models trained on sensitive data may be susceptible to privacy attacks.\nDifferential privacy (DP) is an appealing framework for addressing such data\nprivacy issues by providing mathematically provable bounds on the privacy loss\nincurred when releasing information from sensitive data. Previous work has\nprimarily concentrated on applying DP to unweighted ERM. We consider weighted\nERM (wERM), an important generalization, where each individual's contribution\nto the objective function can be assigned varying weights. We propose the first\ndifferentially private algorithm for general wERM, with theoretical DP\nguarantees. Extending the existing DP-ERM procedures to wERM creates a pathway\nfor deriving privacy-preserving learning methods for individualized treatment\nrules, including the popular outcome weighted learning (OWL). We evaluate the\nperformance of the DP-wERM framework applied to OWL in both simulation studies\nand in a real clinical trial. All empirical results demonstrate the feasibility\nof training OWL models via wERM with DP guarantees while maintaining\nsufficiently robust model performance, providing strong evidence for the\npracticality of implementing the proposed privacy-preserving OWL procedure in\nreal-world scenarios involving sensitive data.\n","authors":["Spencer Giddens","Yiwang Zhou","Kevin R. Krull","Tara M. Brinkman","Peter X. K. Song","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2307.13127v2.pdf","comment":"29 pages, 1 figure, and 1 table for the main manuscript; 10 pages, 4\n figures, and 1 table for the supplementary materials"},{"id":"http://arxiv.org/abs/2409.18827v1","updated":"2024-09-27T15:22:28Z","published":"2024-09-27T15:22:28Z","title":"ARLBench: Flexible and Efficient Benchmarking for Hyperparameter\n Optimization in Reinforcement Learning","summary":" Hyperparameters are a critical factor in reliably training well-performing\nreinforcement learning (RL) agents. Unfortunately, developing and evaluating\nautomated approaches for tuning such hyperparameters is both costly and\ntime-consuming. As a result, such approaches are often only evaluated on a\nsingle domain or algorithm, making comparisons difficult and limiting insights\ninto their generalizability. We propose ARLBench, a benchmark for\nhyperparameter optimization (HPO) in RL that allows comparisons of diverse HPO\napproaches while being highly efficient in evaluation. To enable research into\nHPO in RL, even in settings with low compute resources, we select a\nrepresentative subset of HPO tasks spanning a variety of algorithm and\nenvironment combinations. This selection allows for generating a performance\nprofile of an automated RL (AutoRL) method using only a fraction of the compute\npreviously necessary, enabling a broader range of researchers to work on HPO in\nRL. With the extensive and large-scale dataset on hyperparameter landscapes\nthat our selection is based on, ARLBench is an efficient, flexible, and\nfuture-oriented foundation for research on AutoRL. Both the benchmark and the\ndataset are available at https://github.com/automl/arlbench.\n","authors":["Jannis Becktepe","Julian Dierkes","Carolin Benjamins","Aditya Mohan","David Salinas","Raghu Rajan","Frank Hutter","Holger Hoos","Marius Lindauer","Theresa Eimer"],"pdf_url":"https://arxiv.org/pdf/2409.18827v1.pdf","comment":"Accepted at the 17th European Workshop on Reinforcement Learning"},{"id":"http://arxiv.org/abs/2409.15204v2","updated":"2024-09-27T15:19:23Z","published":"2024-09-23T16:51:43Z","title":"RAMBO: Enhancing RAG-based Repository-Level Method Body Completion","summary":" Code completion is essential in software development, helping developers by\npredicting code snippets based on context. Among completion tasks, Method Body\nCompletion (MBC) is particularly challenging as it involves generating complete\nmethod bodies based on their signatures and context. This task becomes\nsignificantly harder in large repositories, where method bodies must integrate\nrepositoryspecific elements such as custom APIs, inter-module dependencies, and\nproject-specific conventions. In this paper, we introduce RAMBO, a novel\nRAG-based approach for repository-level MBC. Instead of retrieving similar\nmethod bodies, RAMBO identifies essential repository-specific elements, such as\nclasses, methods, and variables/fields, and their relevant usages. By\nincorporating these elements and their relevant usages into the code generation\nprocess, RAMBO ensures more accurate and contextually relevant method bodies.\nOur experimental results with leading code LLMs across 40 Java projects show\nthat RAMBO significantly outperformed the state-of-the-art repository-level MBC\napproaches, with the improvements of up to 46% in BLEU, 57% in CodeBLEU, 36% in\nCompilation Rate, and up to 3X in Exact Match. Notably, RAMBO surpassed\nRepoCoder Oracle method by up to 12% in Exact Match, setting a new benchmark\nfor repository-level MBC.\n","authors":["Tuan-Dung Bui","Duc-Thieu Luu-Van","Thanh-Phat Nguyen","Thu-Trang Nguyen","Son Nguyen","Hieu Dinh Vo"],"pdf_url":"https://arxiv.org/pdf/2409.15204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11792v4","updated":"2024-09-27T15:10:47Z","published":"2024-08-21T17:25:40Z","title":"Optical ISAC: Fundamental Performance Limits and Transceiver Design","summary":" This paper characterizes the optimal Capacity-Distortion (C-D) tradeoff in an\noptical point-to-point system with Single-Input Single-Output (SISO) for\ncommunication and Single-Input Multiple-Output (SIMO) for sensing within an\nIntegrated Sensing and Communication (ISAC) framework. We consider the optimal\nRate-Distortion (R-D) region and explore several Inner (IB) and Outer Bounds\n(OB). We introduce practical, asymptotically optimal Maximum A Posteriori (MAP)\nand Maximum Likelihood Estimators (MLE) for target distance, addressing\nnonlinear measurement-to-state relationships and non-conjugate priors. As the\nnumber of sensing antennas increases, these estimators converge to the Bayesian\nCram\\'er-Rao Bound (BCRB). We also establish that the achievable\nRate-Cram\\'er-Rao Bound (R-CRB) serves as an OB for the optimal C-D region,\nvalid for both unbiased estimators and asymptotically large numbers of receive\nantennas. To clarify that the input distribution determines the tradeoff across\nthe Pareto boundary of the C-D region, we propose two algorithms: i) an\niterative Blahut-Arimoto Algorithm (BAA)-type method, and ii) a\nmemory-efficient Closed-Form (CF) approach. The CF approach includes a CF\noptimal distribution for high Optical Signal-to-Noise Ratio (O-SNR) conditions.\nAdditionally, we adapt and refine the Deterministic-Random Tradeoff (DRT) to\nthis optical ISAC context.\n","authors":["Alireza Ghazavi Khorasgani","Mahtab Mirmohseni","Ahmed Elzanaty"],"pdf_url":"https://arxiv.org/pdf/2408.11792v4.pdf","comment":"This paper is 8 pages long and includes 1 algorithm, 3 figures, and 3\n tables. It has been accepted for presentation at the 2024 Global\n Communications Conference. For further discussion, please visit AlphaXiv or\n email the authors"},{"id":"http://arxiv.org/abs/2312.15124v2","updated":"2024-09-27T15:08:48Z","published":"2023-12-23T00:35:23Z","title":"On fundamental aspects of quantum extreme learning machines","summary":" Quantum Extreme Learning Machines (QELMs) have emerged as a promising\nframework for quantum machine learning. Their appeal lies in the rich feature\nmap induced by the dynamics of a quantum substrate - the quantum reservoir -\nand the efficient post-measurement training via linear regression. Here we\nstudy the expressivity of QELMs by decomposing the prediction of QELMs into a\nFourier series. We show that the achievable Fourier frequencies are determined\nby the data encoding scheme, while Fourier coefficients depend on both the\nreservoir and the measurement. Notably, the expressivity of QELMs is\nfundamentally limited by the number of Fourier frequencies and the number of\nobservables, while the complexity of the prediction hinges on the reservoir. As\na cautionary note on scalability, we identify four sources that can lead to the\nexponential concentration of the observables as the system size grows\n(randomness, hardware noise, entanglement, and global measurements) and show\nhow this can turn QELMs into useless input-agnostic oracles. In particular, our\nresult on the reservoir-induced concentration strongly indicates that quantum\nreservoirs drawn from a highly random ensemble make QELM models unscalable. Our\nanalysis elucidates the potential and fundamental limitations of QELMs, and\nlays the groundwork for systematically exploring quantum reservoir systems for\nother machine learning tasks.\n","authors":["Weijie Xiong","Giorgio Facelli","Mehrad Sahebi","Owen Agnel","Thiparat Chotibut","Supanut Thanasilp","Zoë Holmes"],"pdf_url":"https://arxiv.org/pdf/2312.15124v2.pdf","comment":"20+21 pages, 9+2 figures"},{"id":"http://arxiv.org/abs/2409.18814v1","updated":"2024-09-27T15:07:26Z","published":"2024-09-27T15:07:26Z","title":"Early diagnosis of Alzheimer's disease from MRI images with deep\n learning model","summary":" It is acknowledged that the most common cause of dementia worldwide is\nAlzheimer's disease (AD). This condition progresses in severity from mild to\nsevere and interferes with people's everyday routines. Early diagnosis plays a\ncritical role in patient care and clinical trials. Convolutional neural\nnetworks (CNN) are used to create a framework for identifying specific disease\nfeatures from MRI scans Classification of dementia involves approaches such as\nmedical history review, neuropsychological tests, and magnetic resonance\nimaging (MRI). However, the image dataset obtained from Kaggle faces a\nsignificant issue of class imbalance, which requires equal distribution of\nsamples from each class to address. In this article, to address this imbalance,\nthe Synthetic Minority Oversampling Technique (SMOTE) is utilized. Furthermore,\na pre-trained convolutional neural network has been applied to the DEMNET\ndementia network to extract key features from AD images. The proposed model\nachieved an impressive accuracy of 98.67%.\n","authors":["Sajjad Aghasi Javid","Mahmood Mohassel Feghhi"],"pdf_url":"https://arxiv.org/pdf/2409.18814v1.pdf","comment":"7 pages, 3 figures, Presented at the 20-th CSI International\n Symposium on Artificial Intelligence and Signal Processing (AISP) 21-22\n February, 2024, Mazandaran University of Science and Technology, Babol, Iran"},{"id":"http://arxiv.org/abs/2406.14591v2","updated":"2024-09-27T15:01:06Z","published":"2024-06-20T10:21:55Z","title":"Physics-informed neural networks for parameter learning of wildfire\n spreading","summary":" Wildland fires pose a terrifying natural hazard, underscoring the urgent need\nto develop data-driven and physics-informed digital twins for wildfire\nprevention, monitoring, intervention, and response. In this direction of\nresearch, this work introduces a physics-informed neural network (PiNN)\ndesigned to learn the unknown parameters of an interpretable wildfire spreading\nmodel. The considered modeling approach integrates fundamental physical laws\narticulated by key model parameters essential for capturing the complex\nbehavior of wildfires. The proposed machine learning framework leverages the\ntheory of artificial neural networks with the physical constraints governing\nwildfire dynamics, including the first principles of mass and energy\nconservation. Training of the PiNN for physics-informed parameter\nidentification is realized using synthetic data on the spatiotemporal evolution\nof one- and two-dimensional firefronts, derived from a high-fidelity simulator,\nas well as empirical data (ground surface thermal images) from the Troy Fire\nthat occurred on June 19, 2002, in California. The parameter learning results\ndemonstrate the predictive ability of the proposed PiNN in uncovering the\nunknown coefficients of the wildfire model in one- and two-dimensional fire\nspreading scenarios as well as the Troy Fire. Additionally, this methodology\nexhibits robustness by identifying the same parameters even in the presence of\nnoisy data. By integrating this PiNN approach into a comprehensive framework,\nthe envisioned physics-informed digital twin will enhance intelligent wildfire\nmanagement and risk assessment, providing a powerful tool for proactive and\nreactive strategies.\n","authors":["Konstantinos Vogiatzoglou","Costas Papadimitriou","Vasilis Bontozoglou","Konstantinos Ampountolas"],"pdf_url":"https://arxiv.org/pdf/2406.14591v2.pdf","comment":"32 pages, 14 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2409.18804v1","updated":"2024-09-27T14:57:18Z","published":"2024-09-27T14:57:18Z","title":"Convergence of Diffusion Models Under the Manifold Hypothesis in\n High-Dimensions","summary":" Denoising Diffusion Probabilistic Models (DDPM) are powerful state-of-the-art\nmethods used to generate synthetic data from high-dimensional data\ndistributions and are widely used for image, audio and video generation as well\nas many more applications in science and beyond. The manifold hypothesis states\nthat high-dimensional data often lie on lower-dimensional manifolds within the\nambient space, and is widely believed to hold in provided examples. While\nrecent results has provided invaluable insight into how diffusion models adapt\nto the manifold hypothesis, they do not capture the great empirical success of\nthese models, making this a very fruitful research direction.\n In this work, we study DDPMs under the manifold hypothesis and prove that\nthey achieve rates independent of the ambient dimension in terms of learning\nthe score. In terms of sampling, we obtain rates independent of the ambient\ndimension w.r.t. the Kullback-Leibler divergence, and $O(\\sqrt{D})$ w.r.t. the\nWasserstein distance. We do this by developing a new framework connecting\ndiffusion models to the well-studied theory of extrema of Gaussian Processes.\n","authors":["Iskander Azangulov","George Deligiannidis","Judith Rousseau"],"pdf_url":"https://arxiv.org/pdf/2409.18804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18798v1","updated":"2024-09-27T14:53:04Z","published":"2024-09-27T14:53:04Z","title":"Esports Debut as a Medal Event at 2023 Asian Games: Exploring Public\n Perceptions with BERTopic and GPT-4 Topic Fine-Tuning","summary":" This study examined the public opinions of esports at the 2023 Asian Games\nand value co-creation during the event using an LLM-enhanced BERTopic modeling\nanalysis. We identified five major themes representing public perceptions, as\nwell as how major stakeholders co-created value within and beyond the esports\necosystem. Key findings highlighted the strategic use of social media marketing\nto influence public opinion and promote esports events and brands, emphasizing\nthe importance of event logistics and infrastructure. Additionally, the study\nrevealed the co-creation value contributed by stakeholders outside the\ntraditional esports ecosystem, particularly in promoting national\nrepresentation and performance. Our findings supported the ongoing efforts to\nlegitimize esports as a sport, noting that mainstream recognition remains a\nchallenge. The inclusion of esports as a medal event showcased broader\nacceptance and helped mitigate negative public perceptions. Moreover,\ncontributions from non-traditional stakeholders underscored the value of\ncross-subcultural collaborations in esports.\n","authors":["Tyreal Yizhou Qian","Bo Yu","Weizhe Li","Chenglong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18796v1","updated":"2024-09-27T14:50:36Z","published":"2024-09-27T14:50:36Z","title":"Hierarchical Federated ADMM","summary":" In this paper, we depart from the widely-used gradient descent-based\nhierarchical federated learning (FL) algorithms to develop a novel hierarchical\nFL framework based on the alternating direction method of multipliers (ADMM).\nWithin this framework, we propose two novel FL algorithms, which both use ADMM\nin the top layer: one that employs ADMM in the lower layer and another that\nuses the conventional gradient descent-based approach. The proposed framework\nenhances privacy, and experiments demonstrate the superiority of the proposed\nalgorithms compared to the conventional algorithms in terms of learning\nconvergence and accuracy. Additionally, gradient descent on the lower layer\nperforms well even if the number of local steps is very limited, while ADMM on\nboth layers lead to better performance otherwise.\n","authors":["Seyed Mohammad Azimi-Abarghouyi","Nicola Bastianello","Karl H. Johansson","Viktoria Fodor"],"pdf_url":"https://arxiv.org/pdf/2409.18796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10494v4","updated":"2024-09-27T14:50:23Z","published":"2023-02-21T07:48:34Z","title":"The Role of Masking for Efficient Supervised Knowledge Distillation of\n Vision Transformers","summary":" Knowledge distillation is an effective method for training lightweight vision\nmodels. However, acquiring teacher supervision for training samples is often\ncostly, especially from large-scale models like vision transformers (ViTs). In\nthis paper, we develop a simple framework to reduce the supervision cost of ViT\ndistillation: masking out a fraction of input tokens given to the teacher. By\nmasking input tokens, one can skip the computations associated with the masked\ntokens without requiring any change to teacher parameters or architecture. We\nfind that masking patches with the lowest student attention scores is highly\neffective, saving up to 50% of teacher FLOPs without any drop in student\naccuracy, while other masking criterion leads to suboptimal efficiency gains.\nThrough in-depth analyses, we reveal that the student-guided masking provides a\ngood curriculum to the student, making teacher supervision easier to follow\nduring the early stage and challenging in the later stage.\n","authors":["Seungwoo Son","Jegwang Ryu","Namhoon Lee","Jaeho Lee"],"pdf_url":"https://arxiv.org/pdf/2302.10494v4.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2404.07164v2","updated":"2024-09-27T14:32:19Z","published":"2024-04-10T17:00:04Z","title":"PIM-Opt: Demystifying Distributed Optimization Algorithms on a\n Real-World Processing-In-Memory System","summary":" Modern Machine Learning (ML) training on large-scale datasets is a very\ntime-consuming workload. It relies on the optimization algorithm Stochastic\nGradient Descent (SGD) due to its effectiveness, simplicity, and generalization\nperformance. Processor-centric architectures (e.g., CPUs, GPUs) commonly used\nfor modern ML training workloads based on SGD are bottlenecked by data movement\nbetween the processor and memory units due to the poor data locality in\naccessing large datasets. As a result, processor-centric architectures suffer\nfrom low performance and high energy consumption while executing ML training\nworkloads. Processing-In-Memory (PIM) is a promising solution to alleviate the\ndata movement bottleneck by placing the computation mechanisms inside or near\nmemory.\n Our goal is to understand the capabilities of popular distributed SGD\nalgorithms on real-world PIM systems to accelerate data-intensive ML training\nworkloads. To this end, we 1) implement several representative centralized\nparallel SGD algorithms on the real-world UPMEM PIM system, 2) rigorously\nevaluate these algorithms for ML training on large-scale datasets in terms of\nperformance, accuracy, and scalability, 3) compare to conventional CPU and GPU\nbaselines, and 4) discuss implications for future PIM hardware and highlight\nthe need for a shift to an algorithm-hardware codesign.\n Our results demonstrate three major findings: 1) The UPMEM PIM system can be\na viable alternative to state-of-the-art CPUs and GPUs for many memory-bound ML\ntraining workloads, especially when operations and datatypes are natively\nsupported by PIM hardware, 2) it is important to carefully choose the\noptimization algorithms that best fit PIM, and 3) the UPMEM PIM system does not\nscale approximately linearly with the number of nodes for many data-intensive\nML training workloads. We open source all our code to facilitate future\nresearch.\n","authors":["Steve Rhyner","Haocong Luo","Juan Gómez-Luna","Mohammad Sadrosadati","Jiawei Jiang","Ataberk Olgun","Harshita Gupta","Ce Zhang","Onur Mutlu"],"pdf_url":"https://arxiv.org/pdf/2404.07164v2.pdf","comment":"\"PIM-Opt: Demystifying Distributed Optimization Algorithms on a\n Real-World Processing-In-Memory System\" in Proceedings of the 33rd\n International Conference on Parallel Architectures and Compilation Techniques\n (PACT), Long Beach, CA, USA, October 2024"},{"id":"http://arxiv.org/abs/2409.18778v1","updated":"2024-09-27T14:24:16Z","published":"2024-09-27T14:24:16Z","title":"HardCore Generation: Generating Hard UNSAT Problems for Data\n Augmentation","summary":" Efficiently determining the satisfiability of a boolean equation -- known as\nthe SAT problem for brevity -- is crucial in various industrial problems.\nRecently, the advent of deep learning methods has introduced significant\npotential for enhancing SAT solving. However, a major barrier to the\nadvancement of this field has been the scarcity of large, realistic datasets.\nThe majority of current public datasets are either randomly generated or\nextremely limited, containing only a few examples from unrelated problem\nfamilies. These datasets are inadequate for meaningful training of deep\nlearning methods. In light of this, researchers have started exploring\ngenerative techniques to create data that more accurately reflect SAT problems\nencountered in practical situations. These methods have so far suffered from\neither the inability to produce challenging SAT problems or time-scalability\nobstacles. In this paper we address both by identifying and manipulating the\nkey contributors to a problem's ``hardness'', known as cores. Although some\nprevious work has addressed cores, the time costs are unacceptably high due to\nthe expense of traditional heuristic core detection techniques. We introduce a\nfast core detection procedure that uses a graph neural network. Our empirical\nresults demonstrate that we can efficiently generate problems that remain hard\nto solve and retain key attributes of the original example problems. We show\nvia experiment that the generated synthetic SAT problems can be used in a data\naugmentation setting to provide improved prediction of solver runtimes.\n","authors":["Joseph Cotnareanu","Zhanguang Zhang","Hui-Ling Zhen","Yingxue Zhang","Mark Coates"],"pdf_url":"https://arxiv.org/pdf/2409.18778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11876v2","updated":"2024-09-27T14:18:56Z","published":"2024-03-18T15:28:35Z","title":"Deep Bayesian Future Fusion for Self-Supervised, High-Resolution,\n Off-Road Mapping","summary":" High-speed off-road navigation requires long-range, high-resolution maps to\nenable robots to safely navigate over different surfaces while avoiding\ndangerous obstacles. However, due to limited computational power and sensing\nnoise, most approaches to off-road mapping focus on producing coarse (20-40cm)\nmaps of the environment. In this paper, we propose Future Fusion, a framework\ncapable of generating dense, high-resolution maps from sparse sensing data (30m\nforward at 2cm). This is accomplished by - (1) the efficient realization of the\nwell-known Bayes filtering within the standard deep learning models that\nexplicitly accounts for the sparsity pattern in stereo and LiDAR depth data,\nand (2) leveraging perceptual losses common in generative image completion. The\nproposed methodology outperforms the conventional baselines. Moreover, the\nlearned features and the completed dense maps lead to improvements in the\ndownstream navigation task.\n","authors":["Shubhra Aich","Wenshan Wang","Parv Maheshwari","Matthew Sivaprakasam","Samuel Triest","Cherie Ho","Jason M. Gregory","John G. Rogers III","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2403.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18772v1","updated":"2024-09-27T14:16:35Z","published":"2024-09-27T14:16:35Z","title":"A method of using RSVD in residual calculation of LowBit GEMM","summary":" The advancements of hardware technology in recent years has brought many\npossibilities for low-precision applications. However, the use of low precision\ncan introduce significant computational errors, posing a considerable challenge\nto maintaining the computational accuracy.\n We propose low-rank residuals quantized matrix multiplication(LRQMM) method\nwhich introduces low-rank approximation in residual compensation for dense low\nprecision quantization matrix multiplication. It can bring several times\naccuracy improvement with only BLAS-2 level extra time overhead. Moreover,\nLRQMM is a completely data-free quantization method that does not require\nadditional data for pre-training. And it only works with low precision GEMM\noperator, which is easy to couple with other methods.\n Through experimentation, LRQMM can reduce the error of direct quantized\nmatrix multiplication by 1~2 orders of magnitude, when dealing with larger\nmatrix sizes, the computational speed is only reduced by approximately 20\\%. In\ndeep learning networks, LRQMM-4bit achieves 61.8% ImageNet Top-1 accuracy in\nResnet-50, while the Direct Quant accuracy is only 8.3%.\n","authors":["Hongyaoxing Gu"],"pdf_url":"https://arxiv.org/pdf/2409.18772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18768v1","updated":"2024-09-27T14:12:49Z","published":"2024-09-27T14:12:49Z","title":"Learning from Demonstration with Implicit Nonlinear Dynamics Models","summary":" Learning from Demonstration (LfD) is a useful paradigm for training policies\nthat solve tasks involving complex motions. In practice, the successful\napplication of LfD requires overcoming error accumulation during policy\nexecution, i.e. the problem of drift due to errors compounding over time and\nthe consequent out-of-distribution behaviours. Existing works seek to address\nthis problem through scaling data collection, correcting policy errors with a\nhuman-in-the-loop, temporally ensembling policy predictions or through learning\nthe parameters of a dynamical system model. In this work, we propose and\nvalidate an alternative approach to overcoming this issue. Inspired by\nreservoir computing, we develop a novel neural network layer that includes a\nfixed nonlinear dynamical system with tunable dynamical properties. We validate\nthe efficacy of our neural network layer on the task of reproducing human\nhandwriting motions using the LASA Human Handwriting Dataset. Through empirical\nexperiments we demonstrate that incorporating our layer into existing neural\nnetwork architectures addresses the issue of compounding errors in LfD.\nFurthermore, we perform a comparative evaluation against existing approaches\nincluding a temporal ensemble of policy predictions and an Echo State Networks\n(ESNs) implementation. We find that our approach yields greater policy\nprecision and robustness on the handwriting task while also generalising to\nmultiple dynamics regimes and maintaining competitive latency scores.\n","authors":["Peter David Fagan","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.18768v1.pdf","comment":"21 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.13833v2","updated":"2024-09-27T14:04:35Z","published":"2023-11-23T07:33:38Z","title":"Lego: Learning to Disentangle and Invert Personalized Concepts Beyond\n Object Appearance in Text-to-Image Diffusion Models","summary":" Text-to-Image (T2I) models excel at synthesizing concepts such as nouns,\nappearances, and styles. To enable customized content creation based on a few\nexample images of a concept, methods such as Textual Inversion and DreamBooth\ninvert the desired concept and enable synthesizing it in new scenes. However,\ninverting personalized concepts that go beyond object appearance and style\n(adjectives and verbs) through natural language remains a challenge. Two key\ncharacteristics of these concepts contribute to the limitations of current\ninversion methods. 1) Adjectives and verbs are entangled with nouns (subject)\nand can hinder appearance-based inversion methods, where the subject appearance\nleaks into the concept embedding, and 2) describing such concepts often extends\nbeyond single word embeddings.\n In this study, we introduce Lego, a textual inversion method designed to\ninvert subject-entangled concepts from a few example images. Lego disentangles\nconcepts from their associated subjects using a simple yet effective Subject\nSeparation step and employs a Context Loss that guides the inversion of\nsingle/multi-embedding concepts. In a thorough user study, Lego-generated\nconcepts were preferred over 70% of the time when compared to the baseline in\nterms of authentically generating concepts according to a reference.\nAdditionally, visual question answering using an LLM suggested Lego-generated\nconcepts are better aligned with the text description of the concept.\n","authors":["Saman Motamed","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.13833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18761v1","updated":"2024-09-27T13:55:10Z","published":"2024-09-27T13:55:10Z","title":"Geometric deep learning for galaxy-halo connection: a case study for\n galaxy intrinsic alignments","summary":" Forthcoming cosmological imaging surveys, such as the Rubin Observatory LSST,\nrequire large-scale simulations encompassing realistic galaxy populations for a\nvariety of scientific applications. Of particular concern is the phenomenon of\nintrinsic alignments (IA), whereby galaxies orient themselves towards\noverdensities, potentially introducing significant systematic biases in weak\ngravitational lensing analyses if they are not properly modeled. Due to\ncomputational constraints, simulating the intricate details of galaxy formation\nand evolution relevant to IA across vast volumes is impractical. As an\nalternative, we propose a Deep Generative Model trained on the IllustrisTNG-100\nsimulation to sample 3D galaxy shapes and orientations to accurately reproduce\nintrinsic alignments along with correlated scalar features. We model the cosmic\nweb as a set of graphs, each graph representing a halo with nodes representing\nthe subhalos/galaxies. The architecture consists of a SO(3) $\\times$\n$\\mathbb{R}^n$ diffusion generative model, for galaxy orientations and $n$\nscalars, implemented with E(3) equivariant Graph Neural Networks that\nexplicitly respect the Euclidean symmetries of our Universe. The model is able\nto learn and predict features such as galaxy orientations that are\nstatistically consistent with the reference simulation. Notably, our model\ndemonstrates the ability to jointly model Euclidean-valued scalars (galaxy\nsizes, shapes, and colors) along with non-Euclidean valued SO(3) quantities\n(galaxy orientations) that are governed by highly complex galactic physics at\nnon-linear scales.\n","authors":["Yesukhei Jagvaral","Francois Lanusse","Rachel Mandelbaum"],"pdf_url":"https://arxiv.org/pdf/2409.18761v1.pdf","comment":"12 pages, 5 figures. submitted to MNRAS"},{"id":"http://arxiv.org/abs/2409.18749v1","updated":"2024-09-27T13:39:47Z","published":"2024-09-27T13:39:47Z","title":"TensorSocket: Shared Data Loading for Deep Learning Training","summary":" Training deep learning models is a repetitive and resource-intensive process.\nData scientists often train several models before landing on set of parameters\n(e.g., hyper-parameter tuning), model architecture (e.g., neural architecture\nsearch), among other things that yields the highest accuracy. The computational\nefficiency of these training tasks depends highly on how well we can supply the\ntraining process with training data. The repetitive nature of these tasks\nresults in the same data processing pipelines running over and over\nexacerbating the need for and costs of computational resources.\n In this paper, we present Tensorsocket to reduce the computational needs of\ndeep learning training by enabling simultaneous training processes to share the\nsame data loader. Tensorsocket mitigates CPU-side bottlenecks in cases where\nthe collocated training workloads have high throughput on GPU, but are held\nback by lower data-loading throughput on CPU. Tensorsocket achieves this by\nreducing redundant computations across collocated training processes and\nleveraging modern GPU-GPU interconnects. We demonstrate the hardware- and\npipeline-agnostic nature of Tensorsocket and evaluate it using a variety of\ntraining scenarios.\n Our evaluation shows that Tensorsocket enables scenarios that are infeasible\nwithout data sharing, increases training throughput by up to $100\\%$, and when\nutilizing cloud instances, Tensorsocket achieves cost savings of $50\\%$ by\nreducing the hardware resource needs on the CPU side. Furthermore, Tensorsocket\noutperforms the state-of-the-art solutions for shared data loading such as\nCoorDL and Joader. It is easier to use, maintain, and deploy, and either\nachieves higher or matches the throughput of other solutions while requiring\nless CPU resources.\n","authors":["Ties Robroek","Neil Kim Nielsen","Pınar Tözün"],"pdf_url":"https://arxiv.org/pdf/2409.18749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18747v1","updated":"2024-09-27T13:38:36Z","published":"2024-09-27T13:38:36Z","title":"Cottention: Linear Transformers With Cosine Attention","summary":" Attention mechanisms, particularly softmax attention, have been instrumental\nin the success of transformer-based models such as GPT. However, the quadratic\nmemory complexity of softmax attention with respect to sequence length poses\nsignificant challenges for processing longer sequences. We introduce\nCottention, a novel attention mechanism that replaces the softmax operation\nwith cosine similarity. By leveraging the properties of cosine similarity and\nrearranging the attention equation, Cottention achieves native linear memory\ncomplexity with respect to sequence length, making it inherently more\nmemory-efficient than softmax attention. We demonstrate that Cottention can be\nreformulated as a recurrent neural network (RNN) with a finite hidden state,\nallowing for constant memory usage during inference. We evaluate Cottention on\nboth the bidirectional BERT and causal GPT tasks, demonstrating comparable\nperformance to softmax attention while significantly reducing memory\nrequirements. To ensure efficient computation, we develop a custom CUDA kernel\nfor Cottention. Our results show that Cottention is a promising alternative to\nsoftmax attention, enabling the processing of longer sequences without\nsacrificing performance, due to its native linear memory complexity and ability\nto maintain a constant memory footprint during inference.\n","authors":["Gabriel Mongaras","Trevor Dohm","Eric C. Larson"],"pdf_url":"https://arxiv.org/pdf/2409.18747v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.20611v3","updated":"2024-09-27T13:29:00Z","published":"2024-05-31T03:57:19Z","title":"Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in\n Lifted Compiled Code","summary":" Detecting vulnerabilities within compiled binaries is challenging due to lost\nhigh-level code structures and other factors such as architectural\ndependencies, compilers, and optimization options. To address these obstacles,\nthis research explores vulnerability detection using natural language\nprocessing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn\nsemantics from intermediate representation (LLVM IR) code. Long short-term\nmemory (LSTM) neural networks were trained on embeddings from encoders created\nusing approximately 48k LLVM functions from the Juliet dataset. This study is\npioneering in its comparison of word2vec models with multiple bidirectional\ntransformers (BERT, RoBERTa) embeddings built using LLVM code to train neural\nnetworks to detect vulnerabilities in compiled binaries. Word2vec Skip-Gram\nmodels achieved 92% validation accuracy in detecting vulnerabilities,\noutperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This\nsuggests that complex contextual embeddings may not provide advantages over\nsimpler word2vec models for this task when a limited number (e.g. 48K) of data\nsamples are used to train the bidirectional transformer-based models. The\ncomparative results provide novel insights into selecting optimal embeddings\nfor learning compiler-independent semantic code representations to advance\nmachine learning detection of vulnerabilities in compiled binaries.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2405.20611v3.pdf","comment":"Updated with improvements"},{"id":"http://arxiv.org/abs/2409.18735v1","updated":"2024-09-27T13:27:15Z","published":"2024-09-27T13:27:15Z","title":"Autoregressive Policy Optimization for Constrained Allocation Tasks","summary":" Allocation tasks represent a class of problems where a limited amount of\nresources must be allocated to a set of entities at each time step. Prominent\nexamples of this task include portfolio optimization or distributing\ncomputational workloads across servers. Allocation tasks are typically bound by\nlinear constraints describing practical requirements that have to be strictly\nfulfilled at all times. In portfolio optimization, for example, investors may\nbe obligated to allocate less than 30\\% of the funds into a certain industrial\nsector in any investment period. Such constraints restrict the action space of\nallowed allocations in intricate ways, which makes learning a policy that\navoids constraint violations difficult. In this paper, we propose a new method\nfor constrained allocation tasks based on an autoregressive process to\nsequentially sample allocations for each entity. In addition, we introduce a\nnovel de-biasing mechanism to counter the initial bias caused by sequential\nsampling. We demonstrate the superior performance of our approach compared to a\nvariety of Constrained Reinforcement Learning (CRL) methods on three distinct\nconstrained allocation tasks: portfolio optimization, computational workload\ndistribution, and a synthetic allocation benchmark. Our code is available at:\nhttps://github.com/niklasdbs/paspo\n","authors":["David Winkel","Niklas Strauß","Maximilian Bernhard","Zongyue Li","Thomas Seidl","Matthias Schubert"],"pdf_url":"https://arxiv.org/pdf/2409.18735v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18721v1","updated":"2024-09-27T13:17:59Z","published":"2024-09-27T13:17:59Z","title":"Scalable Cross-Entropy Loss for Sequential Recommendations with Large\n Item Catalogs","summary":" Scalability issue plays a crucial role in productionizing modern recommender\nsystems. Even lightweight architectures may suffer from high computational\noverload due to intermediate calculations, limiting their practicality in\nreal-world applications. Specifically, applying full Cross-Entropy (CE) loss\noften yields state-of-the-art performance in terms of recommendations quality.\nStill, it suffers from excessive GPU memory utilization when dealing with large\nitem catalogs. This paper introduces a novel Scalable Cross-Entropy (SCE) loss\nfunction in the sequential learning setup. It approximates the CE loss for\ndatasets with large-size catalogs, enhancing both time efficiency and memory\nusage without compromising recommendations quality. Unlike traditional negative\nsampling methods, our approach utilizes a selective GPU-efficient computation\nstrategy, focusing on the most informative elements of the catalog,\nparticularly those most likely to be false positives. This is achieved by\napproximating the softmax distribution over a subset of the model outputs\nthrough the maximum inner product search. Experimental results on multiple\ndatasets demonstrate the effectiveness of SCE in reducing peak memory usage by\na factor of up to 100 compared to the alternatives, retaining or even exceeding\ntheir metrics values. The proposed approach also opens new perspectives for\nlarge-scale developments in different domains, such as large language models.\n","authors":["Gleb Mezentsev","Danil Gusak","Ivan Oseledets","Evgeny Frolov"],"pdf_url":"https://arxiv.org/pdf/2409.18721v1.pdf","comment":"11 pages, accepted for RecSys'24"},{"id":"http://arxiv.org/abs/2404.03471v3","updated":"2024-09-27T13:12:23Z","published":"2024-04-04T14:24:06Z","title":"The Impact of Unstated Norms in Bias Analysis of Language Models","summary":" Bias in large language models (LLMs) has many forms, from overt\ndiscrimination to implicit stereotypes. Counterfactual bias evaluation is a\nwidely used approach to quantifying bias and often relies on template-based\nprobes that explicitly state group membership. It measures whether the outcome\nof a task, performed by an LLM, is invariant to a change of group membership.\nIn this work, we find that template-based probes can lead to unrealistic bias\nmeasurements. For example, LLMs appear to mistakenly cast text associated with\nWhite race as negative at higher rates than other groups. We hypothesize that\nthis arises artificially via a mismatch between commonly unstated norms, in the\nform of markedness, in the pretraining text of LLMs (e.g., Black president vs.\npresident) and templates used for bias measurement (e.g., Black president vs.\nWhite president). The findings highlight the potential misleading impact of\nvarying group membership through explicit mention in counterfactual bias\nquantification.\n","authors":["Farnaz Kohankhaki","D. B. Emerson","Jacob-Junqi Tian","Laleh Seyyed-Kalantari","Faiza Khan Khattak"],"pdf_url":"https://arxiv.org/pdf/2404.03471v3.pdf","comment":"23 Pages, 5 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2409.12059v2","updated":"2024-09-27T13:07:26Z","published":"2024-09-18T15:32:48Z","title":"Dual-Layer Training and Decoding of Large Language Model with\n Simultaneously Thinking and Speaking","summary":" Large Language Model can reasonably understand and generate human expressions\nbut may lack of thorough thinking and reasoning mechanisms. Recently there have\nbeen several studies which enhance the thinking ability of language models but\nmost of them are not data-driven or training-based. In this paper, we are\nmotivated by the cognitive mechanism in the natural world, and design a novel\nmodel architecture called TaS which allows it to first consider the thoughts\nand then express the response based upon the query. We design several pipelines\nto annotate or generate the thought contents from prompt-response samples, then\nadd language heads in a middle layer which behaves as the thinking layer. We\ntrain the language model by the thoughts-augmented data and successfully let\nthe thinking layer automatically generate reasonable thoughts and finally\noutput more reasonable responses. Both qualitative examples and quantitative\nresults validate the effectiveness and performance of TaS. Our code is\navailable at https://anonymous.4open.science/r/TadE.\n","authors":["Ningyuan Xi","Xiaoyu Wang","Yetao Wu","Teng Chen","Qingqing Gu","Jinxian Qu","Zhonglin Jiang","Yong Chen","Luo Ji"],"pdf_url":"https://arxiv.org/pdf/2409.12059v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18718v1","updated":"2024-09-27T13:05:02Z","published":"2024-09-27T13:05:02Z","title":"Enhancing Spectrum Efficiency in 6G Satellite Networks: A GAIL-Powered\n Policy Learning via Asynchronous Federated Inverse Reinforcement Learning","summary":" In this paper, a novel generative adversarial imitation learning\n(GAIL)-powered policy learning approach is proposed for optimizing beamforming,\nspectrum allocation, and remote user equipment (RUE) association in NTNs.\nTraditional reinforcement learning (RL) methods for wireless network\noptimization often rely on manually designed reward functions, which can\nrequire extensive parameter tuning. To overcome these limitations, we employ\ninverse RL (IRL), specifically leveraging the GAIL framework, to automatically\nlearn reward functions without manual design. We augment this framework with an\nasynchronous federated learning approach, enabling decentralized\nmulti-satellite systems to collaboratively derive optimal policies. The\nproposed method aims to maximize spectrum efficiency (SE) while meeting minimum\ninformation rate requirements for RUEs. To address the non-convex, NP-hard\nnature of this problem, we combine the many-to-one matching theory with a\nmulti-agent asynchronous federated IRL (MA-AFIRL) framework. This allows agents\nto learn through asynchronous environmental interactions, improving training\nefficiency and scalability. The expert policy is generated using the Whale\noptimization algorithm (WOA), providing data to train the automatic reward\nfunction within GAIL. Simulation results show that the proposed MA-AFIRL method\noutperforms traditional RL approaches, achieving a $14.6\\%$ improvement in\nconvergence and reward value. The novel GAIL-driven policy learning establishes\na novel benchmark for 6G NTN optimization.\n","authors":["Sheikh Salman Hassan","Yu Min Park","Yan Kyaw Tun","Walid Saad","Zhu Han","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2409.18718v1.pdf","comment":"Submitted to IEEE Transactions on Mobile Computing (16 pages, 10\n figures)"},{"id":"http://arxiv.org/abs/2409.01869v2","updated":"2024-09-27T13:04:55Z","published":"2024-09-03T13:12:49Z","title":"Feature-Based Interpretable Surrogates for Optimization","summary":" For optimization models to be used in practice, it is crucial that users\ntrust the results. A key factor in this aspect is the interpretability of the\nsolution process. A previous framework for inherently interpretable\noptimization models used decision trees to map instances to solutions of the\nunderlying optimization model. Based on this work, we investigate how we can\nuse more general optimization rules to further increase interpretability and,\nat the same time, give more freedom to the decision-maker. The proposed rules\ndo not map to a concrete solution but to a set of solutions characterized by\ncommon features. To find such optimization rules, we present an exact\nmethodology using mixed-integer programming formulations as well as heuristics.\nWe also outline the challenges and opportunities that these methods present. In\nparticular, we demonstrate the improvement in solution quality that our\napproach offers compared to existing interpretable surrogates for optimization,\nand we discuss the relationship between interpretability and performance. These\nfindings are supported by experiments using both synthetic and real-world data.\n","authors":["Marc Goerigk","Michael Hartisch","Sebastian Merten","Kartikey Sharma"],"pdf_url":"https://arxiv.org/pdf/2409.01869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15360v2","updated":"2024-09-27T12:36:58Z","published":"2024-09-18T02:35:41Z","title":"Reward-Robust RLHF in LLMs","summary":" As Large Language Models (LLMs) continue to progress toward more advanced\nforms of intelligence, Reinforcement Learning from Human Feedback (RLHF) is\nincreasingly seen as a key pathway toward achieving Artificial General\nIntelligence (AGI). However, the reliance on reward-model-based (RM-based)\nalignment methods introduces significant challenges due to the inherent\ninstability and imperfections of Reward Models (RMs), which can lead to\ncritical issues such as reward hacking and misalignment with human intentions.\nIn this paper, we introduce a reward-robust RLHF framework aimed at addressing\nthese fundamental challenges, paving the way for more reliable and resilient\nlearning in LLMs. Our approach introduces a novel optimization objective that\ncarefully balances performance and robustness by incorporating Bayesian Reward\nModel Ensembles (BRME) to model the uncertainty set of reward functions. This\nallows the framework to integrate both nominal performance and minimum reward\nsignals, ensuring more stable learning even with imperfect RMs. Empirical\nresults demonstrate that our framework consistently outperforms baselines\nacross diverse benchmarks, showing improved accuracy and long-term stability.\nWe also provide a theoretical analysis, demonstrating that reward-robust RLHF\napproaches the stability of constant reward settings, which proves to be\nacceptable even in a stochastic-case analysis. Together, these contributions\nhighlight the framework potential to enhance both the performance and stability\nof LLM alignment.\n","authors":["Yuzi Yan","Xingzhou Lou","Jialian Li","Yiping Zhang","Jian Xie","Chao Yu","Yu Wang","Dong Yan","Yuan Shen"],"pdf_url":"https://arxiv.org/pdf/2409.15360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18696v1","updated":"2024-09-27T12:34:08Z","published":"2024-09-27T12:34:08Z","title":"Rethinking the Power of Timestamps for Robust Time Series Forecasting: A\n Global-Local Fusion Perspective","summary":" Time series forecasting has played a pivotal role across various industries,\nincluding finance, transportation, energy, healthcare, and climate. Due to the\nabundant seasonal information they contain, timestamps possess the potential to\noffer robust global guidance for forecasting techniques. However, existing\nworks primarily focus on local observations, with timestamps being treated\nmerely as an optional supplement that remains underutilized. When data gathered\nfrom the real world is polluted, the absence of global information will damage\nthe robust prediction capability of these algorithms. To address these\nproblems, we propose a novel framework named GLAFF. Within this framework, the\ntimestamps are modeled individually to capture the global dependencies. Working\nas a plugin, GLAFF adaptively adjusts the combined weights for global and local\ninformation, enabling seamless collaboration with any time series forecasting\nbackbone. Extensive experiments conducted on nine real-world datasets\ndemonstrate that GLAFF significantly enhances the average performance of widely\nused mainstream forecasting models by 12.5%, surpassing the previous\nstate-of-the-art method by 5.5%.\n","authors":["Chengsen Wang","Qi Qi","Jingyu Wang","Haifeng Sun","Zirui Zhuang","Jinming Wu","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2409.18696v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18692v1","updated":"2024-09-27T12:28:18Z","published":"2024-09-27T12:28:18Z","title":"MG-Net: Learn to Customize QAOA with Circuit Depth Awareness","summary":" Quantum Approximate Optimization Algorithm (QAOA) and its variants exhibit\nimmense potential in tackling combinatorial optimization challenges. However,\ntheir practical realization confronts a dilemma: the requisite circuit depth\nfor satisfactory performance is problem-specific and often exceeds the maximum\ncapability of current quantum devices. To address this dilemma, here we first\nanalyze the convergence behavior of QAOA, uncovering the origins of this\ndilemma and elucidating the intricate relationship between the employed mixer\nHamiltonian, the specific problem at hand, and the permissible maximum circuit\ndepth. Harnessing this understanding, we introduce the Mixer Generator Network\n(MG-Net), a unified deep learning framework adept at dynamically formulating\noptimal mixer Hamiltonians tailored to distinct tasks and circuit depths.\nSystematic simulations, encompassing Ising models and weighted Max-Cut\ninstances with up to 64 qubits, substantiate our theoretical findings,\nhighlighting MG-Net's superior performance in terms of both approximation ratio\nand efficiency.\n","authors":["Yang Qian","Xinbiao Wang","Yuxuan Du","Yong Luo","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2409.18692v1.pdf","comment":"29 pages, 16 figures"},{"id":"http://arxiv.org/abs/2405.08027v3","updated":"2024-09-27T12:27:48Z","published":"2024-05-12T13:36:58Z","title":"Automating Data Annotation under Strategic Human Agents: Risks and\n Potential Solutions","summary":" As machine learning (ML) models are increasingly used in social domains to\nmake consequential decisions about humans, they often have the power to reshape\ndata distributions. Humans, as strategic agents, continuously adapt their\nbehaviors in response to the learning system. As populations change\ndynamically, ML systems may need frequent updates to ensure high performance.\nHowever, acquiring high-quality human-annotated samples can be highly\nchallenging and even infeasible in social domains. A common practice to address\nthis issue is using the model itself to annotate unlabeled data samples. This\npaper investigates the long-term impacts when ML models are retrained with\nmodel-annotated samples when they incorporate human strategic responses. We\nfirst formalize the interactions between strategic agents and the model and\nthen analyze how they evolve under such dynamic interactions. We find that\nagents are increasingly likely to receive positive decisions as the model gets\nretrained, whereas the proportion of agents with positive labels may decrease\nover time. We thus propose a refined retraining process to stabilize the\ndynamics. Last, we examine how algorithmic fairness can be affected by these\nretraining processes and find that enforcing common fairness constraints at\nevery round may not benefit the disadvantaged group in the long run.\nExperiments on (semi-)synthetic and real data validate the theoretical\nfindings.\n","authors":["Tian Xie","Xueru Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.08027v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06069v2","updated":"2024-09-27T12:23:04Z","published":"2024-03-10T03:22:57Z","title":"Implicit Image-to-Image Schrodinger Bridge for Image Restoration","summary":" Diffusion-based models are widely recognized for their effectiveness in image\nrestoration tasks; however, their iterative denoising process, which begins\nfrom Gaussian noise, often results in slow inference speeds. The Image-to-Image\nSchr\\\"odinger Bridge (I$^2$SB) presents a promising alternative by starting the\ngenerative process from corrupted images and leveraging training techniques\nfrom score-based diffusion models. In this paper, we introduce the Implicit\nImage-to-Image Schr\\\"odinger Bridge (I$^3$SB) to further accelerate the\ngenerative process of I$^2$SB. I$^3$SB reconfigures the generative process into\na non-Markovian framework by incorporating the initial corrupted image into\neach step, while ensuring that the marginal distribution aligns with that of\nI$^2$SB. This allows for the direct use of the pretrained network from I$^2$SB.\nExtensive experiments on natural images, human face images, and medical images\nvalidate the acceleration benefits of I$^3$SB. Compared to I$^2$SB, I$^3$SB\nachieves the same perceptual quality with fewer generative steps, while\nmaintaining equal or improved fidelity to the ground truth.\n","authors":["Yuang Wang","Siyeop Yoon","Pengfei Jin","Matthew Tivnan","Sifan Song","Zhennong Chen","Rui Hu","Li Zhang","Quanzheng Li","Zhiqiang Chen","Dufan Wu"],"pdf_url":"https://arxiv.org/pdf/2403.06069v2.pdf","comment":"23 pages, 8 figures, submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2409.18685v1","updated":"2024-09-27T12:19:41Z","published":"2024-09-27T12:19:41Z","title":"Understanding the Benefits of SimCLR Pre-Training in Two-Layer\n Convolutional Neural Networks","summary":" SimCLR is one of the most popular contrastive learning methods for vision\ntasks. It pre-trains deep neural networks based on a large amount of unlabeled\ndata by teaching the model to distinguish between positive and negative pairs\nof augmented images. It is believed that SimCLR can pre-train a deep neural\nnetwork to learn efficient representations that can lead to a better\nperformance of future supervised fine-tuning. Despite its effectiveness, our\ntheoretical understanding of the underlying mechanisms of SimCLR is still\nlimited. In this paper, we theoretically introduce a case study of the SimCLR\nmethod. Specifically, we consider training a two-layer convolutional neural\nnetwork (CNN) to learn a toy image data model. We show that, under certain\nconditions on the number of labeled data, SimCLR pre-training combined with\nsupervised fine-tuning achieves almost optimal test loss. Notably, the label\ncomplexity for SimCLR pre-training is far less demanding compared to direct\ntraining on supervised data. Our analysis sheds light on the benefits of SimCLR\nin learning with fewer labels.\n","authors":["Han Zhang","Yuan Cao"],"pdf_url":"https://arxiv.org/pdf/2409.18685v1.pdf","comment":"65 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.07309v4","updated":"2024-09-27T12:02:44Z","published":"2024-02-11T21:16:26Z","title":"HyperBERT: Mixing Hypergraph-Aware Layers with Language Models for Node\n Classification on Text-Attributed Hypergraphs","summary":" Hypergraphs are characterized by complex topological structure, representing\nhigher-order interactions among multiple entities through hyperedges. Lately,\nhypergraph-based deep learning methods to learn informative data\nrepresentations for the problem of node classification on text-attributed\nhypergraphs have garnered increasing research attention. However, existing\nmethods struggle to simultaneously capture the full extent of hypergraph\nstructural information and the rich linguistic attributes inherent in the nodes\nattributes, which largely hampers their effectiveness and generalizability. To\novercome these challenges, we explore ways to further augment a pretrained BERT\nmodel with specialized hypergraph-aware layers for the task of node\nclassification. Such layers introduce higher-order structural inductive bias\ninto the language model, thus improving the model's capacity to harness both\nhigher-order context information from the hypergraph structure and semantic\ninformation present in text. In this paper, we propose a new architecture,\nHyperBERT, a mixed text-hypergraph model which simultaneously models hypergraph\nrelational structure while maintaining the high-quality text encoding\ncapabilities of a pre-trained BERT. Notably, HyperBERT presents results that\nachieve a new state-of-the-art on five challenging text-attributed hypergraph\nnode classification benchmarks.\n","authors":["Adrián Bazaga","Pietro Liò","Gos Micklem"],"pdf_url":"https://arxiv.org/pdf/2402.07309v4.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18664v1","updated":"2024-09-27T11:50:10Z","published":"2024-09-27T11:50:10Z","title":"How green is continual learning, really? Analyzing the energy\n consumption in continual training of vision foundation models","summary":" With the ever-growing adoption of AI, its impact on the environment is no\nlonger negligible. Despite the potential that continual learning could have\ntowards Green AI, its environmental sustainability remains relatively\nuncharted. In this work we aim to gain a systematic understanding of the energy\nefficiency of continual learning algorithms. To that end, we conducted an\nextensive set of empirical experiments comparing the energy consumption of\nrecent representation-, prompt-, and exemplar-based continual learning\nalgorithms and two standard baseline (fine tuning and joint training) when used\nto continually adapt a pre-trained ViT-B/16 foundation model. We performed our\nexperiments on three standard datasets: CIFAR-100, ImageNet-R, and DomainNet.\nAdditionally, we propose a novel metric, the Energy NetScore, which we use\nmeasure the algorithm efficiency in terms of energy-accuracy trade-off. Through\nnumerous evaluations varying the number and size of the incremental learning\nsteps, our experiments demonstrate that different types of continual learning\nalgorithms have very different impacts on energy consumption during both\ntraining and inference. Although often overlooked in the continual learning\nliterature, we found that the energy consumed during the inference phase is\ncrucial for evaluating the environmental sustainability of continual learning\nmodels.\n","authors":["Tomaso Trinci","Simone Magistri","Roberto Verdecchia","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2409.18664v1.pdf","comment":"This manuscript has been accepted at the Green FOundation MOdels\n (GreenFOMO) ECCV 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.06349v2","updated":"2024-09-27T11:45:09Z","published":"2024-04-09T14:40:08Z","title":"CausalBench: A Comprehensive Benchmark for Causal Learning Capability of\n LLMs","summary":" The ability to understand causality significantly impacts the competence of\nlarge language models (LLMs) in output explanation and counterfactual\nreasoning, as causality reveals the underlying data distribution. However, the\nlack of a comprehensive benchmark currently limits the evaluation of LLMs'\ncausal learning capabilities. To fill this gap, this paper develops CausalBench\nbased on data from the causal research community, enabling comparative\nevaluations of LLMs against traditional causal learning algorithms. To provide\na comprehensive investigation, we offer three tasks of varying difficulties,\nincluding correlation, causal skeleton, and causality identification.\nEvaluations of 19 leading LLMs reveal that, while closed-source LLMs show\npotential for simple causal relationships, they significantly lag behind\ntraditional algorithms on larger-scale networks ($>50$ nodes). Specifically,\nLLMs struggle with collider structures but excel at chain structures,\nespecially at long-chain causality analogous to Chains-of-Thought techniques.\nThis supports the current prompt approaches while suggesting directions to\nenhance LLMs' causal reasoning capability. Furthermore, CausalBench\nincorporates background knowledge and training data into prompts to thoroughly\nunlock LLMs' text-comprehension ability during evaluation, whose findings\nindicate that, LLM understand causality through semantic associations with\ndistinct entities, rather than directly from contextual information or\nnumerical distributions.\n","authors":["Yu Zhou","Xingyu Wu","Beicheng Huang","Jibin Wu","Liang Feng","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2404.06349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17461v2","updated":"2024-09-27T11:34:05Z","published":"2024-05-23T05:25:45Z","title":"EMR-Merging: Tuning-Free High-Performance Model Merging","summary":" The success of pretrain-finetune paradigm brings about the release of\nnumerous model weights. In this case, merging models finetuned on different\ntasks to enable a single model with multi-task capabilities is gaining\nincreasing attention for its practicability. Existing model merging methods\nusually suffer from (1) significant performance degradation or (2) requiring\ntuning by additional data or training. In this paper, we rethink and analyze\nthe existing model merging paradigm. We discover that using a single model's\nweights can hardly simulate all the models' performance. To tackle this issue,\nwe propose Elect, Mask & Rescale-Merging (EMR-Merging). We first (a) elect a\nunified model from all the model weights and then (b) generate extremely\nlightweight task-specific modulators, including masks and rescalers, to align\nthe direction and magnitude between the unified model and each specific model,\nrespectively. EMR-Merging is tuning-free, thus requiring no data availability\nor any additional training while showing impressive performance. We find that\nEMR-Merging shows outstanding performance compared to existing merging methods\nunder different classical and newly-established settings, including merging\ndifferent numbers of vision models (up to 30), NLP models, PEFT models, and\nmulti-modal models.\n","authors":["Chenyu Huang","Peng Ye","Tao Chen","Tong He","Xiangyu Yue","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2405.17461v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.11678v2","updated":"2024-09-27T11:17:13Z","published":"2024-09-18T03:34:31Z","title":"An Enhanced-State Reinforcement Learning Algorithm for Multi-Task Fusion\n in Large-Scale Recommender Systems","summary":" As the last key stage of Recommender Systems (RSs), Multi-Task Fusion (MTF)\nis in charge of combining multiple scores predicted by Multi-Task Learning\n(MTL) into a final score to maximize user satisfaction, which decides the\nultimate recommendation results. In recent years, to maximize long-term user\nsatisfaction within a recommendation session, Reinforcement Learning (RL) is\nwidely used for MTF in large-scale RSs. However, limited by their modeling\npattern, all the current RL-MTF methods can only utilize user features as the\nstate to generate actions for each user, but unable to make use of item\nfeatures and other valuable features, which leads to suboptimal results.\nAddressing this problem is a challenge that requires breaking through the\ncurrent modeling pattern of RL-MTF. To solve this problem, we propose a novel\nmethod called Enhanced-State RL for MTF in RSs. Unlike the existing methods\nmentioned above, our method first defines user features, item features, and\nother valuable features collectively as the enhanced state; then proposes a\nnovel actor and critic learning process to utilize the enhanced state to make\nmuch better action for each user-item pair. To the best of our knowledge, this\nnovel modeling pattern is being proposed for the first time in the field of\nRL-MTF. We conduct extensive offline and online experiments in a large-scale\nRS. The results demonstrate that our model outperforms other models\nsignificantly. Enhanced-State RL has been fully deployed in our RS more than\nhalf a year, improving +3.84% user valid consumption and +0.58% user duration\ntime compared to baseline.\n","authors":["Peng Liu","Jiawei Zhu","Cong Xu","Ming Zhao","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.11678v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.17589"},{"id":"http://arxiv.org/abs/2404.17589v3","updated":"2024-09-27T11:08:17Z","published":"2024-04-19T08:43:03Z","title":"An Off-Policy Reinforcement Learning Algorithm Customized for Multi-Task\n Fusion in Large-Scale Recommender Systems","summary":" As the last critical stage of RSs, Multi-Task Fusion (MTF) is responsible for\ncombining multiple scores outputted by Multi-Task Learning (MTL) into a final\nscore to maximize user satisfaction, which determines the ultimate\nrecommendation results. Recently, to optimize long-term user satisfaction\nwithin a recommendation session, Reinforcement Learning (RL) is used for MTF in\nthe industry. However, the off-policy RL algorithms used for MTF so far have\nthe following severe problems: 1) to avoid out-of-distribution (OOD) problem,\ntheir constraints are overly strict, which seriously damage their performance;\n2) they are unaware of the exploration policy used for producing training data\nand never interact with real environment, so only suboptimal policy can be\nlearned; 3) the traditional exploration policies are inefficient and hurt user\nexperience. To solve the above problems, we propose a novel method named\nIntegratedRL-MTF customized for MTF in large-scale RSs. IntegratedRL-MTF\nintegrates off-policy RL model with our online exploration policy to relax\noverstrict and complicated constraints, which significantly improves its\nperformance. We also design an extremely efficient exploration policy, which\neliminates low-value exploration space and focuses on exploring potential\nhigh-value state-action pairs. Moreover, we adopt progressive training mode to\nfurther enhance our model's performance with the help of our exploration\npolicy. We conduct extensive offline and online experiments in the short video\nchannel of Tencent News. The results demonstrate that our model outperforms\nother models remarkably. IntegratedRL-MTF has been fully deployed in our RS and\nother large-scale RSs in Tencent, which have achieved significant improvements.\n","authors":["Peng Liu","Cong Xu","Ming Zhao","Jiawei Zhu","Bin Wang","Yi Ren"],"pdf_url":"https://arxiv.org/pdf/2404.17589v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10357v2","updated":"2024-09-27T10:59:21Z","published":"2024-09-16T15:06:12Z","title":"2D or not 2D: How Does the Dimensionality of Gesture Representation\n Affect 3D Co-Speech Gesture Generation?","summary":" Co-speech gestures are fundamental for communication. The advent of recent\ndeep learning techniques has facilitated the creation of lifelike, synchronous\nco-speech gestures for Embodied Conversational Agents. \"In-the-wild\" datasets,\naggregating video content from platforms like YouTube via human pose detection\ntechnologies, provide a feasible solution by offering 2D skeletal sequences\naligned with speech. Concurrent developments in lifting models enable the\nconversion of these 2D sequences into 3D gesture databases. However, it is\nimportant to note that the 3D poses estimated from the 2D extracted poses are,\nin essence, approximations of the ground-truth, which remains in the 2D domain.\nThis distinction raises questions about the impact of gesture representation\ndimensionality on the quality of generated motions - a topic that, to our\nknowledge, remains largely unexplored. Our study examines the effect of using\neither 2D or 3D joint coordinates as training data on the performance of\nspeech-to-gesture deep generative models. We employ a lifting model for\nconverting generated 2D pose sequences into 3D and assess how gestures created\ndirectly in 3D stack up against those initially generated in 2D and then\nconverted to 3D. We perform an objective evaluation using widely used metrics\nin the gesture generation field as well as a user study to qualitatively\nevaluate the different approaches.\n","authors":["Téo Guichoux","Laure Soulier","Nicolas Obin","Catherine Pelachaud"],"pdf_url":"https://arxiv.org/pdf/2409.10357v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.15111"},{"id":"http://arxiv.org/abs/2409.18630v1","updated":"2024-09-27T10:58:18Z","published":"2024-09-27T10:58:18Z","title":"Entropy, concentration, and learning: a statistical mechanics primer","summary":" Artificial intelligence models trained through loss minimization have\ndemonstrated significant success, grounded in principles from fields like\ninformation theory and statistical physics. This work explores these\nestablished connections through the lens of statistical mechanics, starting\nfrom first-principles sample concentration behaviors that underpin AI and\nmachine learning. Our development of statistical mechanics for modeling\nhighlights the key role of exponential families, and quantities of statistics,\nphysics, and information theory.\n","authors":["Akshay Balsubramani"],"pdf_url":"https://arxiv.org/pdf/2409.18630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18628v1","updated":"2024-09-27T10:55:58Z","published":"2024-09-27T10:55:58Z","title":"Towards Integrating Epistemic Uncertainty Estimation into the\n Radiotherapy Workflow","summary":" The precision of contouring target structures and organs-at-risk (OAR) in\nradiotherapy planning is crucial for ensuring treatment efficacy and patient\nsafety. Recent advancements in deep learning (DL) have significantly improved\nOAR contouring performance, yet the reliability of these models, especially in\nthe presence of out-of-distribution (OOD) scenarios, remains a concern in\nclinical settings. This application study explores the integration of epistemic\nuncertainty estimation within the OAR contouring workflow to enable OOD\ndetection in clinically relevant scenarios, using specifically compiled data.\nFurthermore, we introduce an advanced statistical method for OOD detection to\nenhance the methodological framework of uncertainty estimation. Our empirical\nevaluation demonstrates that epistemic uncertainty estimation is effective in\nidentifying instances where model predictions are unreliable and may require an\nexpert review. Notably, our approach achieves an AUC-ROC of 0.95 for OOD\ndetection, with a specificity of 0.95 and a sensitivity of 0.92 for implant\ncases, underscoring its efficacy. This study addresses significant gaps in the\ncurrent research landscape, such as the lack of ground truth for uncertainty\nestimation and limited empirical evaluations. Additionally, it provides a\nclinically relevant application of epistemic uncertainty estimation in an\nFDA-approved and widely used clinical solution for OAR segmentation from\nVarian, a Siemens Healthineers company, highlighting its practical benefits.\n","authors":["Marvin Tom Teichmann","Manasi Datar","Lisa Kratzke","Fernando Vega","Florin C. Ghesu"],"pdf_url":"https://arxiv.org/pdf/2409.18628v1.pdf","comment":"Keywords: Epistemic Uncertainty - Out-of-Distribution Detection - CT\n Segmentation - OAR contouring - Radiotherapy"},{"id":"http://arxiv.org/abs/2405.05192v2","updated":"2024-09-27T10:53:01Z","published":"2024-05-08T16:30:45Z","title":"Full error analysis of the random deep splitting method for nonlinear\n parabolic PDEs and PIDEs","summary":" In this paper, we present a randomized extension of the deep splitting\nalgorithm introduced in [Beck, Becker, Cheridito, Jentzen, and Neufeld (2021)]\nusing random neural networks suitable to approximately solve both\nhigh-dimensional nonlinear parabolic PDEs and PIDEs with jumps having\n(possibly) infinite activity. We provide a full error analysis of our so-called\nrandom deep splitting method. In particular, we prove that our random deep\nsplitting method converges to the (unique viscosity) solution of the nonlinear\nPDE or PIDE under consideration. Moreover, we empirically analyze our random\ndeep splitting method by considering several numerical examples including both\nnonlinear PDEs and nonlinear PIDEs relevant in the context of pricing of\nfinancial derivatives under default risk. In particular, we empirically\ndemonstrate in all examples that our random deep splitting method can\napproximately solve nonlinear PDEs and PIDEs in 10'000 dimensions within\nseconds.\n","authors":["Ariel Neufeld","Philipp Schmocker","Sizhou Wu"],"pdf_url":"https://arxiv.org/pdf/2405.05192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18624v1","updated":"2024-09-27T10:50:49Z","published":"2024-09-27T10:50:49Z","title":"Unsupervised Cognition","summary":" Unsupervised learning methods have a soft inspiration in cognition models. To\nthis day, the most successful unsupervised learning methods revolve around\nclustering samples in a mathematical space. In this paper we propose a\nstate-of-the-art primitive-based unsupervised learning approach for\ndecision-making inspired by novel cognition models. This representation-centric\napproach models the input space constructively as a distributed hierarchical\nstructure in an input-agnostic way. We compared our approach with current\nstate-of-the-art in unsupervised learning classification, and with current\nstate-of-the-art in cancer type classification. We show how our proposal\noutperforms previous state-of-the-art. We also evaluate some cognition-like\nproperties of our proposal where it not only outperforms the compared\nalgorithms (even supervised learning ones), but it also shows a different, more\ncognition-like, behaviour.\n","authors":["Alfredo Ibias","Hector Antona","Guillem Ramirez-Miranda","Enric Guinovart","Eduard Alarcon"],"pdf_url":"https://arxiv.org/pdf/2409.18624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18611v1","updated":"2024-09-27T10:18:14Z","published":"2024-09-27T10:18:14Z","title":"Differentially Private Non Parametric Copulas: Generating synthetic data\n with non parametric copulas under privacy guarantees","summary":" Creation of synthetic data models has represented a significant advancement\nacross diverse scientific fields, but this technology also brings important\nprivacy considerations for users. This work focuses on enhancing a\nnon-parametric copula-based synthetic data generation model, DPNPC, by\nincorporating Differential Privacy through an Enhanced Fourier Perturbation\nmethod. The model generates synthetic data for mixed tabular databases while\npreserving privacy. We compare DPNPC with three other models (PrivBayes,\nDP-Copula, and DP-Histogram) across three public datasets, evaluating privacy,\nutility, and execution time. DPNPC outperforms others in modeling multivariate\ndependencies, maintaining privacy for small $\\epsilon$ values, and reducing\ntraining times. However, limitations include the need to assess the model's\nperformance with different encoding methods and consider additional privacy\nattacks. Future research should address these areas to enhance\nprivacy-preserving synthetic data generation.\n","authors":["Pablo A. Osorio-Marulanda","John Esteban Castro Ramirez","Mikel Hernández Jiménez","Nicolas Moreno Reyes","Gorka Epelde Unanue"],"pdf_url":"https://arxiv.org/pdf/2409.18611v1.pdf","comment":"12 pages, 5 figures, deciding 2025 conference to which to submit"},{"id":"http://arxiv.org/abs/2409.14378v2","updated":"2024-09-27T10:04:29Z","published":"2024-09-22T09:48:45Z","title":"Sparse Low-Ranked Self-Attention Transformer for Remaining Useful\n Lifetime Prediction of Optical Fiber Amplifiers","summary":" Optical fiber amplifiers are key elements in present optical networks.\nFailures of these components result in high financial loss of income of the\nnetwork operator as the communication traffic over an affected link is\ninterrupted. Applying Remaining useful lifetime (RUL) prediction in the context\nof Predictive Maintenance (PdM) to optical fiber amplifiers to predict upcoming\nsystem failures at an early stage, so that network outages can be minimized\nthrough planning of targeted maintenance actions, ensures reliability and\nsafety. Optical fiber amplifier are complex systems, that work under various\noperating conditions, which makes correct forecasting a difficult task.\nIncreased monitoring capabilities of systems results in datasets that\nfacilitate the application of data-driven RUL prediction methods. Deep learning\nmodels in particular have shown good performance, but generalization based on\ncomparatively small datasets for RUL prediction is difficult. In this paper, we\npropose Sparse Low-ranked self-Attention Transformer (SLAT) as a novel RUL\nprediction method. SLAT is based on an encoder-decoder architecture, wherein\ntwo parallel working encoders extract features for sensors and time steps. By\nutilizing the self-attention mechanism, long-term dependencies can be learned\nfrom long sequences. The implementation of sparsity in the attention matrix and\na low-rank parametrization reduce overfitting and increase generalization.\nExperimental application to optical fiber amplifiers exemplified on EDFA, as\nwell as a reference dataset from turbofan engines, shows that SLAT outperforms\nthe state-of-the-art methods.\n","authors":["Dominic Schneider","Lutz Rapp"],"pdf_url":"https://arxiv.org/pdf/2409.14378v2.pdf","comment":"9 pages, 7 figures, submitted to IEEE Transactions on Machine\n Learning in Communications and Networking (TMLCN)"},{"id":"http://arxiv.org/abs/2407.12789v2","updated":"2024-09-27T10:00:15Z","published":"2024-06-17T13:53:39Z","title":"Generalisation to unseen topologies: Towards control of biological\n neural network activity","summary":" Novel imaging and neurostimulation techniques open doors for advancements in\nclosed-loop control of activity in biological neural networks. This would allow\nfor applications in the investigation of activity propagation, and for\ndiagnosis and treatment of pathological behaviour. Due to the partially\nobservable characteristics of activity propagation, through networks in which\nedges can not be observed, and the dynamic nature of neuronal systems, there is\na need for adaptive, generalisable control. In this paper, we introduce an\nenvironment that procedurally generates neuronal networks with different\ntopologies to investigate this generalisation problem. Additionally, an\nexisting transformer-based architecture is adjusted to evaluate the\ngeneralisation performance of a deep RL agent in the presented partially\nobservable environment. The agent demonstrates the capability to generalise\ncontrol from a limited number of training networks to unseen test networks.\n","authors":["Laurens Engwegen","Daan Brinks","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2407.12789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04924v5","updated":"2024-09-27T09:57:42Z","published":"2024-02-07T14:49:10Z","title":"Two Trades is not Baffled: Condensing Graph via Crafting Rational\n Gradient Matching","summary":" Training on large-scale graphs has achieved remarkable results in graph\nrepresentation learning, but its cost and storage have raised growing concerns.\nAs one of the most promising directions, graph condensation methods address\nthese issues by employing gradient matching, aiming to condense the full graph\ninto a more concise yet information-rich synthetic set. Though encouraging,\nthese strategies primarily emphasize matching directions of the gradients,\nwhich leads to deviations in the training trajectories. Such deviations are\nfurther magnified by the differences between the condensation and evaluation\nphases, culminating in accumulated errors, which detrimentally affect the\nperformance of the condensed graphs. In light of this, we propose a novel graph\ncondensation method named \\textbf{C}raf\\textbf{T}ing \\textbf{R}ationa\\textbf{L}\ntrajectory (\\textbf{CTRL}), which offers an optimized starting point closer to\nthe original dataset's feature distribution and a more refined strategy for\ngradient matching. Theoretically, CTRL can effectively neutralize the impact of\naccumulated errors on the performance of condensed graphs. We provide extensive\nexperiments on various graph datasets and downstream tasks to support the\neffectiveness of CTRL. Code is released at\nhttps://github.com/NUS-HPC-AI-Lab/CTRL.\n","authors":["Tianle Zhang","Yuchen Zhang","Kun Wang","Kai Wang","Beining Yang","Kaipeng Zhang","Wenqi Shao","Ping Liu","Joey Tianyi Zhou","Yang You"],"pdf_url":"https://arxiv.org/pdf/2402.04924v5.pdf","comment":"An effective method for graph condensation"},{"id":"http://arxiv.org/abs/2409.18597v1","updated":"2024-09-27T09:56:20Z","published":"2024-09-27T09:56:20Z","title":"TemporalPaD: a reinforcement-learning framework for temporal feature\n representation and dimension reduction","summary":" Recent advancements in feature representation and dimension reduction have\nhighlighted their crucial role in enhancing the efficacy of predictive\nmodeling. This work introduces TemporalPaD, a novel end-to-end deep learning\nframework designed for temporal pattern datasets. TemporalPaD integrates\nreinforcement learning (RL) with neural networks to achieve concurrent feature\nrepresentation and feature reduction. The framework consists of three\ncooperative modules: a Policy Module, a Representation Module, and a\nClassification Module, structured based on the Actor-Critic (AC) framework. The\nPolicy Module, responsible for dimensionality reduction through RL, functions\nas the actor, while the Representation Module for feature extraction and the\nClassification Module collectively serve as the critic. We comprehensively\nevaluate TemporalPaD using 29 UCI datasets, a well-known benchmark for\nvalidating feature reduction algorithms, through 10 independent tests and\n10-fold cross-validation. Additionally, given that TemporalPaD is specifically\ndesigned for time series data, we apply it to a real-world DNA classification\nproblem involving enhancer category and enhancer strength. The results\ndemonstrate that TemporalPaD is an efficient and effective framework for\nachieving feature reduction, applicable to both structured data and sequence\ndatasets. The source code of the proposed TemporalPaD is freely available as\nsupplementary material to this article and at\nhttp://www.healthinformaticslab.org/supp/.\n","authors":["Xuechen Mu","Zhenyu Huang","Kewei Li","Haotian Zhang","Xiuli Wang","Yusi Fan","Kai Zhang","Fengfeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.18597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18596v1","updated":"2024-09-27T09:56:02Z","published":"2024-09-27T09:56:02Z","title":"ASAG2024: A Combined Benchmark for Short Answer Grading","summary":" Open-ended questions test a more thorough understanding than closed-ended\nquestions and are often a preferred assessment method. However, open-ended\nquestions are tedious to grade and subject to personal bias. Therefore, there\nhave been efforts to speed up the grading process through automation. Short\nAnswer Grading (SAG) systems aim to automatically score students' answers.\nDespite growth in SAG methods and capabilities, there exists no comprehensive\nshort-answer grading benchmark across different subjects, grading scales, and\ndistributions. Thus, it is hard to assess the capabilities of current automated\ngrading methods in terms of their generalizability. In this preliminary work,\nwe introduce the combined ASAG2024 benchmark to facilitate the comparison of\nautomated grading systems. Combining seven commonly used short-answer grading\ndatasets in a common structure and grading scale. For our benchmark, we\nevaluate a set of recent SAG methods, revealing that while LLM-based approaches\nreach new high scores, they still are far from reaching human performance. This\nopens up avenues for future research on human-machine SAG systems.\n","authors":["Gérôme Meyer","Philip Breuer","Jonathan Fürst"],"pdf_url":"https://arxiv.org/pdf/2409.18596v1.pdf","comment":"Accepted at SIGCSE-Virtual 2024"},{"id":"http://arxiv.org/abs/2409.18594v1","updated":"2024-09-27T09:53:48Z","published":"2024-09-27T09:53:48Z","title":"\"Oh LLM, I'm Asking Thee, Please Give Me a Decision Tree\": Zero-Shot\n Decision Tree Induction and Embedding with Large Language Models","summary":" Large language models (LLMs) provide powerful means to leverage prior\nknowledge for predictive modeling when data is limited. In this work, we\ndemonstrate how LLMs can use their compressed world knowledge to generate\nintrinsically interpretable machine learning models, i.e., decision trees,\nwithout any training data. We find that these zero-shot decision trees can\nsurpass data-driven trees on some small-sized tabular datasets and that\nembeddings derived from these trees perform on par with data-driven tree-based\nembeddings on average. Our knowledge-driven decision tree induction and\nembedding approaches therefore serve as strong new baselines for data-driven\nmachine learning methods in the low-data regime.\n","authors":["Ricardo Knauer","Mario Koddenbrock","Raphael Wallsberger","Nicholas M. Brisson","Georg N. Duda","Deborah Falla","David W. Evans","Erik Rodner"],"pdf_url":"https://arxiv.org/pdf/2409.18594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15039v2","updated":"2024-09-27T09:40:34Z","published":"2023-09-26T16:15:54Z","title":"Can-SAVE: Mass Cancer Risk Prediction via Survival Analysis Variables\n and EHR","summary":" Specific medical cancer screening methods are often costly, time-consuming,\nand weakly applicable on a large scale. Advanced Artificial Intelligence (AI)\nmethods greatly help cancer detection but require specific or deep medical\ndata. These aspects prevent the mass implementation of cancer screening\nmethods. For this reason, it is a disruptive change for healthcare to apply AI\nmethods for mass personalized assessment of the cancer risk among patients\nbased on the existing Electronic Health Records (EHR) volume. This paper\npresents a novel Can-SAVE cancer risk assessment method combining a survival\nanalysis approach with a gradient-boosting algorithm. It is highly accessible\nand resource-efficient, utilizing only a sequence of high-level medical events.\nWe tested the proposed method in a long-term retrospective experiment covering\nmore than 1.1 million people and four regions of Russia. The Can-SAVE method\nsignificantly exceeds the baselines by the Average Precision metric of\n22.8%$\\pm$2.7% vs 15.1%$\\pm$2.6%. The extensive ablation study also confirmed\nthe proposed method's dominant performance. The experiment supervised by\noncologists shows a reliable cancer patient detection rate of up to 84 out of\n1000 selected. Such results surpass the medical screening strategies estimates;\nthe typical age-specific Number Needed to Screen is only 9 out of 1000 (for\ncolorectal cancer). Overall, our experiments show a 4.7-6.4 times improvement\nin cancer detection rate (TOP@1k) compared to the traditional healthcare risk\nestimation approach.\n","authors":["Petr Philonenko","Vladimir Kokh","Pavel Blinov"],"pdf_url":"https://arxiv.org/pdf/2309.15039v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18582v1","updated":"2024-09-27T09:37:49Z","published":"2024-09-27T09:37:49Z","title":"Optimistic Games for Combinatorial Bayesian Optimization with\n Application to Protein Design","summary":" Bayesian optimization (BO) is a powerful framework to optimize black-box\nexpensive-to-evaluate functions via sequential interactions. In several\nimportant problems (e.g. drug discovery, circuit design, neural architecture\nsearch, etc.), though, such functions are defined over large\n$\\textit{combinatorial and unstructured}$ spaces. This makes existing BO\nalgorithms not feasible due to the intractable maximization of the acquisition\nfunction over these domains. To address this issue, we propose\n$\\textbf{GameOpt}$, a novel game-theoretical approach to combinatorial BO.\n$\\textbf{GameOpt}$ establishes a cooperative game between the different\noptimization variables, and selects points that are game $\\textit{equilibria}$\nof an upper confidence bound acquisition function. These are stable\nconfigurations from which no variable has an incentive to deviate$-$ analog to\nlocal optima in continuous domains. Crucially, this allows us to efficiently\nbreak down the complexity of the combinatorial domain into individual decision\nsets, making $\\textbf{GameOpt}$ scalable to large combinatorial spaces. We\ndemonstrate the application of $\\textbf{GameOpt}$ to the challenging\n$\\textit{protein design}$ problem and validate its performance on four\nreal-world protein datasets. Each protein can take up to $20^{X}$ possible\nconfigurations, where $X$ is the length of a protein, making standard BO\nmethods infeasible. Instead, our approach iteratively selects informative\nprotein configurations and very quickly discovers highly active protein\nvariants compared to other baselines.\n","authors":["Melis Ilayda Bal","Pier Giuseppe Sessa","Mojmir Mutny","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2409.18582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18581v1","updated":"2024-09-27T09:37:09Z","published":"2024-09-27T09:37:09Z","title":"Using Deep Autoregressive Models as Causal Inference Engines","summary":" Existing causal inference (CI) models are limited to primarily handling\nlow-dimensional confounders and singleton actions. We propose an autoregressive\n(AR) CI framework capable of handling complex confounders and sequential\nactions common in modern applications. We accomplish this by {\\em\nsequencification}, transforming data from an underlying causal diagram into a\nsequence of tokens. This approach not only enables training with data generated\nfrom any DAG but also extends existing CI capabilities to accommodate\nestimating several statistical quantities using a {\\em single} model. We can\ndirectly predict interventional probabilities, simplifying inference and\nenhancing outcome prediction accuracy. We demonstrate that an AR model adapted\nfor CI is efficient and effective in various complex applications such as\nnavigating mazes, playing chess endgames, and evaluating the impact of certain\nkeywords on paper acceptance rates.\n","authors":["Daniel Jiwoong Im","Kevin Zhang","Nakul Verma","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2409.18581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20623v2","updated":"2024-09-27T09:35:05Z","published":"2024-07-30T07:59:28Z","title":"SharkTrack: an accurate, generalisable software for streamlining shark\n and ray underwater video analysis","summary":" Elasmobranchs (shark sand rays) represent a critical component of marine\necosystems. Yet, they are experiencing global population declines and effective\nmonitoring of populations is essential to their protection. Underwater\nstationary videos, such as those from Baited Remote Underwater Video Stations\n(BRUVS), are critical for understanding elasmobranch spatial ecology and\nabundance. However, processing these videos requires time-consuming manual\nanalysis that can delay conservation. To address this challenge, we developed\nSharkTrack, a semi-automatic underwater video analysis software. SharkTrack\nuses Convolutional Neural Networks (CNN) and Multi-Object Tracking to\nautomatically detect and track elasmobranchs and provides an annotation\npipeline to manually classify elasmobranch species and compute species-specific\nMaxN (ssMaxN), the standard metric of relative abundance. When tested on BRUVS\nfootage from locations unseen by the CNN model during training, SharkTrack\ncomputed ssMaxN with 89% accuracy over 207 hours of footage. The semi-automatic\nSharkTrack pipeline required two minutes of manual classification per hour of\nvideo, an estimated 95% reduction of manual analysis time compared to\ntraditional methods. Furthermore, we demonstrate SharkTrack accuracy across\ndiverse marine ecosystems and elasmobranch species, an advancement compared to\nprevious models, which were limited to specific species or locations.\nSharkTrack applications extend beyond BRUVS, facilitating the analysis of any\nunderwater stationary video. By making video analysis faster and more\naccessible, SharkTrack enables research and conservation organisations to\nmonitor elasmobranch populations more efficiently, thereby improving\nconservation efforts. To further support these goals, we provide public access\nto the SharkTrack software.\n","authors":["Filippo Varini","Joel H. Gayford","Jeremy Jenrette","Matthew J. Witt","Francesco Garzon","Francesco Ferretti","Sophie Wilday","Mark E. Bond","Michael R. Heithaus","Danielle Robinson","Devon Carter","Najee Gumbs","Vincent Webster","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2407.20623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18578v1","updated":"2024-09-27T09:28:27Z","published":"2024-09-27T09:28:27Z","title":"An Enhanced Federated Prototype Learning Method under Domain Shift","summary":" Federated Learning (FL) allows collaborative machine learning training\nwithout sharing private data. Numerous studies have shown that one significant\nfactor affecting the performance of federated learning models is the\nheterogeneity of data across different clients, especially when the data is\nsampled from various domains. A recent paper introduces variance-aware\ndual-level prototype clustering and uses a novel $\\alpha$-sparsity prototype\nloss, which increases intra-class similarity and reduces inter-class\nsimilarity. To ensure that the features converge within specific clusters, we\nintroduce an improved algorithm, Federated Prototype Learning with Convergent\nClusters, abbreviated as FedPLCC. To increase inter-class distances, we weight\neach prototype with the size of the cluster it represents. To reduce\nintra-class distances, considering that prototypes with larger distances might\ncome from different domains, we select only a certain proportion of prototypes\nfor the loss function calculation. Evaluations on the Digit-5, Office-10, and\nDomainNet datasets show that our method performs better than existing\napproaches.\n","authors":["Liang Kuang","Kuangpu Guo","Jian Liang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18578v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.18574v1","updated":"2024-09-27T09:18:57Z","published":"2024-09-27T09:18:57Z","title":"Climate Adaptation with Reinforcement Learning: Experiments with\n Flooding and Transportation in Copenhagen","summary":" Due to climate change the frequency and intensity of extreme rainfall events,\nwhich contribute to urban flooding, are expected to increase in many places.\nThese floods can damage transport infrastructure and disrupt mobility,\nhighlighting the need for cities to adapt to escalating risks. Reinforcement\nlearning (RL) serves as a powerful tool for uncovering optimal adaptation\nstrategies, determining how and where to deploy adaptation measures\neffectively, even under significant uncertainty. In this study, we leverage RL\nto identify the most effective timing and locations for implementing measures,\naiming to reduce both direct and indirect impacts of flooding. Our framework\nintegrates climate change projections of future rainfall events and floods,\nmodels city-wide motorized trips, and quantifies direct and indirect impacts on\ninfrastructure and mobility. Preliminary results suggest that our RL-based\napproach can significantly enhance decision-making by prioritizing\ninterventions in specific urban areas and identifying the optimal periods for\ntheir implementation.\n","authors":["Miguel Costa","Morten W. Petersen","Arthur Vandervoort","Martin Drews","Karyn Morrissey","Francisco C. Pereira"],"pdf_url":"https://arxiv.org/pdf/2409.18574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18572v1","updated":"2024-09-27T09:15:44Z","published":"2024-09-27T09:15:44Z","title":"Towards an active-learning approach to resource allocation for\n population-based damage prognosis","summary":" Damage prognosis is, arguably, one of the most difficult tasks of structural\nhealth monitoring (SHM). To address common problems of damage prognosis, a\npopulation-based SHM (PBSHM) approach is adopted in the current work. In this\napproach the prognosis problem is considered as an information-sharing problem\nwhere data from past structures are exploited to make more accurate inferences\nregarding currently-degrading structures. For a given population, there may\nexist restrictions on the resources available to conduct monitoring; thus, the\ncurrent work studies the problem of allocating such resources within a\npopulation of degrading structures with a view to maximising the\ndamage-prognosis accuracy. The challenges of the current framework are mainly\nassociated with the inference of outliers on the level of damage evolution,\ngiven partial data from the damage-evolution phenomenon. The current approach\nconsiders an initial population of structures for which damage evolution is\nextensively observed. Subsequently, a second population of structures with\nevolving damage is considered for which two monitoring systems are available, a\nlow-availability and high-fidelity (low-uncertainty) one, and a\nwidely-available and low-fidelity (high-uncertainty) one. The task of the\ncurrent work is to follow an active-learning approach to identify the\nstructures to which the high-fidelity system should be assigned in order to\nenhance the predictive capabilities of the machine-learning model throughout\nthe population.\n","authors":["George Tsialiamanis","Keith Worden","Nikolaos Dervilis","Aidan J Hughes"],"pdf_url":"https://arxiv.org/pdf/2409.18572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18568v1","updated":"2024-09-27T09:11:52Z","published":"2024-09-27T09:11:52Z","title":"Experimental Evaluation of Machine Learning Models for Goal-oriented\n Customer Service Chatbot with Pipeline Architecture","summary":" Integrating machine learning (ML) into customer service chatbots enhances\ntheir ability to understand and respond to user queries, ultimately improving\nservice performance. However, they may appear artificial to some users and\naffecting customer experience. Hence, meticulous evaluation of ML models for\neach pipeline component is crucial for optimizing performance, though\ndifferences in functionalities can lead to unfair comparisons. In this paper,\nwe present a tailored experimental evaluation approach for goal-oriented\ncustomer service chatbots with pipeline architecture, focusing on three key\ncomponents: Natural Language Understanding (NLU), dialogue management (DM), and\nNatural Language Generation (NLG). Our methodology emphasizes individual\nassessment to determine optimal ML models. Specifically, we focus on optimizing\nhyperparameters and evaluating candidate models for NLU (utilizing BERT and\nLSTM), DM (employing DQN and DDQN), and NLG (leveraging GPT-2 and DialoGPT).\nThe results show that for the NLU component, BERT excelled in intent detection\nwhereas LSTM was superior for slot filling. For the DM component, the DDQN\nmodel outperformed DQN by achieving fewer turns, higher rewards, as well as\ngreater success rates. For NLG, the large language model GPT-2 surpassed\nDialoGPT in BLEU, METEOR, and ROUGE metrics. These findings aim to provide a\nbenchmark for future research in developing and optimizing customer service\nchatbots, offering valuable insights into model performance and optimal\nhyperparameters.\n","authors":["Nurul Ain Nabilah Mohd Isa","Siti Nuraishah Agos Jawaddi","Azlan Ismail"],"pdf_url":"https://arxiv.org/pdf/2409.18568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18566v1","updated":"2024-09-27T09:10:44Z","published":"2024-09-27T09:10:44Z","title":"Optimizing DNN Inference on Multi-Accelerator SoCs at Training-time","summary":" The demand for executing Deep Neural Networks (DNNs) with low latency and\nminimal power consumption at the edge has led to the development of advanced\nheterogeneous Systems-on-Chips (SoCs) that incorporate multiple specialized\ncomputing units (CUs), such as accelerators. Offloading DNN computations to a\nspecific CU from the available set often exposes accuracy vs efficiency\ntrade-offs, due to differences in their supported operations (e.g., standard\nvs. depthwise convolution) or data representations (e.g., more/less\naggressively quantized). A challenging yet unresolved issue is how to map a DNN\nonto these multi-CU systems to maximally exploit the parallelization\npossibilities while taking accuracy into account. To address this problem, we\npresent ODiMO, a hardware-aware tool that efficiently explores fine-grain\nmapping of DNNs among various on-chip CUs, during the training phase. ODiMO\nstrategically splits individual layers of the neural network and executes them\nin parallel on the multiple available CUs, aiming to balance the total\ninference energy consumption or latency with the resulting accuracy, impacted\nby the unique features of the different hardware units. We test our approach on\nCIFAR-10, CIFAR-100, and ImageNet, targeting two open-source heterogeneous\nSoCs, i.e., DIANA and Darkside. We obtain a rich collection of Pareto-optimal\nnetworks in the accuracy vs. energy or latency space. We show that ODiMO\nreduces the latency of a DNN executed on the Darkside SoC by up to 8x at\niso-accuracy, compared to manual heuristic mappings. When targeting energy, on\nthe same SoC, ODiMO produced up to 50.8x more efficient mappings, with minimal\naccuracy drop (< 0.3%).\n","authors":["Matteo Risso","Alessio Burrello","Daniele Jahier Pagliari"],"pdf_url":"https://arxiv.org/pdf/2409.18566v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2403.19181v3","updated":"2024-09-27T08:57:34Z","published":"2024-03-28T07:22:16Z","title":"Make Large Language Model a Better Ranker","summary":" Large Language Models (LLMs) demonstrate robust capabilities across various\nfields, leading to a paradigm shift in LLM-enhanced Recommender System (RS).\nResearch to date focuses on point-wise and pair-wise recommendation paradigms,\nwhich are inefficient for LLM-based recommenders due to high computational\ncosts. However, existing list-wise approaches also fall short in ranking tasks\ndue to misalignment between ranking objectives and next-token prediction.\nMoreover, these LLM-based methods struggle to effectively address the order\nrelation among candidates, particularly given the scale of ratings. To address\nthese challenges, this paper introduces the large language model framework with\nAligned Listwise Ranking Objectives (ALRO). ALRO is designed to bridge the gap\nbetween the capabilities of LLMs and the nuanced requirements of ranking tasks.\nSpecifically, ALRO employs explicit feedback in a listwise manner by\nintroducing soft lambda loss, a customized adaptation of lambda loss designed\nfor optimizing order relations. This mechanism provides more accurate\noptimization goals, enhancing the ranking process. Additionally, ALRO\nincorporates a permutation-sensitive learning mechanism that addresses position\nbias, a prevalent issue in generative models, without imposing additional\ncomputational burdens during inference. Our evaluative studies reveal that ALRO\noutperforms both existing embedding-based recommendation methods and LLM-based\nrecommendation baselines.\n","authors":["Wen-Shuo Chao","Zhi Zheng","Hengshu Zhu","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19181v3.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18556v1","updated":"2024-09-27T08:53:17Z","published":"2024-09-27T08:53:17Z","title":"CodeSCAN: ScreenCast ANalysis for Video Programming Tutorials","summary":" Programming tutorials in the form of coding screencasts play a crucial role\nin programming education, serving both novices and experienced developers.\nHowever, the video format of these tutorials presents a challenge due to the\ndifficulty of searching for and within videos. Addressing the absence of\nlarge-scale and diverse datasets for screencast analysis, we introduce the\nCodeSCAN dataset. It comprises 12,000 screenshots captured from the Visual\nStudio Code environment during development, featuring 24 programming languages,\n25 fonts, and over 90 distinct themes, in addition to diverse layout changes\nand realistic user interactions. Moreover, we conduct detailed quantitative and\nqualitative evaluations to benchmark the performance of Integrated Development\nEnvironment (IDE) element detection, color-to-black-and-white conversion, and\nOptical Character Recognition (OCR). We hope that our contributions facilitate\nmore research in coding screencast analysis, and we make the source code for\ncreating the dataset and the benchmark publicly available on this website.\n","authors":["Alexander Naumann","Felix Hertlein","Jacqueline Höllig","Lucas Cazzonelli","Steffen Thoma"],"pdf_url":"https://arxiv.org/pdf/2409.18556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03094v4","updated":"2024-09-27T08:50:10Z","published":"2024-02-05T15:25:32Z","title":"Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object\n Detector","summary":" This paper studies the challenging cross-domain few-shot object detection\n(CD-FSOD), aiming to develop an accurate object detector for novel domains with\nminimal labeled examples. While transformer-based open-set detectors, such as\nDE-ViT, show promise in traditional few-shot object detection, their\ngeneralization to CD-FSOD remains unclear: 1) can such open-set detection\nmethods easily generalize to CD-FSOD? 2) If not, how can models be enhanced\nwhen facing huge domain gaps? To answer the first question, we employ measures\nincluding style, inter-class variance (ICV), and indefinable boundaries (IB) to\nunderstand the domain gap. Based on these measures, we establish a new\nbenchmark named CD-FSOD to evaluate object detection methods, revealing that\nmost of the current approaches fail to generalize across domains. Technically,\nwe observe that the performance decline is associated with our proposed\nmeasures: style, ICV, and IB. Consequently, we propose several novel modules to\naddress these issues. First, the learnable instance features align initial\nfixed instances with target categories, enhancing feature distinctiveness.\nSecond, the instance reweighting module assigns higher importance to\nhigh-quality instances with slight IB. Third, the domain prompter encourages\nfeatures resilient to different styles by synthesizing imaginary domains\nwithout altering semantic contents. These techniques collectively contribute to\nthe development of the Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO),\nsignificantly improving upon the base DE-ViT. Experimental results validate the\nefficacy of our model.\n","authors":["Yuqian Fu","Yu Wang","Yixuan Pan","Lian Huai","Xingyu Qiu","Zeyu Shangguan","Tong Liu","Yanwei Fu","Luc Van Gool","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.03094v4.pdf","comment":"Accepted by ECCV2024 (project website:\n http://yuqianfu.com/CDFSOD-benchmark)"},{"id":"http://arxiv.org/abs/2409.18553v1","updated":"2024-09-27T08:45:55Z","published":"2024-09-27T08:45:55Z","title":"Efficient Noise Mitigation for Enhancing Inference Accuracy in DNNs on\n Mixed-Signal Accelerators","summary":" In this paper, we propose a framework to enhance the robustness of the neural\nmodels by mitigating the effects of process-induced and aging-related\nvariations of analog computing components on the accuracy of the analog neural\nnetworks. We model these variations as the noise affecting the precision of the\nactivations and introduce a denoising block inserted between selected layers of\na pre-trained model. We demonstrate that training the denoising block\nsignificantly increases the model's robustness against various noise levels. To\nminimize the overhead associated with adding these blocks, we present an\nexploration algorithm to identify optimal insertion points for the denoising\nblocks. Additionally, we propose a specialized architecture to efficiently\nexecute the denoising blocks, which can be integrated into mixed-signal\naccelerators. We evaluate the effectiveness of our approach using Deep Neural\nNetwork (DNN) models trained on the ImageNet and CIFAR-10 datasets. The results\nshow that on average, by accepting 2.03% parameter count overhead, the accuracy\ndrop due to the variations reduces from 31.7% to 1.15%.\n","authors":["Seyedarmin Azizi","Mohammad Erfan Sadeghi","Mehdi Kamal","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2409.18553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15564v2","updated":"2024-09-27T08:40:26Z","published":"2024-09-23T21:38:49Z","title":"CauSkelNet: Causal Representation Learning for Human Behaviour Analysis","summary":" Constrained by the lack of model interpretability and a deep understanding of\nhuman movement in traditional movement recognition machine learning methods,\nthis study introduces a novel representation learning method based on causal\ninference to better understand human joint dynamics and complex behaviors. We\npropose a two-stage framework that combines the Peter-Clark (PC) algorithm and\nKullback-Leibler (KL) divergence to identify and quantify causal relationships\nbetween joints. Our method effectively captures interactions and produces\ninterpretable, robust representations. Experiments on the EmoPain dataset show\nthat our causal GCN outperforms traditional GCNs in accuracy, F1 score, and\nrecall, especially in detecting protective behaviors. The model is also highly\ninvariant to data scale changes, enhancing its reliability in practical\napplications. Our approach advances human motion analysis and paves the way for\nmore adaptive intelligent healthcare solutions.\n","authors":["Xingrui Gu","Chuyi Jiang","Erte Wang","Zekun Wu","Qiang Cui","Leimin Tian","Lianlong Wu","Siyang Song","Chuang Yu"],"pdf_url":"https://arxiv.org/pdf/2409.15564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06996v3","updated":"2024-09-27T08:33:31Z","published":"2022-10-13T13:04:16Z","title":"DICTDIS: Dictionary Constrained Disambiguation for Improved NMT","summary":" Domain-specific neural machine translation (NMT) systems (e.g., in\neducational applications) are socially significant with the potential to help\nmake information accessible to a diverse set of users in multilingual\nsocieties. It is desirable that such NMT systems be lexically constrained and\ndraw from domain-specific dictionaries. Dictionaries could present multiple\ncandidate translations for a source word/phrase due to the polysemous nature of\nwords. The onus is then on the NMT model to choose the contextually most\nappropriate candidate. Prior work has largely ignored this problem and focused\non the single candidate constraint setting wherein the target word or phrase is\nreplaced by a single constraint. In this work we present DictDis, a lexically\nconstrained NMT system that disambiguates between multiple candidate\ntranslations derived from dictionaries. We achieve this by augmenting training\ndata with multiple dictionary candidates to actively encourage disambiguation\nduring training by implicitly aligning multiple candidate constraints. We\ndemonstrate the utility of DictDis via extensive experiments on English-Hindi\nand English-German sentences in a variety of domains including regulatory,\nfinance, engineering. We also present comparisons on standard benchmark test\ndatasets. In comparison with existing approaches for lexically constrained and\nunconstrained NMT, we demonstrate superior performance with respect to\nconstraint copy and disambiguation related measures on all domains while also\nobtaining improved fluency of up to 2-3 BLEU points on some domains.\n","authors":["Ayush Maheshwari","Preethi Jyothi","Ganesh Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2210.06996v3.pdf","comment":"In Findings of EMNLP, 2024"},{"id":"http://arxiv.org/abs/2409.18544v1","updated":"2024-09-27T08:25:27Z","published":"2024-09-27T08:25:27Z","title":"Wasserstein Distance-Weighted Adversarial Network for Cross-Domain\n Credit Risk Assessment","summary":" This paper delves into the application of adversarial domain adaptation (ADA)\nfor enhancing credit risk assessment in financial institutions. It addresses\ntwo critical challenges: the cold start problem, where historical lending data\nis scarce, and the data imbalance issue, where high-risk transactions are\nunderrepresented. The paper introduces an improved ADA framework, the\nWasserstein Distance Weighted Adversarial Domain Adaptation Network (WD-WADA),\nwhich leverages the Wasserstein distance to align source and target domains\neffectively. The proposed method includes an innovative weighted strategy to\ntackle data imbalance, adjusting for both the class distribution and the\ndifficulty level of predictions. The paper demonstrates that WD-WADA not only\nmitigates the cold start problem but also provides a more accurate measure of\ndomain differences, leading to improved cross-domain credit risk assessment.\nExtensive experiments on real-world credit datasets validate the model's\neffectiveness, showcasing superior performance in cross-domain learning,\nclassification accuracy, and model stability compared to traditional methods.\n","authors":["Mohan Jiang","Jiating Lin","Hongju Ouyang","Jingming Pan","Siyuan Han","Bingyao Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18529v1","updated":"2024-09-27T08:11:49Z","published":"2024-09-27T08:11:49Z","title":"Robustness of AI-based weather forecasts in a changing climate","summary":" Data-driven machine learning models for weather forecasting have made\ntransformational progress in the last 1-2 years, with state-of-the-art ones now\noutperforming the best physics-based models for a wide range of skill scores.\nGiven the strong links between weather and climate modelling, this raises the\nquestion whether machine learning models could also revolutionize climate\nscience, for example by informing mitigation and adaptation to climate change\nor to generate larger ensembles for more robust uncertainty estimates. Here, we\nshow that current state-of-the-art machine learning models trained for weather\nforecasting in present-day climate produce skillful forecasts across different\nclimate states corresponding to pre-industrial, present-day, and future 2.9K\nwarmer climates. This indicates that the dynamics shaping the weather on short\ntimescales may not differ fundamentally in a changing climate. It also\ndemonstrates out-of-distribution generalization capabilities of the machine\nlearning models that are a critical prerequisite for climate applications.\nNonetheless, two of the models show a global-mean cold bias in the forecasts\nfor the future warmer climate state, i.e. they drift towards the colder\npresent-day climate they have been trained for. A similar result is obtained\nfor the pre-industrial case where two out of three models show a warming. We\ndiscuss possible remedies for these biases and analyze their spatial\ndistribution, revealing complex warming and cooling patterns that are partly\nrelated to missing ocean-sea ice and land surface information in the training\ndata. Despite these current limitations, our results suggest that data-driven\nmachine learning models will provide powerful tools for climate science and\ntransform established approaches by complementing conventional physics-based\nmodels.\n","authors":["Thomas Rackow","Nikolay Koldunov","Christian Lessig","Irina Sandu","Mihai Alexe","Matthew Chantry","Mariana Clare","Jesper Dramsch","Florian Pappenberger","Xabier Pedruzo-Bagazgoitia","Steffen Tietsche","Thomas Jung"],"pdf_url":"https://arxiv.org/pdf/2409.18529v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18523v1","updated":"2024-09-27T08:05:34Z","published":"2024-09-27T08:05:34Z","title":"Token Caching for Diffusion Transformer Acceleration","summary":" Diffusion transformers have gained substantial interest in diffusion\ngenerative modeling due to their outstanding performance. However, their high\ncomputational cost, arising from the quadratic computational complexity of\nattention mechanisms and multi-step inference, presents a significant\nbottleneck. To address this challenge, we propose TokenCache, a novel\npost-training acceleration method that leverages the token-based multi-block\narchitecture of transformers to reduce redundant computations among tokens\nacross inference steps. TokenCache specifically addresses three critical\nquestions in the context of diffusion transformers: (1) which tokens should be\npruned to eliminate redundancy, (2) which blocks should be targeted for\nefficient pruning, and (3) at which time steps caching should be applied to\nbalance speed and quality. In response to these challenges, TokenCache\nintroduces a Cache Predictor that assigns importance scores to tokens, enabling\nselective pruning without compromising model performance. Furthermore, we\npropose an adaptive block selection strategy to focus on blocks with minimal\nimpact on the network's output, along with a Two-Phase Round-Robin (TPRR)\nscheduling policy to optimize caching intervals throughout the denoising\nprocess. Experimental results across various models demonstrate that TokenCache\nachieves an effective trade-off between generation quality and inference speed\nfor diffusion transformers. Our code will be publicly available.\n","authors":["Jinming Lou","Wenyang Luo","Yufan Liu","Bing Li","Xinmiao Ding","Weiming Hu","Jiajiong Cao","Yuming Li","Chenguang Ma"],"pdf_url":"https://arxiv.org/pdf/2409.18523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10512v2","updated":"2024-09-27T08:00:07Z","published":"2024-05-17T03:32:15Z","title":"In-context Contrastive Learning for Event Causality Identification","summary":" Event Causality Identification (ECI) aims at determining the existence of a\ncausal relation between two events. Although recent prompt learning-based\napproaches have shown promising improvements on the ECI task, their performance\nare often subject to the delicate design of multiple prompts and the positive\ncorrelations between the main task and derivate tasks. The in-context learning\nparadigm provides explicit guidance for label prediction in the prompt learning\nparadigm, alleviating its reliance on complex prompts and derivative tasks.\nHowever, it does not distinguish between positive and negative demonstrations\nfor analogy learning. Motivated from such considerations, this paper proposes\nan In-Context Contrastive Learning (ICCL) model that utilizes contrastive\nlearning to enhance the effectiveness of both positive and negative\ndemonstrations. Additionally, we apply contrastive learning to event pairs to\nbetter facilitate event causality identification. Our ICCL is evaluated on the\nwidely used corpora, including the EventStoryLine and Causal-TimeBank, and\nresults show significant performance improvements over the state-of-the-art\nalgorithms.\n","authors":["Chao Liang","Wei Xiang","Bang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.10512v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18506v1","updated":"2024-09-27T07:44:07Z","published":"2024-09-27T07:44:07Z","title":"Med-IC: Fusing a Single Layer Involution with Convolutions for Enhanced\n Medical Image Classification and Segmentation","summary":" The majority of medical images, especially those that resemble cells, have\nsimilar characteristics. These images, which occur in a variety of shapes,\noften show abnormalities in the organ or cell region. The convolution operation\npossesses a restricted capability to extract visual patterns across several\nspatial regions of an image. The involution process, which is the inverse\noperation of convolution, complements this inherent lack of spatial information\nextraction present in convolutions. In this study, we investigate how applying\na single layer of involution prior to a convolutional neural network (CNN)\narchitecture can significantly improve classification and segmentation\nperformance, with a comparatively negligible amount of weight parameters. The\nstudy additionally shows how excessive use of involution layers might result in\ninaccurate predictions in a particular type of medical image. According to our\nfindings from experiments, the strategy of adding only a single involution\nlayer before a CNN-based model outperforms most of the previous works.\n","authors":["Md. Farhadul Islam","Sarah Zabeen","Meem Arafat Manab","Mohammad Rakibul Hasan Mahin","Joyanta Jyoti Mondal","Md. Tanzim Reza","Md Zahidul Hasan","Munima Haque","Farig Sadeque","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2409.18506v1.pdf","comment":"13 pages, 5 figures, 4 tables, preprint submitted to an Elsevier\n journal"},{"id":"http://arxiv.org/abs/2409.18504v1","updated":"2024-09-27T07:38:47Z","published":"2024-09-27T07:38:47Z","title":"WHOMP: Optimizing Randomized Controlled Trials via Wasserstein\n Homogeneity","summary":" We investigate methods for partitioning datasets into subgroups that maximize\ndiversity within each subgroup while minimizing dissimilarity across subgroups.\nWe introduce a novel partitioning method called the $\\textit{Wasserstein\nHomogeneity Partition}$ (WHOMP), which optimally minimizes type I and type II\nerrors that often result from imbalanced group splitting or partitioning,\ncommonly referred to as accidental bias, in comparative and controlled trials.\nWe conduct an analytical comparison of WHOMP against existing partitioning\nmethods, such as random subsampling, covariate-adaptive randomization,\nrerandomization, and anti-clustering, demonstrating its advantages. Moreover,\nwe characterize the optimal solutions to the WHOMP problem and reveal an\ninherent trade-off between the stability of subgroup means and variances among\nthese solutions. Based on our theoretical insights, we design algorithms that\nnot only obtain these optimal solutions but also equip practitioners with tools\nto select the desired trade-off. Finally, we validate the effectiveness of\nWHOMP through numerical experiments, highlighting its superiority over\ntraditional methods.\n","authors":["Shizhou Xu","Thomas Strohmer"],"pdf_url":"https://arxiv.org/pdf/2409.18504v1.pdf","comment":"46 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.18499v1","updated":"2024-09-27T07:32:42Z","published":"2024-09-27T07:32:42Z","title":"Fairness-aware Multiobjective Evolutionary Learning","summary":" Multiobjective evolutionary learning (MOEL) has demonstrated its advantages\nof training fairer machine learning models considering a predefined set of\nconflicting objectives, including accuracy and different fairness measures.\nRecent works propose to construct a representative subset of fairness measures\nas optimisation objectives of MOEL throughout model training. However, the\ndetermination of a representative measure set relies on dataset, prior\nknowledge and requires substantial computational costs. What's more, those\nrepresentative measures may differ across different model training processes.\nInstead of using a static predefined set determined before model training, this\npaper proposes to dynamically and adaptively determine a representative measure\nset online during model training. The dynamically determined representative set\nis then used as optimising objectives of the MOEL framework and can vary with\ntime. Extensive experimental results on 12 well-known benchmark datasets\ndemonstrate that our proposed framework achieves outstanding performance\ncompared to state-of-the-art approaches for mitigating unfairness in terms of\naccuracy as well as 25 fairness measures although only a few of them were\ndynamically selected and used as optimisation objectives. The results indicate\nthe importance of setting optimisation objectives dynamically during training.\n","authors":["Qingquan Zhang","Jialin Liu","Xin Yao"],"pdf_url":"https://arxiv.org/pdf/2409.18499v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2310.07745v3","updated":"2024-09-27T07:26:27Z","published":"2023-10-11T16:24:14Z","title":"Deep Reinforcement Learning for Autonomous Cyber Defence: A Survey","summary":" The rapid increase in the number of cyber-attacks in recent years raises the\nneed for principled methods for defending networks against malicious actors.\nDeep reinforcement learning (DRL) has emerged as a promising approach for\nmitigating these attacks. However, while DRL has shown much potential for cyber\ndefence, numerous challenges must be overcome before DRL can be applied to the\nautonomous cyber defence (ACD) problem at scale. Principled methods are\nrequired for environments that confront learners with very high-dimensional\nstate spaces, large multi-discrete action spaces, and adversarial learning.\nRecent works have reported success in solving these problems individually.\nThere have also been impressive engineering efforts towards solving all three\nfor real-time strategy games. However, applying DRL to the full ACD problem\nremains an open challenge. Here, we survey the relevant DRL literature and\nconceptualize an idealised ACD-DRL agent. We provide: i.) A summary of the\ndomain properties that define the ACD problem; ii.) A comprehensive comparison\nof current ACD environments used for benchmarking DRL approaches; iii.) An\noverview of state-of-the-art approaches for scaling DRL to domains that\nconfront learners with the curse of dimensionality, and; iv.) A survey and\ncritique of current methods for limiting the exploitability of agents within\nadversarial settings from the perspective of ACD. We conclude with open\nresearch questions that we hope will motivate future directions for researchers\nand practitioners working on ACD.\n","authors":["Gregory Palmer","Chris Parry","Daniel J. B. Harrold","Chris Willis"],"pdf_url":"https://arxiv.org/pdf/2310.07745v3.pdf","comment":"89 pages, 14 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.00871v2","updated":"2024-09-27T07:18:39Z","published":"2024-05-01T21:11:29Z","title":"Learning to Boost the Performance of Stable Nonlinear Systems","summary":" The growing scale and complexity of safety-critical control systems\nunderscore the need to evolve current control architectures aiming for the\nunparalleled performances achievable through state-of-the-art optimization and\nmachine learning algorithms. However, maintaining closed-loop stability while\nboosting the performance of nonlinear control systems using data-driven and\ndeep-learning approaches stands as an important unsolved challenge. In this\npaper, we tackle the performance-boosting problem with closed-loop stability\nguarantees. Specifically, we establish a synergy between the Internal Model\nControl (IMC) principle for nonlinear systems and state-of-the-art\nunconstrained optimization approaches for learning stable dynamics. Our methods\nenable learning over arbitrarily deep neural network classes of\nperformance-boosting controllers for stable nonlinear systems; crucially, we\nguarantee L_p closed-loop stability even if optimization is halted prematurely,\nand even when the ground-truth dynamics are unknown, with vanishing\nconservatism in the class of stabilizing policies as the model uncertainty is\nreduced to zero. We discuss the implementation details of the proposed control\nschemes, including distributed ones, along with the corresponding optimization\nprocedures, demonstrating the potential of freely shaping the cost functions\nthrough several numerical experiments.\n","authors":["Luca Furieri","Clara Lucía Galimberti","Giancarlo Ferrari-Trecate"],"pdf_url":"https://arxiv.org/pdf/2405.00871v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18491v1","updated":"2024-09-27T07:09:40Z","published":"2024-09-27T07:09:40Z","title":"Treating Brain-inspired Memories as Priors for Diffusion Model to\n Forecast Multivariate Time Series","summary":" Forecasting Multivariate Time Series (MTS) involves significant challenges in\nvarious application domains. One immediate challenge is modeling temporal\npatterns with the finite length of the input. These temporal patterns usually\ninvolve periodic and sudden events that recur across different channels. To\nbetter capture temporal patterns, we get inspiration from humans' memory\nmechanisms and propose a channel-shared, brain-inspired memory module for MTS.\nSpecifically, brain-inspired memory comprises semantic and episodic memory,\nwhere the former is used to capture general patterns, such as periodic events,\nand the latter is employed to capture special patterns, such as sudden events,\nrespectively. Meanwhile, we design corresponding recall and update mechanisms\nto better utilize these patterns. Furthermore, acknowledging the capacity of\ndiffusion models to leverage memory as a prior, we present a brain-inspired\nmemory-augmented diffusion model. This innovative model retrieves relevant\nmemories for different channels, utilizing them as distinct priors for MTS\npredictions. This incorporation significantly enhances the accuracy and\nrobustness of predictions. Experimental results on eight datasets consistently\nvalidate the superiority of our approach in capturing and leveraging diverse\nrecurrent temporal patterns across different channels.\n","authors":["Muyao Wang","Wenchao Chen","Zhibin Duan","Bo Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15295v2","updated":"2024-09-27T07:05:47Z","published":"2024-09-04T08:53:24Z","title":"Reservoir Static Property Estimation Using Nearest-Neighbor Neural\n Network","summary":" This note presents an approach for estimating the spatial distribution of\nstatic properties in reservoir modeling using a nearest-neighbor neural\nnetwork. The method leverages the strengths of neural networks in approximating\ncomplex, non-linear functions, particularly for tasks involving spatial\ninterpolation. It incorporates a nearest-neighbor algorithm to capture local\nspatial relationships between data points and introduces randomization to\nquantify the uncertainty inherent in the interpolation process. This approach\naddresses the limitations of traditional geostatistical methods, such as\nInverse Distance Weighting (IDW) and Kriging, which often fail to model the\ncomplex non-linear dependencies in reservoir data. By integrating spatial\nproximity and uncertainty quantification, the proposed method can improve the\naccuracy of static property predictions like porosity and permeability.\n","authors":["Yuhe Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15295v2.pdf","comment":"6 pages, 3 figures; updated to tex source"},{"id":"http://arxiv.org/abs/2408.07841v2","updated":"2024-09-27T07:02:12Z","published":"2024-08-14T22:43:52Z","title":"SustainDC -- Benchmarking for Sustainable Data Center Control","summary":" Machine learning has driven an exponential increase in computational demand,\nleading to massive data centers that consume significant amounts of energy and\ncontribute to climate change. This makes sustainable data center control a\npriority. In this paper, we introduce SustainDC, a set of Python environments\nfor benchmarking multi-agent reinforcement learning (MARL) algorithms for data\ncenters (DC). SustainDC supports custom DC configurations and tasks such as\nworkload scheduling, cooling optimization, and auxiliary battery management,\nwith multiple agents managing these operations while accounting for the effects\nof each other. We evaluate various MARL algorithms on SustainDC, showing their\nperformance across diverse DC designs, locations, weather conditions, grid\ncarbon intensity, and workload requirements. Our results highlight significant\nopportunities for improvement of data center operations using MARL algorithms.\nGiven the increasing use of DC due to AI, SustainDC provides a crucial platform\nfor the development and benchmarking of advanced algorithms essential for\nachieving sustainable computing and addressing other heterogeneous real-world\nchallenges.\n","authors":["Avisek Naug","Antonio Guillen","Ricardo Luna","Vineet Gundecha","Desik Rengarajan","Sahand Ghorbanpour","Sajad Mousavi","Ashwin Ramesh Babu","Dejan Markovikj","Lekhapriya D Kashyap","Soumyendu Sarkar"],"pdf_url":"https://arxiv.org/pdf/2408.07841v2.pdf","comment":"Under review at Advances in Neural Information Processing Systems\n 2024 (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2409.18482v1","updated":"2024-09-27T06:51:11Z","published":"2024-09-27T06:51:11Z","title":"HSTFL: A Heterogeneous Federated Learning Framework for Misaligned\n Spatiotemporal Forecasting","summary":" Spatiotemporal forecasting has emerged as an indispensable building block of\ndiverse smart city applications, such as intelligent transportation and smart\nenergy management. Recent advancements have uncovered that the performance of\nspatiotemporal forecasting can be significantly improved by integrating\nknowledge in geo-distributed time series data from different domains, \\eg\nenhancing real-estate appraisal with human mobility data; joint taxi and bike\ndemand predictions. While effective, existing approaches assume a centralized\ndata collection and exploitation environment, overlooking the privacy and\ncommercial interest concerns associated with data owned by different parties.\nIn this paper, we investigate multi-party collaborative spatiotemporal\nforecasting without direct access to multi-source private data. However, this\ntask is challenging due to 1) cross-domain feature heterogeneity and 2)\ncross-client geographical heterogeneity, where standard horizontal or vertical\nfederated learning is inapplicable. To this end, we propose a Heterogeneous\nSpatioTemporal Federated Learning (HSTFL) framework to enable multiple clients\nto collaboratively harness geo-distributed time series data from different\ndomains while preserving privacy. Specifically, we first devise vertical\nfederated spatiotemporal representation learning to locally preserve\nspatiotemporal dependencies among individual participants and generate\neffective representations for heterogeneous data. Then we propose a\ncross-client virtual node alignment block to incorporate cross-client\nspatiotemporal dependencies via a multi-level knowledge fusion scheme.\nExtensive privacy analysis and experimental evaluations demonstrate that HSTFL\nnot only effectively resists inference attacks but also provides a significant\nimprovement against various baselines.\n","authors":["Shuowei Cai","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18482v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2406.05405v2","updated":"2024-09-27T06:45:13Z","published":"2024-06-08T08:56:47Z","title":"Robust Conformal Prediction Using Privileged Information","summary":" We develop a method to generate prediction sets with a guaranteed coverage\nrate that is robust to corruptions in the training data, such as missing or\nnoisy variables. Our approach builds on conformal prediction, a powerful\nframework to construct prediction sets that are valid under the i.i.d\nassumption. Importantly, naively applying conformal prediction does not provide\nreliable predictions in this setting, due to the distribution shift induced by\nthe corruptions. To account for the distribution shift, we assume access to\nprivileged information (PI). The PI is formulated as additional features that\nexplain the distribution shift, however, they are only available during\ntraining and absent at test time. We approach this problem by introducing a\nnovel generalization of weighted conformal prediction and support our method\nwith theoretical coverage guarantees. Empirical experiments on both real and\nsynthetic datasets indicate that our approach achieves a valid coverage rate\nand constructs more informative predictions compared to existing methods, which\nare not supported by theoretical guarantees.\n","authors":["Shai Feldman","Yaniv Romano"],"pdf_url":"https://arxiv.org/pdf/2406.05405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18481v1","updated":"2024-09-27T06:43:06Z","published":"2024-09-27T06:43:06Z","title":"Deep Heterogeneous Contrastive Hyper-Graph Learning for In-the-Wild\n Context-Aware Human Activity Recognition","summary":" Human Activity Recognition (HAR) is a challenging, multi-label classification\nproblem as activities may co-occur and sensor signals corresponding to the same\nactivity may vary in different contexts (e.g., different device placements).\nThis paper proposes a Deep Heterogeneous Contrastive Hyper-Graph Learning\n(DHC-HGL) framework that captures heterogenous Context-Aware HAR (CA-HAR)\nhypergraph properties in a message-passing and neighborhood-aggregation\nfashion. Prior work only explored homogeneous or shallow-node-heterogeneous\ngraphs. DHC-HGL handles heterogeneous CA-HAR data by innovatively 1)\nConstructing three different types of sub-hypergraphs that are each passed\nthrough different custom HyperGraph Convolution (HGC) layers designed to handle\nedge-heterogeneity and 2) Adopting a contrastive loss function to ensure\nnode-heterogeneity. In rigorous evaluation on two CA-HAR datasets, DHC-HGL\nsignificantly outperformed state-of-the-art baselines by 5.8% to 16.7% on\nMatthews Correlation Coefficient (MCC) and 3.0% to 8.4% on Macro F1 scores.\nUMAP visualizations of learned CA-HAR node embeddings are also presented to\nenhance model explainability.\n","authors":["Wen Ge","Guanyi Mou","Emmanuel O. Agu","Kyumin Lee"],"pdf_url":"https://arxiv.org/pdf/2409.18481v1.pdf","comment":"IMWUT 2023"},{"id":"http://arxiv.org/abs/2409.18479v1","updated":"2024-09-27T06:42:22Z","published":"2024-09-27T06:42:22Z","title":"CycleNet: Enhancing Time Series Forecasting through Modeling Periodic\n Patterns","summary":" The stable periodic patterns present in time series data serve as the\nfoundation for conducting long-horizon forecasts. In this paper, we pioneer the\nexploration of explicitly modeling this periodicity to enhance the performance\nof models in long-term time series forecasting (LTSF) tasks. Specifically, we\nintroduce the Residual Cycle Forecasting (RCF) technique, which utilizes\nlearnable recurrent cycles to model the inherent periodic patterns within\nsequences, and then performs predictions on the residual components of the\nmodeled cycles. Combining RCF with a Linear layer or a shallow MLP forms the\nsimple yet powerful method proposed in this paper, called CycleNet. CycleNet\nachieves state-of-the-art prediction accuracy in multiple domains including\nelectricity, weather, and energy, while offering significant efficiency\nadvantages by reducing over 90% of the required parameter quantity.\nFurthermore, as a novel plug-and-play technique, the RCF can also significantly\nimprove the prediction accuracy of existing models, including PatchTST and\niTransformer. The source code is available at:\nhttps://github.com/ACAT-SCUT/CycleNet.\n","authors":["Shengsheng Lin","Weiwei Lin","Xinyi Hu","Wentai Wu","Ruichao Mo","Haocheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.18479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14206v2","updated":"2024-09-27T06:37:00Z","published":"2024-08-26T12:09:38Z","title":"Lemon and Orange Disease Classification using CNN-Extracted Features and\n Machine Learning Classifier","summary":" Lemons and oranges, both are the most economically significant citrus fruits\nglobally. The production of lemons and oranges is severely affected due to\ndiseases in its growth stages. Fruit quality has degraded due to the presence\nof flaws. Thus, it is necessary to diagnose the disease accurately so that we\ncan avoid major loss of lemons and oranges. To improve citrus farming, we\nproposed a disease classification approach for lemons and oranges. This\napproach would enable early disease detection and intervention, reduce yield\nlosses, and optimize resource allocation. For the initial modeling of disease\nclassification, the research uses innovative deep learning architectures such\nas VGG16, VGG19 and ResNet50. In addition, for achieving better accuracy, the\nbasic machine learning algorithms used for classification problems include\nRandom Forest, Naive Bayes, K-Nearest Neighbors (KNN) and Logistic Regression.\nThe lemon and orange fruits diseases are classified more accurately (95.0% for\nlemon and 99.69% for orange) by the model. The model's base features were\nextracted from the ResNet50 pre-trained model and the diseases are classified\nby the Logistic Regression which beats the performance given by VGG16 and VGG19\nfor other classifiers. Experimental outcomes show that the proposed model also\noutperforms existing models in which most of them classified the diseases using\nthe Softmax classifier without using any individual classifiers.\n","authors":["Khandoker Nosiba Arifin","Sayma Akter Rupa","Md Musfique Anwar","Israt Jahan"],"pdf_url":"https://arxiv.org/pdf/2408.14206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10566v4","updated":"2024-09-27T06:32:01Z","published":"2024-08-20T06:05:52Z","title":"Overcoming Growth-Induced Forgetting in Task-Agnostic Continual Learning","summary":" In continual learning (CL), model growth enhances adaptability over new data,\nimproving knowledge retention for more tasks. However, improper model growth\ncan lead to severe degradation of previously learned knowledge, an issue we\nname as growth-induced forgetting (GIFt), especially in task-agnostic CL using\nentire grown model for inference. Existing works, despite adopting model growth\nand random initialization for better adaptability, often fail to recognize the\npresence of GIFt caused by improper model growth. This oversight limits\ncomprehensive control of forgetting and hinders full utilization of model\ngrowth. We are the first in CL to identify this issue and conduct an in-depth\nstudy on root cause of GIFt, where layer expansion stands out among model\ngrowth strategies, widening layers without affecting model functionality. Yet,\ndirect adoption of layer expansion presents challenges. It lacks data-driven\ncontrol and initialization of expanded parameters to balance adaptability and\nknowledge retention. This paper presents a novel SparseGrow approach to\novercome the issue of GIFt while enhancing adaptability over new data.\nSparseGrow employs data-driven sparse layer expansion to control efficient\nparameter usage during growth, reducing GIFt from excessive growth and\nfunctionality changes. It also combines sparse growth with on-data\ninitialization at training late-stage to create partially 0-valued expansions\nthat fit learned distribution, enhancing retention and adaptability. To further\nminimize forgetting, freezing is applied by calculating the sparse mask,\nallowing data-driven preservation of important parameters. Through experiments\nacross datasets with various settings, cases, and task numbers, we demonstrate\nthe necessity of layer expansion and showcase the effectiveness of SparseGrow\nin overcoming GIFt, highlighting its adaptability and knowledge retention for\nincremental tasks.\n","authors":["Yuqing Zhao","Divya Saxena","Jiannong Cao","Xiaoyun Liu","Changlin Song"],"pdf_url":"https://arxiv.org/pdf/2408.10566v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15898v3","updated":"2024-09-27T06:26:36Z","published":"2024-09-24T09:17:08Z","title":"FedRepOpt: Gradient Re-parametrized Optimizers in Federated Learning","summary":" Federated Learning (FL) has emerged as a privacy-preserving method for\ntraining machine learning models in a distributed manner on edge devices.\nHowever, on-device models face inherent computational power and memory\nlimitations, potentially resulting in constrained gradient updates. As the\nmodel's size increases, the frequency of gradient updates on edge devices\ndecreases, ultimately leading to suboptimal training outcomes during any\nparticular FL round. This limits the feasibility of deploying advanced and\nlarge-scale models on edge devices, hindering the potential for performance\nenhancements. To address this issue, we propose FedRepOpt, a gradient\nre-parameterized optimizer for FL. The gradient re-parameterized method allows\ntraining a simple local model with a similar performance as a complex model by\nmodifying the optimizer's gradients according to a set of model-specific\nhyperparameters obtained from the complex models. In this work, we focus on\nVGG-style and Ghost-style models in the FL environment. Extensive experiments\ndemonstrate that models using FedRepOpt obtain a significant boost in\nperformance of 16.7% and 11.4% compared to the RepGhost-style and RepVGG-style\nnetworks, while also demonstrating a faster convergence time of 11.7% and 57.4%\ncompared to their complex structure.\n","authors":["Kin Wai Lau","Yasar Abbas Ur Rehman","Pedro Porto Buarque de Gusmão","Lai-Man Po","Lan Ma","Yuyang Xie"],"pdf_url":"https://arxiv.org/pdf/2409.15898v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12842v3","updated":"2024-09-27T06:25:33Z","published":"2024-02-20T09:10:08Z","title":"PromptKD: Distilling Student-Friendly Knowledge for Generative Language\n Models via Prompt Tuning","summary":" Recent advancements in large language models (LLMs) have raised concerns\nabout inference costs, increasing the need for research into model compression.\nWhile knowledge distillation (KD) is a prominent method for this, research on\nKD for generative language models like LLMs is relatively sparse, and the\napproach of distilling student-friendly knowledge, which has shown promising\nperformance in KD for classification models, remains unexplored in generative\nlanguage models. To explore this approach, we propose PromptKD, a simple yet\neffective method that utilizes prompt tuning - for the first time in KD - to\nenable generative language models to transfer student-friendly knowledge.\nUnlike previous works in classification that require fine-tuning the entire\nteacher model for extracting student-friendly knowledge, PromptKD achieves\nsimilar effects by adding a small number of prompt tokens and tuning only the\nprompt with student guidance. Extensive experiments on instruction-following\ndatasets show that PromptKD achieves state-of-the-art performance while adding\nonly 0.0007% of the teacher's parameters as prompts. Further analysis suggests\nthat distilling student-friendly knowledge alleviates exposure bias effectively\nthroughout the entire training process, leading to performance enhancements.\n","authors":["Gyeongman Kim","Doohyuk Jang","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2402.12842v3.pdf","comment":"EMNLP 2024 Findings. Our project page: https://promptkd.github.io"},{"id":"http://arxiv.org/abs/2409.18472v1","updated":"2024-09-27T06:18:55Z","published":"2024-09-27T06:18:55Z","title":"URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological\n and Multilingual Knowledge Base","summary":" URIEL is a knowledge base offering geographical, phylogenetic, and\ntypological vector representations for 7970 languages. It includes distance\nmeasures between these vectors for 4005 languages, which are accessible via the\nlang2vec tool. Despite being frequently cited, URIEL is limited in terms of\nlinguistic inclusion and overall usability. To tackle these challenges, we\nintroduce URIEL+, an enhanced version of URIEL and lang2vec addressing these\nlimitations. In addition to expanding typological feature coverage for 2898\nlanguages, URIEL+ improves user experience with robust, customizable distance\ncalculations to better suit the needs of the users. These upgrades also offer\ncompetitive performance on downstream tasks and provide distances that better\nalign with linguistic distance studies.\n","authors":["Aditya Khan","Mason Shipton","David Anugraha","Kaiyao Duan","Phuong H. Hoang","Eric Khiu","A. Seza Doğruöz","En-Shiun Annie Lee"],"pdf_url":"https://arxiv.org/pdf/2409.18472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18470v1","updated":"2024-09-27T06:16:14Z","published":"2024-09-27T06:16:14Z","title":"Fairness without Sensitive Attributes via Knowledge Sharing","summary":" While model fairness improvement has been explored previously, existing\nmethods invariably rely on adjusting explicit sensitive attribute values in\norder to improve model fairness in downstream tasks. However, we observe a\ntrend in which sensitive demographic information becomes inaccessible as public\nconcerns around data privacy grow. In this paper, we propose a confidence-based\nhierarchical classifier structure called \"Reckoner\" for reliable fair model\nlearning under the assumption of missing sensitive attributes. We first present\nresults showing that if the dataset contains biased labels or other hidden\nbiases, classifiers significantly increase the bias gap across different\ndemographic groups in the subset with higher prediction confidence. Inspired by\nthese findings, we devised a dual-model system in which a version of the model\ninitialised with a high-confidence data subset learns from a version of the\nmodel initialised with a low-confidence data subset, enabling it to avoid\nbiased predictions. Our experimental results show that Reckoner consistently\noutperforms state-of-the-art baselines in COMPAS dataset and New Adult dataset,\nconsidering both accuracy and fairness metrics.\n","authors":["Hongliang Ni","Lei Han","Tong Chen","Shazia Sadiq","Gianluca Demartini"],"pdf_url":"https://arxiv.org/pdf/2409.18470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18467v1","updated":"2024-09-27T06:12:31Z","published":"2024-09-27T06:12:31Z","title":"A TextGCN-Based Decoding Approach for Improving Remote Sensing Image\n Captioning","summary":" Remote sensing images are highly valued for their ability to address complex\nreal-world issues such as risk management, security, and meteorology. However,\nmanually captioning these images is challenging and requires specialized\nknowledge across various domains. This letter presents an approach for\nautomatically describing (captioning) remote sensing images. We propose a novel\nencoder-decoder setup that deploys a Text Graph Convolutional Network (TextGCN)\nand multi-layer LSTMs. The embeddings generated by TextGCN enhance the\ndecoder's understanding by capturing the semantic relationships among words at\nboth the sentence and corpus levels. Furthermore, we advance our approach with\na comparison-based beam search method to ensure fairness in the search strategy\nfor generating the final caption. We present an extensive evaluation of our\napproach against various other state-of-the-art encoder-decoder frameworks. We\nevaluated our method across three datasets using seven metrics: BLEU-1 to\nBLEU-4, METEOR, ROUGE-L, and CIDEr. The results demonstrate that our approach\nsignificantly outperforms other state-of-the-art encoder-decoder methods.\n","authors":["Swadhin Das","Raksha Sharma"],"pdf_url":"https://arxiv.org/pdf/2409.18467v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.16245v2","updated":"2024-09-27T06:09:41Z","published":"2024-08-29T03:56:40Z","title":"Large-Scale Multi-omic Biosequence Transformers for Modeling\n Peptide-Nucleotide Interactions","summary":" The transformer architecture has revolutionized bioinformatics and driven\nprogress in the understanding and prediction of the properties of biomolecules.\nAlmost all research on large-scale biosequence transformers has focused on one\ndomain at a time (single-omic), usually nucleotides or peptides. These models\nhave seen incredible success in downstream tasks in each domain and have\nachieved particularly noteworthy breakthroughs in sequences of peptides and\nstructural modeling. However, these single-omic models are naturally incapable\nof modeling multi-omic tasks, one of the most biologically critical being\nnucleotide-peptide interactions.\n We present our work training the first multi-omic nucleotide-peptide\nfoundation models. We show that these multi-omic models (MOMs) can learn joint\nrepresentations between various single-omic distributions that are emergently\nconsistent with the Central Dogma of molecular biology, despite only being\ntrained on unlabeled biosequences. We further demonstrate that MOMs can be\nfine-tuned to achieve state-of-the-art results on peptide-nucleotide\ninteraction tasks, namely predicting the change in Gibbs free energy\n({\\Delta}G) of the binding interaction between a given oligonucleotide and\npeptide, as well as the effect on this binding interaction due to mutations in\nthe oligonucleotide sequence ({\\Delta}{\\Delta}G).\n Remarkably, we show that multi-omic biosequence transformers emergently learn\nuseful structural information without any prior structural training, allowing\nus to predict which peptide residues are most involved in the\npeptide-nucleotide binding interaction. Lastly, we provide evidence that\nmulti-omic biosequence models are non-inferior to foundation models trained on\nsingle-omics distributions, suggesting a more generalized or foundational\napproach to building these models.\n","authors":["Sully F. Chen","Robert J. Steele","Beakal Lemeneh","Shivanand P. Lad","Eric Oermann"],"pdf_url":"https://arxiv.org/pdf/2408.16245v2.pdf","comment":"27 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18462v1","updated":"2024-09-27T05:50:29Z","published":"2024-09-27T05:50:29Z","title":"Latent Representation Learning for Multimodal Brain Activity Translation","summary":" Neuroscience employs diverse neuroimaging techniques, each offering distinct\ninsights into brain activity, from electrophysiological recordings such as EEG,\nwhich have high temporal resolution, to hemodynamic modalities such as fMRI,\nwhich have increased spatial precision. However, integrating these\nheterogeneous data sources remains a challenge, which limits a comprehensive\nunderstanding of brain function. We present the Spatiotemporal Alignment of\nMultimodal Brain Activity (SAMBA) framework, which bridges the spatial and\ntemporal resolution gaps across modalities by learning a unified latent space\nfree of modality-specific biases. SAMBA introduces a novel attention-based\nwavelet decomposition for spectral filtering of electrophysiological\nrecordings, graph attention networks to model functional connectivity between\nfunctional brain units, and recurrent layers to capture temporal\nautocorrelations in brain signal. We show that the training of SAMBA, aside\nfrom achieving translation, also learns a rich representation of brain\ninformation processing. We showcase this classify external stimuli driving\nbrain activity from the representation learned in hidden layers of SAMBA,\npaving the way for broad downstream applications in neuroscience research and\nclinical contexts.\n","authors":["Arman Afrasiyabi","Dhananjay Bhaskar","Erica L. Busch","Laurent Caplette","Rahul Singh","Guillaume Lajoie","Nicholas B. Turk-Browne","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2409.18462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18461v1","updated":"2024-09-27T05:49:48Z","published":"2024-09-27T05:49:48Z","title":"Towards Diverse Device Heterogeneous Federated Learning via Task\n Arithmetic Knowledge Integration","summary":" Federated Learning has emerged as a promising paradigm for collaborative\nmachine learning, while preserving user data privacy. Despite its potential,\nstandard FL lacks support for diverse heterogeneous device prototypes, which\nvary significantly in model and dataset sizes -- from small IoT devices to\nlarge workstations. This limitation is only partially addressed by existing\nknowledge distillation techniques, which often fail to transfer knowledge\neffectively across a broad spectrum of device prototypes with varied\ncapabilities. This failure primarily stems from two issues: the dilution of\ninformative logits from more capable devices by those from less capable ones,\nand the use of a single integrated logits as the distillation target across all\ndevices, which neglects their individual learning capacities and and the unique\ncontributions of each. To address these challenges, we introduce TAKFL, a novel\nKD-based framework that treats the knowledge transfer from each device\nprototype's ensemble as a separate task, independently distilling each to\npreserve its unique contributions and avoid dilution. TAKFL also incorporates a\nKD-based self-regularization technique to mitigate the issues related to the\nnoisy and unsupervised ensemble distillation process. To integrate the\nseparately distilled knowledge, we introduce an adaptive task arithmetic\nknowledge integration process, allowing each student model to customize the\nknowledge integration for optimal performance. Additionally, we present\ntheoretical results demonstrating the effectiveness of task arithmetic in\ntransferring knowledge across heterogeneous devices with varying capacities.\nComprehensive evaluations of our method across both CV and NLP tasks\ndemonstrate that TAKFL achieves SOTA results in a variety of datasets and\nsettings, significantly outperforming existing KD-based methods. Code is\nreleased at https://github.com/MMorafah/TAKFL\n","authors":["Mahdi Morafah","Vyacheslav Kungurtsev","Hojin Chang","Chen Chen","Bill Lin"],"pdf_url":"https://arxiv.org/pdf/2409.18461v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18455v1","updated":"2024-09-27T05:31:04Z","published":"2024-09-27T05:31:04Z","title":"Review of Digital Asset Development with Graph Neural Network Unlearning","summary":" In the rapidly evolving landscape of digital assets, the imperative for\nrobust data privacy and compliance with regulatory frameworks has intensified.\nThis paper investigates the critical role of Graph Neural Networks (GNNs) in\nthe management of digital assets and introduces innovative unlearning\ntechniques specifically tailored to GNN architectures. We categorize unlearning\nstrategies into two primary classes: data-driven approximation, which\nmanipulates the graph structure to isolate and remove the influence of specific\nnodes, and model-driven approximation, which modifies the internal parameters\nand architecture of the GNN itself. By examining recent advancements in these\nunlearning methodologies, we highlight their applicability in various use\ncases, including fraud detection, risk assessment, token relationship\nprediction, and decentralized governance. We discuss the challenges inherent in\nbalancing model performance with the requirements for data unlearning,\nparticularly in the context of real-time financial applications. Furthermore,\nwe propose a hybrid approach that combines the strengths of both unlearning\nstrategies to enhance the efficiency and effectiveness of GNNs in digital asset\necosystems. Ultimately, this paper aims to provide a comprehensive framework\nfor understanding and implementing GNN unlearning techniques, paving the way\nfor secure and compliant deployment of machine learning in the digital asset\ndomain.\n","authors":["Zara Lisbon"],"pdf_url":"https://arxiv.org/pdf/2409.18455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17144v2","updated":"2024-09-27T05:29:27Z","published":"2024-04-26T04:21:14Z","title":"Sensor Response-Time Reduction using Long-Short Term Memory Network\n Forecasting","summary":" The response time of a biosensor is a crucial metric in safety-critical\napplications such as medical diagnostics where an earlier diagnosis can\nmarkedly improve patient outcomes. However, the speed at which a biosensor\nreaches a final equilibrium state can be limited by poor mass transport and\nlong molecular diffusion times that increase the time it takes target molecules\nto reach the active sensing region of a biosensor. While optimization of system\nand sensor design can promote molecules reaching the sensing element faster, a\nsimpler and complementary approach for response time reduction that is widely\napplicable across all sensor platforms is to use time-series forecasting to\npredict the ultimate steady-state sensor response. In this work, we show that\nensembles of long short-term memory (LSTM) networks can accurately predict\nequilibrium biosensor response from a small quantity of initial time-dependent\nbiosensor measurements, allowing for significant reduction in response time by\na mean and median factor of improvement of 18.6 and 5.1 respectively. The\nensemble of models simultaneously estimates uncertainty, which is vital for\nensuring confidence in the predictions and subsequent safety-related decisions\nthat are made. This approach is demonstrated on real-time experimental data\ncollected by exposing porous silicon biosensors to buffered protein solutions\nusing a multi-channel fluidic cell that enables the automated measurement of\n100 porous silicon biosensors in parallel. The dramatic improvement in sensor\nresponse time achieved using LSTM network ensembles and associated uncertainty\nquantification opens the door to trustworthy and faster responding biosensors,\nenabling more rapid medical diagnostics for faster clinical decision making\nthat can lead to improved patient outcomes and healthcare access, as well as\nquicker identification of toxins in food and the environment.\n","authors":["Simon J. Ward","Muhamed Baljevic","Sharon M. Weiss"],"pdf_url":"https://arxiv.org/pdf/2404.17144v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.12372v2","updated":"2024-09-27T05:11:06Z","published":"2024-03-19T02:32:47Z","title":"Learning Transferable Time Series Classifier with Cross-Domain\n Pre-training from Language Model","summary":" Advancements in self-supervised pre-training (SSL) have significantly\nadvanced the field of learning transferable time series representations, which\ncan be very useful in enhancing the downstream task. Despite being effective,\nmost existing works struggle to achieve cross-domain SSL pre-training, missing\nvaluable opportunities to integrate patterns and features from different\ndomains. The main challenge lies in the significant differences in the\ncharacteristics of time-series data across different domains, such as\nvariations in the number of channels and temporal resolution scales. To address\nthis challenge, we propose CrossTimeNet, a novel cross-domain SSL learning\nframework to learn transferable knowledge from various domains to largely\nbenefit the target downstream task. One of the key characteristics of\nCrossTimeNet is the newly designed time series tokenization module, which could\neffectively convert the raw time series into a sequence of discrete tokens\nbased on a reconstruction optimization process. Besides, we highlight that\npredicting a high proportion of corrupted tokens can be very helpful for\nextracting informative patterns across different domains during SSL\npre-training, which has been largely overlooked in past years. Furthermore,\nunlike previous works, our work treats the pre-training language model (PLM) as\nthe initialization of the encoder network, investigating the feasibility of\ntransferring the knowledge learned by the PLM to the time series area. Through\nthese efforts, the path to cross-domain pre-training of a generic time series\nmodel can be effectively paved. We conduct extensive experiments in a\nreal-world scenario across various time series classification domains. The\nexperimental results clearly confirm CrossTimeNet's superior performance.\n","authors":["Mingyue Cheng","Xiaoyu Tao","Qi Liu","Hao Zhang","Yiheng Chen","Chenyi Lei"],"pdf_url":"https://arxiv.org/pdf/2403.12372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18448v1","updated":"2024-09-27T05:10:05Z","published":"2024-09-27T05:10:05Z","title":"Hierarchical Federated Learning with Multi-Timescale Gradient Correction","summary":" While traditional federated learning (FL) typically focuses on a star\ntopology where clients are directly connected to a central server, real-world\ndistributed systems often exhibit hierarchical architectures. Hierarchical FL\n(HFL) has emerged as a promising solution to bridge this gap, leveraging\naggregation points at multiple levels of the system. However, existing\nalgorithms for HFL encounter challenges in dealing with multi-timescale model\ndrift, i.e., model drift occurring across hierarchical levels of data\nheterogeneity. In this paper, we propose a multi-timescale gradient correction\n(MTGC) methodology to resolve this issue. Our key idea is to introduce distinct\ncontrol variables to (i) correct the client gradient towards the group\ngradient, i.e., to reduce client model drift caused by local updates based on\nindividual datasets, and (ii) correct the group gradient towards the global\ngradient, i.e., to reduce group model drift caused by FL over clients within\nthe group. We analytically characterize the convergence behavior of MTGC under\ngeneral non-convex settings, overcoming challenges associated with couplings\nbetween correction terms. We show that our convergence bound is immune to the\nextent of data heterogeneity, confirming the stability of the proposed\nalgorithm against multi-level non-i.i.d. data. Through extensive experiments on\nvarious datasets and models, we validate the effectiveness of MTGC in diverse\nHFL settings. The code for this project is available at\n\\href{https://github.com/wenzhifang/MTGC}{https://github.com/wenzhifang/MTGC}.\n","authors":["Wenzhi Fang","Dong-Jun Han","Evan Chen","Shiqiang Wang","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2409.18448v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.09858v2","updated":"2024-09-27T04:50:00Z","published":"2024-09-15T20:41:18Z","title":"A Survey of Out-of-distribution Generalization for Graph Machine\n Learning from a Causal View","summary":" Graph machine learning (GML) has been successfully applied across a wide\nrange of tasks. Nonetheless, GML faces significant challenges in generalizing\nover out-of-distribution (OOD) data, which raises concerns about its wider\napplicability. Recent advancements have underscored the crucial role of\ncausality-driven approaches in overcoming these generalization challenges.\nDistinct from traditional GML methods that primarily rely on statistical\ndependencies, causality-focused strategies delve into the underlying causal\nmechanisms of data generation and model prediction, thus significantly\nimproving the generalization of GML across different environments. This paper\noffers a thorough review of recent progress in causality-involved GML\ngeneralization. We elucidate the fundamental concepts of employing causality to\nenhance graph model generalization and categorize the various approaches,\nproviding detailed descriptions of their methodologies and the connections\namong them. Furthermore, we explore the incorporation of causality in other\nrelated important areas of trustworthy GML, such as explanation, fairness, and\nrobustness. Concluding with a discussion on potential future research\ndirections, this review seeks to articulate the continuing development and\nfuture potential of causality in enhancing the trustworthiness of graph machine\nlearning.\n","authors":["Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2409.09858v2.pdf","comment":"15 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2405.15861v3","updated":"2024-09-27T04:43:20Z","published":"2024-05-24T18:07:05Z","title":"Achieving Dimension-Free Communication in Federated Learning via\n Zeroth-Order Optimization","summary":" Federated Learning (FL) offers a promising framework for collaborative and\nprivacy-preserving machine learning across distributed data sources. However,\nthe substantial communication costs associated with FL significantly challenge\nits efficiency. Specifically, in each communication round, the communication\ncosts scale linearly with the model's dimension, which presents a formidable\nobstacle, especially in large model scenarios. Despite various\ncommunication-efficient strategies, the intrinsic dimension-dependent\ncommunication cost remains a major bottleneck for current FL implementations.\nThis paper proposes a novel dimension-free communication algorithm -- DeComFL,\nwhich leverages the zeroth-order optimization techniques and reduces the\ncommunication cost from $\\mathscr{O}(d)$ to $\\mathscr{O}(1)$ by transmitting\nonly a constant number of scalar values between clients and the server in each\nround, regardless of the dimension $d$ of the model parameters. Theoretically,\nin non-convex functions, we prove that our algorithm achieves state-of-the-art\nrates, which show a linear speedup of the number of clients and local steps\nunder standard assumptions. With additional low effective rank assumption, we\ncan further show the convergence rate is independent of the model dimension $d$\nas well. Empirical evaluations, encompassing both classic deep learning\ntraining and large language model fine-tuning, demonstrate significant\nreductions in communication overhead. Notably, DeComFL achieves this by\ntransmitting only around 1MB of data in total between the server and a client\nto fine-tune a model with billions of parameters.\n","authors":["Zhe Li","Bicheng Ying","Zidong Liu","Chaosheng Dong","Haibo Yang"],"pdf_url":"https://arxiv.org/pdf/2405.15861v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18442v1","updated":"2024-09-27T04:38:14Z","published":"2024-09-27T04:38:14Z","title":"Gradient-free Decoder Inversion in Latent Diffusion Models","summary":" In latent diffusion models (LDMs), denoising diffusion process efficiently\ntakes place on latent space whose dimension is lower than that of pixel space.\nDecoder is typically used to transform the representation in latent space to\nthat in pixel space. While a decoder is assumed to have an encoder as an\naccurate inverse, exact encoder-decoder pair rarely exists in practice even\nthough applications often require precise inversion of decoder. Prior works for\ndecoder inversion in LDMs employed gradient descent inspired by inversions of\ngenerative adversarial networks. However, gradient-based methods require larger\nGPU memory and longer computation time for larger latent space. For example,\nrecent video LDMs can generate more than 16 frames, but GPUs with 24 GB memory\ncan only perform gradient-based decoder inversion for 4 frames. Here, we\npropose an efficient gradient-free decoder inversion for LDMs, which can be\napplied to diverse latent models. Theoretical convergence property of our\nproposed inversion has been investigated not only for the forward step method,\nbut also for the inertial Krasnoselskii-Mann (KM) iterations under mild\nassumption on cocoercivity that is satisfied by recent LDMs. Our proposed\ngradient-free method with Adam optimizer and learning rate scheduling\nsignificantly reduced computation time and memory usage over prior\ngradient-based methods and enabled efficient computation in applications such\nas noise-space watermarking while achieving comparable error levels.\n","authors":["Seongmin Hong","Suh Yoon Jeon","Kyeonghyun Lee","Ernest K. Ryu","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2409.18442v1.pdf","comment":"19 pages, Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18439v1","updated":"2024-09-27T04:28:19Z","published":"2024-09-27T04:28:19Z","title":"State-free Reinforcement Learning","summary":" In this work, we study the \\textit{state-free RL} problem, where the\nalgorithm does not have the states information before interacting with the\nenvironment. Specifically, denote the reachable state set by ${S}^\\Pi := \\{\ns|\\max_{\\pi\\in \\Pi}q^{P, \\pi}(s)>0 \\}$, we design an algorithm which requires\nno information on the state space $S$ while having a regret that is completely\nindependent of ${S}$ and only depend on ${S}^\\Pi$. We view this as a concrete\nfirst step towards \\textit{parameter-free RL}, with the goal of designing RL\nalgorithms that require no hyper-parameter tuning.\n","authors":["Mingyu Chen","Aldo Pacchiano","Xuezhou Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18435v1","updated":"2024-09-27T03:57:54Z","published":"2024-09-27T03:57:54Z","title":"Multi-agent Reinforcement Learning for Dynamic Dispatching in Material\n Handling Systems","summary":" This paper proposes a multi-agent reinforcement learning (MARL) approach to\nlearn dynamic dispatching strategies, which is crucial for optimizing\nthroughput in material handling systems across diverse industries. To benchmark\nour method, we developed a material handling environment that reflects the\ncomplexities of an actual system, such as various activities at different\nlocations, physical constraints, and inherent uncertainties. To enhance\nexploration during learning, we propose a method to integrate domain knowledge\nin the form of existing dynamic dispatching heuristics. Our experimental\nresults show that our method can outperform heuristics by up to 7.4 percent in\nterms of median throughput. Additionally, we analyze the effect of different\narchitectures on MARL performance when training multiple agents with different\nfunctions. We also demonstrate that the MARL agents performance can be further\nimproved by using the first iteration of MARL agents as heuristics to train a\nsecond iteration of MARL agents. This work demonstrates the potential of\napplying MARL to learn effective dynamic dispatching strategies that may be\ndeployed in real-world systems to improve business outcomes.\n","authors":["Xian Yeow Lee","Haiyan Wang","Daisuke Katsumata","Takaharu Matsui","Chetan Gupta"],"pdf_url":"https://arxiv.org/pdf/2409.18435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18433v1","updated":"2024-09-27T03:49:56Z","published":"2024-09-27T03:49:56Z","title":"Easy2Hard-Bench: Standardized Difficulty Labels for Profiling LLM\n Performance and Generalization","summary":" While generalization over tasks from easy to hard is crucial to profile\nlanguage models (LLMs), the datasets with fine-grained difficulty annotations\nfor each problem across a broad range of complexity are still blank. Aiming to\naddress this limitation, we present Easy2Hard-Bench, a consistently formatted\ncollection of 6 benchmark datasets spanning various domains, such as\nmathematics and programming problems, chess puzzles, and reasoning questions.\nEach problem within these datasets is annotated with numerical difficulty\nscores. To systematically estimate problem difficulties, we collect abundant\nperformance data on attempts to each problem by humans in the real world or\nLLMs on the prominent leaderboard. Leveraging the rich performance data, we\napply well-established difficulty ranking systems, such as Item Response Theory\n(IRT) and Glicko-2 models, to uniformly assign numerical difficulty scores to\nproblems. Moreover, datasets in Easy2Hard-Bench distinguish themselves from\nprevious collections by a higher proportion of challenging problems. Through\nextensive experiments with six state-of-the-art LLMs, we provide a\ncomprehensive analysis of their performance and generalization capabilities\nacross varying levels of difficulty, with the aim of inspiring future research\nin LLM generalization. The datasets are available at\nhttps://huggingface.co/datasets/furonghuang-lab/Easy2Hard-Bench.\n","authors":["Mucong Ding","Chenghao Deng","Jocelyn Choo","Zichu Wu","Aakriti Agrawal","Avi Schwarzschild","Tianyi Zhou","Tom Goldstein","John Langford","Anima Anandkumar","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2409.18433v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2408.05586v2","updated":"2024-09-27T03:38:36Z","published":"2024-08-10T16:09:51Z","title":"Meta Clustering of Neural Bandits","summary":" The contextual bandit has been identified as a powerful framework to\nformulate the recommendation process as a sequential decision-making process,\nwhere each item is regarded as an arm and the objective is to minimize the\nregret of $T$ rounds. In this paper, we study a new problem, Clustering of\nNeural Bandits, by extending previous work to the arbitrary reward function, to\nstrike a balance between user heterogeneity and user correlations in the\nrecommender system. To solve this problem, we propose a novel algorithm called\nM-CNB, which utilizes a meta-learner to represent and rapidly adapt to dynamic\nclusters, along with an informative Upper Confidence Bound (UCB)-based\nexploration strategy. We provide an instance-dependent performance guarantee\nfor the proposed algorithm that withstands the adversarial context, and we\nfurther prove the guarantee is at least as good as state-of-the-art (SOTA)\napproaches under the same assumptions. In extensive experiments conducted in\nboth recommendation and online classification scenarios, M-CNB outperforms SOTA\nbaselines. This shows the effectiveness of the proposed approach in improving\nonline recommendation and online classification performance.\n","authors":["Yikun Ban","Yunzhe Qi","Tianxin Wei","Lihui Liu","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2408.05586v2.pdf","comment":"Accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2405.06119v2","updated":"2024-09-27T03:35:49Z","published":"2024-05-09T21:53:27Z","title":"Gradient Flow Based Phase-Field Modeling Using Separable Neural Networks","summary":" The $L^2$ gradient flow of the Ginzburg-Landau free energy functional leads\nto the Allen Cahn equation that is widely used for modeling phase separation.\nMachine learning methods for solving the Allen-Cahn equation in its strong form\nsuffer from inaccuracies in collocation techniques, errors in computing\nhigher-order spatial derivatives through automatic differentiation, and the\nlarge system size required by the space-time approach. To overcome these\nlimitations, we propose a separable neural network-based approximation of the\nphase field in a minimizing movement scheme to solve the aforementioned\ngradient flow problem. At each time step, the separable neural network is used\nto approximate the phase field in space through a low-rank tensor decomposition\nthereby accelerating the derivative calculations. The minimizing movement\nscheme naturally allows for the use of Gauss quadrature technique to compute\nthe functional. A `$tanh$' transformation is applied on the neural\nnetwork-predicted phase field to strictly bounds the solutions within the\nvalues of the two phases. For this transformation, a theoretical guarantee for\nenergy stability of the minimizing movement scheme is established. Our results\nsuggest that bounding the solution through this transformation is the key to\neffectively model sharp interfaces through separable neural network. The\nproposed method outperforms the state-of-the-art machine learning methods for\nphase separation problems and is an order of magnitude faster than the finite\nelement method.\n","authors":["Revanth Mattey","Susanta Ghosh"],"pdf_url":"https://arxiv.org/pdf/2405.06119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18427v1","updated":"2024-09-27T03:28:11Z","published":"2024-09-27T03:28:11Z","title":"Neural Collaborative Filtering to Detect Anomalies in Human Semantic\n Trajectories","summary":" Human trajectory anomaly detection has become increasingly important across a\nwide range of applications, including security surveillance and public health.\nHowever, existing trajectory anomaly detection methods are primarily focused on\nvehicle-level traffic, while human-level trajectory anomaly detection remains\nunder-explored. Since human trajectory data is often very sparse, machine\nlearning methods have become the preferred approach for identifying complex\npatterns. However, concerns regarding potential biases and the robustness of\nthese models have intensified the demand for more transparent and explainable\nalternatives. In response to these challenges, our research focuses on\ndeveloping a lightweight anomaly detection model specifically designed to\ndetect anomalies in human trajectories. We propose a Neural Collaborative\nFiltering approach to model and predict normal mobility. Our method is designed\nto model users' daily patterns of life without requiring prior knowledge,\nthereby enhancing performance in scenarios where data is sparse or incomplete,\nsuch as in cold start situations. Our algorithm consists of two main modules.\nThe first is the collaborative filtering module, which applies collaborative\nfiltering to model normal mobility of individual humans to places of interest.\nThe second is the neural module, responsible for interpreting the complex\nspatio-temporal relationships inherent in human trajectory data. To validate\nour approach, we conducted extensive experiments using simulated and real-world\ndatasets comparing to numerous state-of-the-art trajectory anomaly detection\napproaches.\n","authors":["Yueyang Liu","Lance Kennedy","Hossein Amiri","Andreas Züfle"],"pdf_url":"https://arxiv.org/pdf/2409.18427v1.pdf","comment":"Accepted for publication in the 1st ACM SIGSPATIAL International\n Workshop on Geospatial Anomaly Detection (GeoAnomalies'24)"},{"id":"http://arxiv.org/abs/2409.18426v1","updated":"2024-09-27T03:27:46Z","published":"2024-09-27T03:27:46Z","title":"Dual Cone Gradient Descent for Training Physics-Informed Neural Networks","summary":" Physics-informed neural networks (PINNs) have emerged as a prominent approach\nfor solving partial differential equations (PDEs) by minimizing a combined loss\nfunction that incorporates both boundary loss and PDE residual loss. Despite\ntheir remarkable empirical performance in various scientific computing tasks,\nPINNs often fail to generate reasonable solutions, and such pathological\nbehaviors remain difficult to explain and resolve. In this paper, we identify\nthat PINNs can be adversely trained when gradients of each loss function\nexhibit a significant imbalance in their magnitudes and present a negative\ninner product value. To address these issues, we propose a novel optimization\nframework, Dual Cone Gradient Descent (DCGD), which adjusts the direction of\nthe updated gradient to ensure it falls within a dual cone region. This region\nis defined as a set of vectors where the inner products with both the gradients\nof the PDE residual loss and the boundary loss are non-negative. Theoretically,\nwe analyze the convergence properties of DCGD algorithms in a non-convex\nsetting. On a variety of benchmark equations, we demonstrate that DCGD\noutperforms other optimization algorithms in terms of various evaluation\nmetrics. In particular, DCGD achieves superior predictive accuracy and enhances\nthe stability of training for failure modes of PINNs and complex PDEs, compared\nto existing optimally tuned models. Moreover, DCGD can be further improved by\ncombining it with popular strategies for PINNs, including learning rate\nannealing and the Neural Tangent Kernel (NTK).\n","authors":["Youngsik Hwang","Dong-Young Lim"],"pdf_url":"https://arxiv.org/pdf/2409.18426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18423v1","updated":"2024-09-27T03:26:38Z","published":"2024-09-27T03:26:38Z","title":"A physics-driven sensor placement optimization methodology for\n temperature field reconstruction","summary":" Perceiving the global field from sparse sensors has been a grand challenge in\nthe monitoring, analysis, and design of physical systems. In this context,\nsensor placement optimization is a crucial issue. Most existing works require\nlarge and sufficient data to construct data-based criteria, which are\nintractable in data-free scenarios without numerical and experimental data. To\nthis end, we propose a novel physics-driven sensor placement optimization\n(PSPO) method for temperature field reconstruction using a physics-based\ncriterion to optimize sensor locations. In our methodological framework, we\nfirstly derive the theoretical upper and lower bounds of the reconstruction\nerror under noise scenarios by analyzing the optimal solution, proving that\nerror bounds correlate with the condition number determined by sensor\nlocations. Furthermore, the condition number, as the physics-based criterion,\nis used to optimize sensor locations by the genetic algorithm. Finally, the\nbest sensors are validated by reconstruction models, including non-invasive\nend-to-end models, non-invasive reduced-order models, and physics-informed\nmodels. Experimental results, both on a numerical and an application case,\ndemonstrate that the PSPO method significantly outperforms random and uniform\nselection methods, improving the reconstruction accuracy by nearly an order of\nmagnitude. Moreover, the PSPO method can achieve comparable reconstruction\naccuracy to the existing data-driven placement optimization methods.\n","authors":["Xu Liu","Wen Yao","Wei Peng","Zhuojia Fu","Zixue Xiang","Xiaoqian Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16852v2","updated":"2024-09-27T03:25:44Z","published":"2024-08-29T18:34:59Z","title":"The Star Geometry of Critic-Based Regularizer Learning","summary":" Variational regularization is a classical technique to solve statistical\ninference tasks and inverse problems, with modern data-driven approaches\nparameterizing regularizers via deep neural networks showcasing impressive\nempirical performance. Recent works along these lines learn task-dependent\nregularizers. This is done by integrating information about the measurements\nand ground-truth data in an unsupervised, critic-based loss function, where the\nregularizer attributes low values to likely data and high values to unlikely\ndata. However, there is little theory about the structure of regularizers\nlearned via this process and how it relates to the two data distributions. To\nmake progress on this challenge, we initiate a study of optimizing critic-based\nloss functions to learn regularizers over a particular family of regularizers:\ngauges (or Minkowski functionals) of star-shaped bodies. This family contains\nregularizers that are commonly employed in practice and shares properties with\nregularizers parameterized by deep neural networks. We specifically investigate\ncritic-based losses derived from variational representations of statistical\ndistances between probability measures. By leveraging tools from star geometry\nand dual Brunn-Minkowski theory, we illustrate how these losses can be\ninterpreted as dual mixed volumes that depend on the data distribution. This\nallows us to derive exact expressions for the optimal regularizer in certain\ncases. Finally, we identify which neural network architectures give rise to\nsuch star body gauges and when do such regularizers have favorable properties\nfor optimization. More broadly, this work highlights how the tools of star\ngeometry can aid in understanding the geometry of unsupervised regularizer\nlearning.\n","authors":["Oscar Leong","Eliza O'Reilly","Yong Sheng Soh"],"pdf_url":"https://arxiv.org/pdf/2408.16852v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.11016v2","updated":"2024-09-27T03:23:53Z","published":"2024-06-27T01:52:05Z","title":"LongLaMP: A Benchmark for Personalized Long-form Text Generation","summary":" Long-text generation is seemingly ubiquitous in real-world applications of\nlarge language models such as generating an email or writing a review. Despite\nthe fundamental importance and prevalence of long-text generation in many\npractical applications, existing work on personalized generation has focused on\nthe generation of very short text. To overcome these limitations, we study the\nproblem of personalized long-text generation, that is, generating long-text\nthat is personalized for a specific user while being practically useful for the\nvast majority of real-world applications that naturally require the generation\nof longer text. In this work, we demonstrate the importance of user-specific\npersonalization for long-text generation tasks and develop the Long-text\nLanguage Model Personalization (LongLaMP) Benchmark. LongLaMP provides a\ncomprehensive and diverse evaluation framework for personalized long-text\ngeneration. Extensive experiments on LongLaMP for zero-shot and fine-tuned\nlanguage tasks demonstrate the effectiveness of the proposed benchmark and its\nutility for developing and evaluating techniques for personalized long-text\ngeneration across a wide variety of long-text generation tasks. The results\nhighlight the importance of personalization across a wide variety of long-text\ngeneration tasks. Finally, we release the benchmark for others to use for this\nimportant problem.\n","authors":["Ishita Kumar","Snigdha Viswanathan","Sushrita Yerra","Alireza Salemi","Ryan A. Rossi","Franck Dernoncourt","Hanieh Deilamsalehy","Xiang Chen","Ruiyi Zhang","Shubham Agarwal","Nedim Lipka","Chein Van Nguyen","Thien Huu Nguyen","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2407.11016v2.pdf","comment":"9 pages, 4 figures, 20 tables(including appendix) submitted to EMNLP"},{"id":"http://arxiv.org/abs/2409.18419v1","updated":"2024-09-27T03:17:35Z","published":"2024-09-27T03:17:35Z","title":"Robust Network Learning via Inverse Scale Variational Sparsification","summary":" While neural networks have made significant strides in many AI tasks, they\nremain vulnerable to a range of noise types, including natural corruptions,\nadversarial noise, and low-resolution artifacts. Many existing approaches focus\non enhancing robustness against specific noise types, limiting their\nadaptability to others. Previous studies have addressed general robustness by\nadopting a spectral perspective, which tends to blur crucial features like\ntexture and object contours. Our proposed solution, however, introduces an\ninverse scale variational sparsification framework within a time-continuous\ninverse scale space formulation. This framework progressively learns\nfiner-scale features by discerning variational differences between pixels,\nultimately preserving only large-scale features in the smoothed image. Unlike\nfrequency-based methods, our approach not only removes noise by smoothing\nsmall-scale features where corruptions often occur but also retains\nhigh-contrast details such as textures and object contours. Moreover, our\nframework offers simplicity and efficiency in implementation. By integrating\nthis algorithm into neural network training, we guide the model to prioritize\nlearning large-scale features. We show the efficacy of our approach through\nenhanced robustness against various noise types.\n","authors":["Zhiling Zhou","Zirui Liu","Chengming Xu","Yanwei Fu","Xinwei Sun"],"pdf_url":"https://arxiv.org/pdf/2409.18419v1.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18418v1","updated":"2024-09-27T03:17:01Z","published":"2024-09-27T03:17:01Z","title":"A3: Active Adversarial Alignment for Source-Free Domain Adaptation","summary":" Unsupervised domain adaptation (UDA) aims to transfer knowledge from a\nlabeled source domain to an unlabeled target domain. Recent works have focused\non source-free UDA, where only target data is available. This is challenging as\nmodels rely on noisy pseudo-labels and struggle with distribution shifts. We\npropose Active Adversarial Alignment (A3), a novel framework combining\nself-supervised learning, adversarial training, and active learning for robust\nsource-free UDA. A3 actively samples informative and diverse data using an\nacquisition function for training. It adapts models via adversarial losses and\nconsistency regularization, aligning distributions without source data access.\nA3 advances source-free UDA through its synergistic integration of active and\nadversarial learning for effective domain alignment and noise reduction.\n","authors":["Chrisantus Eze","Christopher Crick"],"pdf_url":"https://arxiv.org/pdf/2409.18418v1.pdf","comment":"Accepted at ICMLA 2024"},{"id":"http://arxiv.org/abs/2409.18417v1","updated":"2024-09-27T03:15:07Z","published":"2024-09-27T03:15:07Z","title":"VickreyFeedback: Cost-efficient Data Construction for Reinforcement\n Learning from Human Feedback","summary":" This paper addresses the cost-efficiency aspect of Reinforcement Learning\nfrom Human Feedback (RLHF). RLHF leverages datasets of human preferences over\noutputs of large language models (LLM) to instill human expectations into LLMs.\nWhile preference annotation comes with a monetized cost, the economic utility\nof a preference dataset has not been considered by far. What exacerbates this\nsituation is that given complex intransitive or cyclic relationships in\npreference datasets, existing algorithms for fine-tuning LLMs are still far\nfrom capturing comprehensive preferences. This raises severe cost-efficiency\nconcerns in production environments, where preference data accumulate over\ntime. In this paper, we see the fine-tuning of LLMs as a monetized economy and\nintroduce an auction mechanism to improve the efficiency of the preference data\ncollection in dollar terms. We show that introducing an auction mechanism can\nplay an essential role in enhancing the cost-efficiency of RLHF while\nmaintaining satisfactory model performance. Experimental results demonstrate\nthat our proposed auction-based protocol is cost-efficient for fine-tuning LLMs\nby concentrating on high-quality feedback.\n","authors":["Guoxi Zhang","Jiuding Duan"],"pdf_url":"https://arxiv.org/pdf/2409.18417v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.13787v3","updated":"2024-09-27T03:07:05Z","published":"2024-08-25T09:30:34Z","title":"Mask-Encoded Sparsification: Mitigating Biased Gradients in\n Communication-Efficient Split Learning","summary":" This paper introduces a novel framework designed to achieve a high\ncompression ratio in Split Learning (SL) scenarios where resource-constrained\ndevices are involved in large-scale model training. Our investigations\ndemonstrate that compressing feature maps within SL leads to biased gradients\nthat can negatively impact the convergence rates and diminish the\ngeneralization capabilities of the resulting models. Our theoretical analysis\nprovides insights into how compression errors critically hinder SL performance,\nwhich previous methodologies underestimate. To address these challenges, we\nemploy a narrow bit-width encoded mask to compensate for the sparsification\nerror without increasing the order of time complexity. Supported by rigorous\ntheoretical analysis, our framework significantly reduces compression errors\nand accelerates the convergence. Extensive experiments also verify that our\nmethod outperforms existing solutions regarding training efficiency and\ncommunication complexity.\n","authors":["Wenxuan Zhou","Zhihao Qu","Shen-Huan Lyu","Miao Cai","Baoliu Ye"],"pdf_url":"https://arxiv.org/pdf/2408.13787v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03954v7","updated":"2024-09-27T02:43:48Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v7.pdf","comment":"Published at Robotics: Science and Systems (RSS) 2024. Videos, code,\n and data: https://3d-diffusion-policy.github.io"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2409.18052v2","updated":"2024-09-27T02:09:44Z","published":"2024-09-26T16:55:44Z","title":"Explaining Explaining","summary":" Explanation is key to people having confidence in high-stakes AI systems.\nHowever, machine-learning-based systems -- which account for almost all current\nAI -- can't explain because they are usually black boxes. The explainable AI\n(XAI) movement hedges this problem by redefining \"explanation\". The\nhuman-centered explainable AI (HCXAI) movement identifies the\nexplanation-oriented needs of users but can't fulfill them because of its\ncommitment to machine learning. In order to achieve the kinds of explanations\nneeded by real people operating in critical domains, we must rethink how to\napproach AI. We describe a hybrid approach to developing cognitive agents that\nuses a knowledge-based infrastructure supplemented by data obtained through\nmachine learning when applicable. These agents will serve as assistants to\nhumans who will bear ultimate responsibility for the decisions and actions of\nthe human-robot team. We illustrate the explanatory potential of such agents\nusing the under-the-hood panels of a demonstration system in which a team of\nsimulated robots collaborate on a search task assigned by a human.\n","authors":["Sergei Nirenburg","Marjorie McShane","Kenneth W. Goodman","Sanjay Oruganti"],"pdf_url":"https://arxiv.org/pdf/2409.18052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17994v2","updated":"2024-09-27T04:03:19Z","published":"2024-09-26T16:06:38Z","title":"CRoP: Context-wise Robust Static Human-Sensing Personalization","summary":" The advancement in deep learning and internet-of-things have led to diverse\nhuman sensing applications. However, distinct patterns in human sensing,\ninfluenced by various factors or contexts, challenge generic neural network\nmodel's performance due to natural distribution shifts. To address this,\npersonalization tailors models to individual users. Yet most personalization\nstudies overlook intra-user heterogeneity across contexts in sensory data,\nlimiting intra-user generalizability. This limitation is especially critical in\nclinical applications, where limited data availability hampers both\ngeneralizability and personalization. Notably, intra-user sensing attributes\nare expected to change due to external factors such as treatment progression,\nfurther complicating the challenges. This work introduces CRoP, a novel static\npersonalization approach using an off-the-shelf pre-trained model and pruning\nto optimize personalization and generalization. CRoP shows superior\npersonalization effectiveness and intra-user robustness across four\nhuman-sensing datasets, including two from real-world health domains,\nhighlighting its practical and social impact. Additionally, to support CRoP's\ngeneralization ability and design choices, we provide empirical justification\nthrough gradient inner product analysis, ablation studies, and comparisons\nagainst state-of-the-art baselines.\n","authors":["Sawinder Kaur","Avery Gump","Jingyu Xin","Yi Xiao","Harshit Sharma","Nina R Benway","Jonathan L Preston","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2409.17994v2.pdf","comment":"31 pages, 10 figues and 13 tables"},{"id":"http://arxiv.org/abs/2409.17763v2","updated":"2024-09-27T06:50:21Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50% of papers do not\nassess performance variability at all. Moreover, only one (0.5%) paper reported\nconfidence intervals (CIs) for model performance. (2) To address the reporting\nbottleneck, we show that the unreported standard deviation (SD) in segmentation\npapers can be approximated by a second-order polynomial function of the mean\nDice similarity coefficient (DSC). Based on external validation data from 56\nprevious MICCAI challenges, we demonstrate that this approximation can\naccurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v2.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17699v2","updated":"2024-09-27T10:16:37Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06802v3","updated":"2024-09-27T06:13:06Z","published":"2024-05-10T20:29:25Z","title":"Summarizing Radiology Reports Findings into Impressions","summary":" Patient hand-off and triage are two fundamental problems in health care.\nOften doctors must painstakingly summarize complex findings to efficiently\ncommunicate with specialists and quickly make decisions on which patients have\nthe most urgent cases. In pursuit of these challenges, we present (1) a model\nwith state-of-art radiology report summarization performance using (2) a novel\nmethod for augmenting medical data, and (3) an analysis of the model\nlimitations and radiology knowledge gain. We also provide a data processing\npipeline for future models developed on the the MIMIC CXR dataset. Our best\nperforming model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100\nROUGE-L F1, which outperformed specialized checkpoints with more sophisticated\nattention mechanisms. We investigate these aspects in this work.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v3.pdf","comment":"This version reverts to the original preprint, following the advice\n from the Artificial Intelligence in Health editorial office. The published\n version is peer-reviewed and available in the journal (see external DOI). The\n preprint remains unchanged to maintain version transparency, as noted in the\n further disclosure section of the published article"},{"id":"http://arxiv.org/abs/2409.17545v2","updated":"2024-09-27T06:48:08Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keep the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v2.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.18964v1","updated":"2024-09-27T17:59:57Z","published":"2024-09-27T17:59:57Z","title":"PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation","summary":" We present PhysGen, a novel image-to-video generation method that converts a\nsingle image and an input condition (e.g., force and torque applied to an\nobject in the image) to produce a realistic, physically plausible, and\ntemporally consistent video. Our key insight is to integrate model-based\nphysical simulation with a data-driven video generation process, enabling\nplausible image-space dynamics. At the heart of our system are three core\ncomponents: (i) an image understanding module that effectively captures the\ngeometry, materials, and physical parameters of the image; (ii) an image-space\ndynamics simulation model that utilizes rigid-body physics and inferred\nparameters to simulate realistic behaviors; and (iii) an image-based rendering\nand refinement module that leverages generative video diffusion to produce\nrealistic video footage featuring the simulated motion. The resulting videos\nare realistic in both physics and appearance and are even precisely\ncontrollable, showcasing superior results over existing data-driven\nimage-to-video generation works through quantitative comparison and\ncomprehensive user study. PhysGen's resulting videos can be used for various\ndownstream applications, such as turning an image into a realistic animation or\nallowing users to interact with the image and create various dynamics. Project\npage: https://stevenlsw.github.io/physgen/\n","authors":["Shaowei Liu","Zhongzheng Ren","Saurabh Gupta","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18964v1.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://stevenlsw.github.io/physgen/"},{"id":"http://arxiv.org/abs/2409.18962v1","updated":"2024-09-27T17:59:50Z","published":"2024-09-27T17:59:50Z","title":"Exploring Token Pruning in Vision State Space Models","summary":" State Space Models (SSMs) have the advantage of keeping linear computational\ncomplexity compared to attention modules in transformers, and have been applied\nto vision tasks as a new type of powerful vision foundation model. Inspired by\nthe observations that the final prediction in vision transformers (ViTs) is\nonly based on a subset of most informative tokens, we take the novel step of\nenhancing the efficiency of SSM-based vision models through token-based\npruning. However, direct applications of existing token pruning techniques\ndesigned for ViTs fail to deliver good performance, even with extensive\nfine-tuning. To address this issue, we revisit the unique computational\ncharacteristics of SSMs and discover that naive application disrupts the\nsequential token positions. This insight motivates us to design a novel and\ngeneral token pruning method specifically for SSM-based vision models. We first\nintroduce a pruning-aware hidden state alignment method to stabilize the\nneighborhood of remaining tokens for performance enhancement. Besides, based on\nour detailed analysis, we propose a token importance evaluation method adapted\nfor SSM models, to guide the token pruning. With efficient implementation and\npractical acceleration methods, our method brings actual speedup. Extensive\nexperiments demonstrate that our approach can achieve significant computation\nreduction with minimal impact on performance across different tasks. Notably,\nwe achieve 81.7\\% accuracy on ImageNet with a 41.6\\% reduction in the FLOPs for\npruned PlainMamba-L3. Furthermore, our work provides deeper insights into\nunderstanding the behavior of SSM-based vision models for future research.\n","authors":["Zheng Zhan","Zhenglun Kong","Yifan Gong","Yushu Wu","Zichong Meng","Hangyu Zheng","Xuan Shen","Stratis Ioannidis","Wei Niu","Pu Zhao","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18962v1.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2409.18961v1","updated":"2024-09-27T17:59:42Z","published":"2024-09-27T17:59:42Z","title":"ProMerge: Prompt and Merge for Unsupervised Instance Segmentation","summary":" Unsupervised instance segmentation aims to segment distinct object instances\nin an image without relying on human-labeled data. This field has recently seen\nsignificant advancements, partly due to the strong local correspondences\nafforded by rich visual feature representations from self-supervised models\n(e.g., DINO). Recent state-of-the-art approaches use self-supervised features\nto represent images as graphs and solve a generalized eigenvalue system (i.e.,\nnormalized-cut) to generate foreground masks. While effective, this strategy is\nlimited by its attendant computational demands, leading to slow inference\nspeeds. In this paper, we propose Prompt and Merge (ProMerge), which leverages\nself-supervised visual features to obtain initial groupings of patches and\napplies a strategic merging to these segments, aided by a sophisticated\nbackground-based mask pruning technique. ProMerge not only yields competitive\nresults but also offers a significant reduction in inference time compared to\nstate-of-the-art normalized-cut-based approaches. Furthermore, when training an\nobject detector using our mask predictions as pseudo-labels, the resulting\ndetector surpasses the current leading unsupervised model on various\nchallenging instance segmentation benchmarks.\n","authors":["Dylan Li","Gyungin Shin"],"pdf_url":"https://arxiv.org/pdf/2409.18961v1.pdf","comment":"ECCV2024 camera-ready"},{"id":"http://arxiv.org/abs/2409.18959v1","updated":"2024-09-27T17:59:10Z","published":"2024-09-27T17:59:10Z","title":"$O(d/T)$ Convergence Theory for Diffusion Probabilistic Models under\n Minimal Assumptions","summary":" Score-based diffusion models, which generate new data by learning to reverse\na diffusion process that perturbs data from the target distribution into noise,\nhave achieved remarkable success across various generative tasks. Despite their\nsuperior empirical performance, existing theoretical guarantees are often\nconstrained by stringent assumptions or suboptimal convergence rates. In this\npaper, we establish a fast convergence theory for a popular SDE-based sampler\nunder minimal assumptions. Our analysis shows that, provided\n$\\ell_{2}$-accurate estimates of the score functions, the total variation\ndistance between the target and generated distributions is upper bounded by\n$O(d/T)$ (ignoring logarithmic factors), where $d$ is the data dimensionality\nand $T$ is the number of steps. This result holds for any target distribution\nwith finite first-order moment. To our knowledge, this improves upon existing\nconvergence theory for both the SDE-based sampler and another ODE-based\nsampler, while imposing minimal assumptions on the target data distribution and\nscore estimates. This is achieved through a novel set of analytical tools that\nprovides a fine-grained characterization of how the error propagates at each\nstep of the reverse process.\n","authors":["Gen Li","Yuling Yan"],"pdf_url":"https://arxiv.org/pdf/2409.18959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18957v1","updated":"2024-09-27T17:58:50Z","published":"2024-09-27T17:58:50Z","title":"LML: Language Model Learning a Dataset for Data-Augmented Prediction","summary":" This paper introduces a new approach to using Large Language Models (LLMs)\nfor classification tasks, which are typically handled using Machine Learning\n(ML) models. Unlike ML models that rely heavily on data cleaning and feature\nengineering, this method streamlines the process using LLMs. This paper\nproposes a new concept called \"Language Model Learning (LML)\" powered by a new\nmethod called \"Data-Augmented Prediction (DAP)\". The classification is\nperformed by LLMs using a method similar to humans manually exploring and\nunderstanding the data and deciding classifications using data as a reference.\nTraining data is summarized and evaluated to determine the features that lead\nto the classification of each label the most. In the process of DAP, the system\nuses the data summary to automatically create a query, which is used to\nretrieve relevant rows from the dataset. A classification is generated by the\nLLM using data summary and relevant rows, ensuring satisfactory accuracy even\nwith complex data. Usage of data summary and similar data in DAP ensures\ncontext-aware decision-making. The proposed method uses the words \"Act as an\nExplainable Machine Learning Model\" in the prompt to enhance the\ninterpretability of the predictions by allowing users to review the logic\nbehind each prediction. In some test cases, the system scored an accuracy above\n90%, proving the effectiveness of the system and its potential to outperform\nconventional ML models in various scenarios. The code is available at\nhttps://github.com/Pro-GenAI/LML-DAP\n","authors":["Praneeth Vadlapati"],"pdf_url":"https://arxiv.org/pdf/2409.18957v1.pdf","comment":"First version"},{"id":"http://arxiv.org/abs/2409.18946v1","updated":"2024-09-27T17:46:05Z","published":"2024-09-27T17:46:05Z","title":"Unconditional stability of a recurrent neural circuit implementing\n divisive normalization","summary":" Stability in recurrent neural models poses a significant challenge,\nparticularly in developing biologically plausible neurodynamical models that\ncan be seamlessly trained. Traditional cortical circuit models are notoriously\ndifficult to train due to expansive nonlinearities in the dynamical system,\nleading to an optimization problem with nonlinear stability constraints that\nare difficult to impose. Conversely, recurrent neural networks (RNNs) excel in\ntasks involving sequential data but lack biological plausibility and\ninterpretability. In this work, we address these challenges by linking dynamic\ndivisive normalization (DN) to the stability of ORGaNICs, a biologically\nplausible recurrent cortical circuit model that dynamically achieves DN and has\nbeen shown to simulate a wide range of neurophysiological phenomena. By using\nthe indirect method of Lyapunov, we prove the remarkable property of\nunconditional local stability for an arbitrary-dimensional ORGaNICs circuit\nwhen the recurrent weight matrix is the identity. We thus connect ORGaNICs to a\nsystem of coupled damped harmonic oscillators, which enables us to derive the\ncircuit's energy function, providing a normative principle of what the circuit,\nand individual neurons, aim to accomplish. Further, for a generic recurrent\nweight matrix, we prove the stability of the 2D model and demonstrate\nempirically that stability holds in higher dimensions. Finally, we show that\nORGaNICs can be trained by backpropagation through time without gradient\nclipping/scaling, thanks to its intrinsic stability property and adaptive time\nconstants, which address the problems of exploding, vanishing, and oscillating\ngradients. By evaluating the model's performance on RNN benchmarks, we find\nthat ORGaNICs outperform alternative neurodynamical models on static image\nclassification tasks and perform comparably to LSTMs on sequential tasks.\n","authors":["Shivang Rawat","David J. Heeger","Stefano Martiniani"],"pdf_url":"https://arxiv.org/pdf/2409.18946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18941v1","updated":"2024-09-27T17:41:18Z","published":"2024-09-27T17:41:18Z","title":"Building Trust Through Voice: How Vocal Tone Impacts User Perception of\n Attractiveness of Voice Assistants","summary":" Voice Assistants (VAs) are popular for simple tasks, but users are often\nhesitant to use them for complex activities like online shopping. We explored\nwhether the vocal characteristics like the VA's vocal tone, can make VAs\nperceived as more attractive and trustworthy to users for complex tasks. Our\nfindings show that the tone of the VA voice significantly impacts its perceived\nattractiveness and trustworthiness. Participants in our experiment were more\nlikely to be attracted to VAs with positive or neutral tones and ultimately\ntrusted the VAs they found more attractive. We conclude that VA's perceived\ntrustworthiness can be enhanced through thoughtful voice design, incorporating\na variety of vocal tones.\n","authors":["Sabid Bin Habib Pias","Alicia Freel","Ran Huang","Donald Williamson","Minjeong Kim","Apu Kapadia"],"pdf_url":"https://arxiv.org/pdf/2409.18941v1.pdf","comment":"Extended Abstract"},{"id":"http://arxiv.org/abs/2409.18938v1","updated":"2024-09-27T17:38:36Z","published":"2024-09-27T17:38:36Z","title":"From Seconds to Hours: Reviewing MultiModal Large Language Models on\n Comprehensive Long Video Understanding","summary":" The integration of Large Language Models (LLMs) with visual encoders has\nrecently shown promising performance in visual understanding tasks, leveraging\ntheir inherent capability to comprehend and generate human-like text for visual\nreasoning. Given the diverse nature of visual data, MultiModal Large Language\nModels (MM-LLMs) exhibit variations in model designing and training for\nunderstanding images, short videos, and long videos. Our paper focuses on the\nsubstantial differences and unique challenges posed by long video understanding\ncompared to static image and short video understanding. Unlike static images,\nshort videos encompass sequential frames with both spatial and within-event\ntemporal information, while long videos consist of multiple events with\nbetween-event and long-term temporal information. In this survey, we aim to\ntrace and summarize the advancements of MM-LLMs from image understanding to\nlong video understanding. We review the differences among various visual\nunderstanding tasks and highlight the challenges in long video understanding,\nincluding more fine-grained spatiotemporal details, dynamic events, and\nlong-term dependencies. We then provide a detailed summary of the advancements\nin MM-LLMs in terms of model design and training methodologies for\nunderstanding long videos. Finally, we compare the performance of existing\nMM-LLMs on video understanding benchmarks of various lengths and discuss\npotential future directions for MM-LLMs in long video understanding.\n","authors":["Heqing Zou","Tianze Luo","Guiyang Xie"," Victor"," Zhang","Fengmao Lv","Guangcong Wang","Juanyang Chen","Zhuochen Wang","Hansheng Zhang","Huaijian Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18938v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.18924v1","updated":"2024-09-27T17:17:15Z","published":"2024-09-27T17:17:15Z","title":"AIPatient: Simulating Patients with EHRs and LLM Powered Agentic\n Workflow","summary":" Simulated patient systems play a crucial role in modern medical education and\nresearch, providing safe, integrative learning environments and enabling\nclinical decision-making simulations. Large Language Models (LLM) could advance\nsimulated patient systems by replicating medical conditions and patient-doctor\ninteractions with high fidelity and low cost. However, ensuring the\neffectiveness and trustworthiness of these systems remains a challenge, as they\nrequire a large, diverse, and precise patient knowledgebase, along with a\nrobust and stable knowledge diffusion to users. Here, we developed AIPatient,\nan advanced simulated patient system with AIPatient Knowledge Graph (AIPatient\nKG) as the input and the Reasoning Retrieval-Augmented Generation (Reasoning\nRAG) agentic workflow as the generation backbone. AIPatient KG samples data\nfrom Electronic Health Records (EHRs) in the Medical Information Mart for\nIntensive Care (MIMIC)-III database, producing a clinically diverse and\nrelevant cohort of 1,495 patients with high knowledgebase validity (F1 0.89).\nReasoning RAG leverages six LLM powered agents spanning tasks including\nretrieval, KG query generation, abstraction, checker, rewrite, and\nsummarization. This agentic framework reaches an overall accuracy of 94.15% in\nEHR-based medical Question Answering (QA), outperforming benchmarks that use\neither no agent or only partial agent integration. Our system also presents\nhigh readability (median Flesch Reading Ease 77.23; median Flesch Kincaid Grade\n5.6), robustness (ANOVA F-value 0.6126, p<0.1), and stability (ANOVA F-value\n0.782, p<0.1). The promising performance of the AIPatient system highlights its\npotential to support a wide range of applications, including medical education,\nmodel evaluation, and system integration.\n","authors":["Huizi Yu","Jiayan Zhou","Lingyao Li","Shan Chen","Jack Gallifant","Anye Shi","Xiang Li","Wenyue Hua","Mingyu Jin","Guang Chen","Yang Zhou","Zhao Li","Trisha Gupte","Ming-Li Chen","Zahra Azizi","Yongfeng Zhang","Themistocles L. Assimes","Xin Ma","Danielle S. Bitterman","Lin Lu","Lizhou Fan"],"pdf_url":"https://arxiv.org/pdf/2409.18924v1.pdf","comment":"42 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2403.16877v2","updated":"2024-09-27T17:14:26Z","published":"2024-03-25T15:42:09Z","title":"Proprioception Is All You Need: Terrain Classification for Boreal\n Forests","summary":" Recent works in field robotics highlighted the importance of resiliency\nagainst different types of terrains. Boreal forests, in particular, are home to\nmany mobility-impeding terrains that should be considered for off-road\nautonomous navigation. Also, being one of the largest land biomes on Earth,\nboreal forests are an area where autonomous vehicles are expected to become\nincreasingly common. In this paper, we address this issue by introducing\nBorealTC, a publicly available dataset for proprioceptive-based terrain\nclassification (TC). Recorded with a Husky A200, our dataset contains 116 min\nof Inertial Measurement Unit (IMU), motor current, and wheel odometry data,\nfocusing on typical boreal forest terrains, notably snow, ice, and silty loam.\nCombining our dataset with another dataset from the state-of-the-art, we\nevaluate both a Convolutional Neural Network (CNN) and the novel state space\nmodel (SSM)-based Mamba architecture on a TC task. Interestingly, we show that\nwhile CNN outperforms Mamba on each separate dataset, Mamba achieves greater\naccuracy when trained on a combination of both. In addition, we demonstrate\nthat Mamba's learning capacity is greater than a CNN for increasing amounts of\ndata. We show that the combination of two TC datasets yields a latent space\nthat can be interpreted with the properties of the terrains. We also discuss\nthe implications of merging datasets on classification. Our source code and\ndataset are publicly available online:\nhttps://github.com/norlab-ulaval/BorealTC.\n","authors":["Damien LaRocque","William Guimont-Martin","David-Alexandre Duclos","Philippe Giguère","François Pomerleau"],"pdf_url":"https://arxiv.org/pdf/2403.16877v2.pdf","comment":"Accepted to the 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2409.18911v1","updated":"2024-09-27T16:54:36Z","published":"2024-09-27T16:54:36Z","title":"Soft Measures for Extracting Causal Collective Intelligence","summary":" Understanding and modeling collective intelligence is essential for\naddressing complex social systems. Directed graphs called fuzzy cognitive maps\n(FCMs) offer a powerful tool for encoding causal mental models, but extracting\nhigh-integrity FCMs from text is challenging. This study presents an approach\nusing large language models (LLMs) to automate FCM extraction. We introduce\nnovel graph-based similarity measures and evaluate them by correlating their\noutputs with human judgments through the Elo rating system. Results show\npositive correlations with human evaluations, but even the best-performing\nmeasure exhibits limitations in capturing FCM nuances. Fine-tuning LLMs\nimproves performance, but existing measures still fall short. This study\nhighlights the need for soft similarity measures tailored to FCM extraction,\nadvancing collective intelligence modeling with NLP.\n","authors":["Maryam Berijanian","Spencer Dork","Kuldeep Singh","Michael Riley Millikan","Ashlin Riggs","Aadarsh Swaminathan","Sarah L. Gibbs","Scott E. Friedman","Nathan Brugnone"],"pdf_url":"https://arxiv.org/pdf/2409.18911v1.pdf","comment":"Camera-ready version accepted for publication in the EMNLP 2024\n Workshop NLP4Science"},{"id":"http://arxiv.org/abs/2409.18901v1","updated":"2024-09-27T16:39:50Z","published":"2024-09-27T16:39:50Z","title":"Improving Visual Object Tracking through Visual Prompting","summary":" Learning a discriminative model to distinguish a target from its surrounding\ndistractors is essential to generic visual object tracking. Dynamic target\nrepresentation adaptation against distractors is challenging due to the limited\ndiscriminative capabilities of prevailing trackers. We present a new visual\nPrompting mechanism for generic Visual Object Tracking (PiVOT) to address this\nissue. PiVOT proposes a prompt generation network with the pre-trained\nfoundation model CLIP to automatically generate and refine visual prompts,\nenabling the transfer of foundation model knowledge for tracking. While CLIP\noffers broad category-level knowledge, the tracker, trained on\ninstance-specific data, excels at recognizing unique object instances. Thus,\nPiVOT first compiles a visual prompt highlighting potential target locations.\nTo transfer the knowledge of CLIP to the tracker, PiVOT leverages CLIP to\nrefine the visual prompt based on the similarities between candidate objects\nand the reference templates across potential targets. Once the visual prompt is\nrefined, it can better highlight potential target locations, thereby reducing\nirrelevant prompt information. With the proposed prompting mechanism, the\ntracker can generate improved instance-aware feature maps through the guidance\nof the visual prompt, thus effectively reducing distractors. The proposed\nmethod does not involve CLIP during training, thereby keeping the same training\ncomplexity and preserving the generalization capability of the pretrained\nfoundation model. Extensive experiments across multiple benchmarks indicate\nthat PiVOT, using the proposed prompting method can suppress distracting\nobjects and enhance the tracker.\n","authors":["Shih-Fang Chen","Jun-Cheng Chen","I-Hong Jhuo","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2409.18901v1.pdf","comment":"Accepted and to appear in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2409.18895v1","updated":"2024-09-27T16:32:57Z","published":"2024-09-27T16:32:57Z","title":"Multi-Source Hard and Soft Information Fusion Approach for Accurate\n Cryptocurrency Price Movement Prediction","summary":" One of the most important challenges in the financial and cryptocurrency\nfield is accurately predicting cryptocurrency price trends. Leveraging\nartificial intelligence (AI) is beneficial in addressing this challenge.\nCryptocurrency markets, marked by substantial growth and volatility, attract\ninvestors and scholars keen on deciphering and forecasting cryptocurrency price\nmovements. The vast and diverse array of data available for such predictions\nincreases the complexity of the task. In our study, we introduce a novel\napproach termed hard and soft information fusion (HSIF) to enhance the accuracy\nof cryptocurrency price movement forecasts. The hard information component of\nour approach encompasses historical price records alongside technical\nindicators. Complementing this, the soft data component extracts from X\n(formerly Twitter), encompassing news headlines and tweets about the\ncryptocurrency. To use this data, we use the Bidirectional Encoder\nRepresentations from Transformers (BERT)-based sentiment analysis method,\nfinancial BERT (FinBERT), which performs best. Finally, our model feeds on the\ninformation set including processed hard and soft data. We employ the\nbidirectional long short-term memory (BiLSTM) model because processing\ninformation in both forward and backward directions can capture long-term\ndependencies in sequential information. Our empirical findings emphasize the\nsuperiority of the HSIF approach over models dependent on single-source data by\ntesting on Bitcoin-related data. By fusing hard and soft information on Bitcoin\ndataset, our model has about 96.8\\% accuracy in predicting price movement.\nIncorporating information enables our model to grasp the influence of social\nsentiment on price fluctuations, thereby supplementing the technical\nanalysis-based predictions derived from hard information.\n","authors":["Saeed Mohammadi Dashtaki","Mehdi Hosseini Chagahi","Behzad Moshiri","Md. Jalil Piran"],"pdf_url":"https://arxiv.org/pdf/2409.18895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15657v3","updated":"2024-09-27T16:24:50Z","published":"2024-09-24T01:40:24Z","title":"M$^2$PT: Multimodal Prompt Tuning for Zero-shot Instruction Learning","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable performance\nacross a wide range of domains, with increasing emphasis on enhancing their\nzero-shot generalization capabilities for unseen tasks across various\nmodalities. Instruction tuning has emerged as an effective strategy for\nachieving zero-shot generalization by finetuning pretrained models on diverse\nmultimodal tasks. As the scale of MLLMs continues to grow, parameter-efficient\nfinetuning becomes increasingly critical. However, most existing\nparameter-efficient approaches focus only on single modalities and often\noverlook the multimodal characteristics during finetuning. In this work, we\nintroduce a novel Multimodal Prompt Tuning (M$^2$PT) approach for efficient\ninstruction tuning of MLLMs. M$^2$PT effectively integrates visual and textual\nprompts into the vision encoder and language processor respectively during\nfinetuning, facilitating the extraction and alignment of features across\nmodalities. Empirical results on various multimodal evaluation datasets\ndemonstrate the superior performance of our approach compared to several\nstate-of-the-art baselines. A comprehensive set of ablation studies validates\nthe effectiveness of our prompt design and the efficiency of our approach.\n","authors":["Taowen Wang","Yiyang Liu","James Chenhao Liang","junhan zhao","Yiming Cui","Yuning Mao","Shaoliang Nie","Jiahao Liu","Fuli Feng","Zenglin Xu","Cheng Han","Lifu Huang","Qifan Wang","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2409.15657v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18878v1","updated":"2024-09-27T16:13:38Z","published":"2024-09-27T16:13:38Z","title":"Suicide Phenotyping from Clinical Notes in Safety-Net Psychiatric\n Hospital Using Multi-Label Classification with Pre-Trained Language Models","summary":" Accurate identification and categorization of suicidal events can yield\nbetter suicide precautions, reducing operational burden, and improving care\nquality in high-acuity psychiatric settings. Pre-trained language models offer\npromise for identifying suicidality from unstructured clinical narratives. We\nevaluated the performance of four BERT-based models using two fine-tuning\nstrategies (multiple single-label and single multi-label) for detecting\ncoexisting suicidal events from 500 annotated psychiatric evaluation notes. The\nnotes were labeled for suicidal ideation (SI), suicide attempts (SA), exposure\nto suicide (ES), and non-suicidal self-injury (NSSI). RoBERTa outperformed\nother models using binary relevance (acc=0.86, F1=0.78). MentalBERT (F1=0.74)\nalso exceeded BioClinicalBERT (F1=0.72). RoBERTa fine-tuned with a single\nmulti-label classifier further improved performance (acc=0.88, F1=0.81),\nhighlighting that models pre-trained on domain-relevant data and the single\nmulti-label classification strategy enhance efficiency and performance.\n Keywords: EHR-based Phynotyping; Natural Language Processing; Secondary Use\nof EHR Data; Suicide Classification; BERT-based Model; Psychiatry; Mental\nHealth\n","authors":["Zehan Li","Yan Hu","Scott Lane","Salih Selek","Lokesh Shahani","Rodrigo Machado-Vieira","Jair Soares","Hua Xu","Hongfang Liu","Ming Huang"],"pdf_url":"https://arxiv.org/pdf/2409.18878v1.pdf","comment":"submitted to AMIA Informatics Summit 2025 as a conference paper"},{"id":"http://arxiv.org/abs/2409.18877v1","updated":"2024-09-27T16:12:51Z","published":"2024-09-27T16:12:51Z","title":"UniEmoX: Cross-modal Semantic-Guided Large-Scale Pretraining for\n Universal Scene Emotion Perception","summary":" Visual emotion analysis holds significant research value in both computer\nvision and psychology. However, existing methods for visual emotion analysis\nsuffer from limited generalizability due to the ambiguity of emotion perception\nand the diversity of data scenarios. To tackle this issue, we introduce\nUniEmoX, a cross-modal semantic-guided large-scale pretraining framework.\nInspired by psychological research emphasizing the inseparability of the\nemotional exploration process from the interaction between individuals and\ntheir environment, UniEmoX integrates scene-centric and person-centric\nlow-level image spatial structural information, aiming to derive more nuanced\nand discriminative emotional representations. By exploiting the similarity\nbetween paired and unpaired image-text samples, UniEmoX distills rich semantic\nknowledge from the CLIP model to enhance emotional embedding representations\nmore effectively. To the best of our knowledge, this is the first large-scale\npretraining framework that integrates psychological theories with contemporary\ncontrastive learning and masked image modeling techniques for emotion analysis\nacross diverse scenarios. Additionally, we develop a visual emotional dataset\ntitled Emo8. Emo8 samples cover a range of domains, including cartoon, natural,\nrealistic, science fiction and advertising cover styles, covering nearly all\ncommon emotional scenes. Comprehensive experiments conducted on six benchmark\ndatasets across two downstream tasks validate the effectiveness of UniEmoX. The\nsource code is available at https://github.com/chincharles/u-emo.\n","authors":["Chuang Chen","Xiao Sun","Zhi Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18877v1.pdf","comment":"Submitted to TIP"},{"id":"http://arxiv.org/abs/2409.18874v1","updated":"2024-09-27T16:10:11Z","published":"2024-09-27T16:10:11Z","title":"CESNET-TimeSeries24: Time Series Dataset for Network Traffic Anomaly\n Detection and Forecasting","summary":" Anomaly detection in network traffic is crucial for maintaining the security\nof computer networks and identifying malicious activities. One of the primary\napproaches to anomaly detection are methods based on forecasting. Nevertheless,\nextensive real-world network datasets for forecasting and anomaly detection\ntechniques are missing, potentially causing performance overestimation of\nanomaly detection algorithms. This manuscript addresses this gap by introducing\na dataset comprising time series data of network entities' behavior, collected\nfrom the CESNET3 network. The dataset was created from 40 weeks of network\ntraffic of 275 thousand active IP addresses. The ISP origin of the presented\ndata ensures a high level of variability among network entities, which forms a\nunique and authentic challenge for forecasting and anomaly detection models. It\nprovides valuable insights into the practical deployment of forecast-based\nanomaly detection approaches.\n","authors":["Josef Koumar","Karel Hynek","Tomáš Čejka","Pavel Šiška"],"pdf_url":"https://arxiv.org/pdf/2409.18874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08276v3","updated":"2024-09-27T16:09:13Z","published":"2024-09-12T17:59:44Z","title":"AnySkin: Plug-and-play Skin Sensing for Robotic Touch","summary":" While tactile sensing is widely accepted as an important and useful sensing\nmodality, its use pales in comparison to other sensory modalities like vision\nand proprioception. AnySkin addresses the critical challenges that impede the\nuse of tactile sensing -- versatility, replaceability, and data reusability.\nBuilding on the simplistic design of ReSkin, and decoupling the sensing\nelectronics from the sensing interface, AnySkin simplifies integration making\nit as straightforward as putting on a phone case and connecting a charger.\nFurthermore, AnySkin is the first uncalibrated tactile-sensor with\ncross-instance generalizability of learned manipulation policies. To summarize,\nthis work makes three key contributions: first, we introduce a streamlined\nfabrication process and a design tool for creating an adhesive-free, durable\nand easily replaceable magnetic tactile sensor; second, we characterize slip\ndetection and policy learning with the AnySkin sensor; and third, we\ndemonstrate zero-shot generalization of models trained on one instance of\nAnySkin to new instances, and compare it with popular existing tactile\nsolutions like DIGIT and ReSkin. Videos of experiments, fabrication details and\ndesign files can be found on https://any-skin.github.io/\n","authors":["Raunaq Bhirangi","Venkatesh Pattabiraman","Enes Erciyes","Yifeng Cao","Tess Hellebrekers","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2409.08276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03291v2","updated":"2024-09-27T16:04:40Z","published":"2024-09-05T06:55:13Z","title":"LLM Detectors Still Fall Short of Real World: Case of LLM-Generated\n Short News-Like Posts","summary":" With the emergence of widely available powerful LLMs, disinformation\ngenerated by large Language Models (LLMs) has become a major concern.\nHistorically, LLM detectors have been touted as a solution, but their\neffectiveness in the real world is still to be proven. In this paper, we focus\non an important setting in information operations -- short news-like posts\ngenerated by moderately sophisticated attackers.\n We demonstrate that existing LLM detectors, whether zero-shot or\npurpose-trained, are not ready for real-world use in that setting. All tested\nzero-shot detectors perform inconsistently with prior benchmarks and are highly\nvulnerable to sampling temperature increase, a trivial attack absent from\nrecent benchmarks. A purpose-trained detector generalizing across LLMs and\nunseen attacks can be developed, but it fails to generalize to new\nhuman-written texts.\n We argue that the former indicates domain-specific benchmarking is needed,\nwhile the latter suggests a trade-off between the adversarial evasion\nresilience and overfitting to the reference human text, with both needing\nevaluation in benchmarks and currently absent. We believe this suggests a\nre-consideration of current LLM detector benchmarking approaches and provides a\ndynamically extensible benchmark to allow it\n(https://github.com/Reliable-Information-Lab-HEVS/benchmark_llm_texts_detection).\n","authors":["Henrique Da Silva Gameiro","Andrei Kucharavy","Ljiljana Dolamic"],"pdf_url":"https://arxiv.org/pdf/2409.03291v2.pdf","comment":"20 pages, 7 tables, 13 figures, under consideration for EMNLP"},{"id":"http://arxiv.org/abs/2409.18868v1","updated":"2024-09-27T16:04:06Z","published":"2024-09-27T16:04:06Z","title":"Individuation in Neural Models with and without Visual Grounding","summary":" We show differences between a language-and-vision model CLIP and two\ntext-only models - FastText and SBERT - when it comes to the encoding of\nindividuation information. We study latent representations that CLIP provides\nfor substrates, granular aggregates, and various numbers of objects. We\ndemonstrate that CLIP embeddings capture quantitative differences in\nindividuation better than models trained on text-only data. Moreover, the\nindividuation hierarchy we deduce from the CLIP embeddings agrees with the\nhierarchies proposed in linguistics and cognitive science.\n","authors":["Alexey Tikhonov","Lisa Bylinina","Ivan P. Yamshchikov"],"pdf_url":"https://arxiv.org/pdf/2409.18868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18865v1","updated":"2024-09-27T16:02:12Z","published":"2024-09-27T16:02:12Z","title":"Positional Encoder Graph Quantile Neural Networks for Geographic Data","summary":" Positional Encoder Graph Neural Networks (PE-GNNs) are a leading approach for\nmodeling continuous spatial data. However, they often fail to produce\ncalibrated predictive distributions, limiting their effectiveness for\nuncertainty quantification. We introduce the Positional Encoder Graph Quantile\nNeural Network (PE-GQNN), a novel method that integrates PE-GNNs, Quantile\nNeural Networks, and recalibration techniques in a fully nonparametric\nframework, requiring minimal assumptions about the predictive distributions. We\npropose a new network architecture that, when combined with a quantile-based\nloss function, yields accurate and reliable probabilistic models without\nincreasing computational complexity. Our approach provides a flexible, robust\nframework for conditional density estimation, applicable beyond spatial data\ncontexts. We further introduce a structured method for incorporating a KNN\npredictor into the model while avoiding data leakage through the GNN layer\noperation. Experiments on benchmark datasets demonstrate that PE-GQNN\nsignificantly outperforms existing state-of-the-art methods in both predictive\naccuracy and uncertainty quantification.\n","authors":["William E. R. de Amorim","Scott A. Sisson","T. Rodrigues","David J. Nott","Guilherme S. Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2409.18865v1.pdf","comment":"17 main text pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.18857v1","updated":"2024-09-27T15:53:54Z","published":"2024-09-27T15:53:54Z","title":"Mitigating Selection Bias with Node Pruning and Auxiliary Options","summary":" Large language models (LLMs) often show unwarranted preference for certain\nchoice options when responding to multiple-choice questions, posing significant\nreliability concerns in LLM-automated systems. To mitigate this selection bias\nproblem, previous solutions utilized debiasing methods to adjust the model's\ninput and/or output. Our work, in contrast, investigates the model's internal\nrepresentation of the selection bias. Specifically, we introduce a novel\ndebiasing approach, Bias Node Pruning (BNP), which eliminates the linear layer\nparameters that contribute to the bias. Furthermore, we present Auxiliary\nOption Injection (AOI), a simple yet effective input modification technique for\ndebiasing, which is compatible even with black-box LLMs. To provide a more\nsystematic evaluation of selection bias, we review existing metrics and\nintroduce Choice Kullback-Leibler Divergence (CKLD), which addresses the\ninsensitivity of the commonly used metrics to label imbalance. Experiments show\nthat our methods are robust and adaptable across various datasets when applied\nto three LLMs.\n","authors":["Hyeong Kyu Choi","Weijie Xu","Chi Xue","Stephanie Eckman","Chandan K. Reddy"],"pdf_url":"https://arxiv.org/pdf/2409.18857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18828v1","updated":"2024-09-27T15:22:44Z","published":"2024-09-27T15:22:44Z","title":"MECG-E: Mamba-based ECG Enhancer for Baseline Wander Removal","summary":" Electrocardiogram (ECG) is an important non-invasive method for diagnosing\ncardiovascular disease. However, ECG signals are susceptible to noise\ncontamination, such as electrical interference or signal wandering, which\nreduces diagnostic accuracy. Various ECG denoising methods have been proposed,\nbut most existing methods yield suboptimal performance under very noisy\nconditions or require several steps during inference, leading to latency during\nonline processing. In this paper, we propose a novel ECG denoising model,\nnamely Mamba-based ECG Enhancer (MECG-E), which leverages the Mamba\narchitecture known for its fast inference and outstanding nonlinear mapping\ncapabilities. Experimental results indicate that MECG-E surpasses several\nwell-known existing models across multiple metrics under different noise\nconditions. Additionally, MECG-E requires less inference time than\nstate-of-the-art diffusion-based ECG denoisers, demonstrating the model's\nfunctionality and efficiency.\n","authors":["Kuo-Hsuan Hung","Kuan-Chen Wang","Kai-Chun Liu","Wei-Lun Chen","Xugang Lu","Yu Tsao","Chii-Wann Lin"],"pdf_url":"https://arxiv.org/pdf/2409.18828v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18814v1","updated":"2024-09-27T15:07:26Z","published":"2024-09-27T15:07:26Z","title":"Early diagnosis of Alzheimer's disease from MRI images with deep\n learning model","summary":" It is acknowledged that the most common cause of dementia worldwide is\nAlzheimer's disease (AD). This condition progresses in severity from mild to\nsevere and interferes with people's everyday routines. Early diagnosis plays a\ncritical role in patient care and clinical trials. Convolutional neural\nnetworks (CNN) are used to create a framework for identifying specific disease\nfeatures from MRI scans Classification of dementia involves approaches such as\nmedical history review, neuropsychological tests, and magnetic resonance\nimaging (MRI). However, the image dataset obtained from Kaggle faces a\nsignificant issue of class imbalance, which requires equal distribution of\nsamples from each class to address. In this article, to address this imbalance,\nthe Synthetic Minority Oversampling Technique (SMOTE) is utilized. Furthermore,\na pre-trained convolutional neural network has been applied to the DEMNET\ndementia network to extract key features from AD images. The proposed model\nachieved an impressive accuracy of 98.67%.\n","authors":["Sajjad Aghasi Javid","Mahmood Mohassel Feghhi"],"pdf_url":"https://arxiv.org/pdf/2409.18814v1.pdf","comment":"7 pages, 3 figures, Presented at the 20-th CSI International\n Symposium on Artificial Intelligence and Signal Processing (AISP) 21-22\n February, 2024, Mazandaran University of Science and Technology, Babol, Iran"},{"id":"http://arxiv.org/abs/2409.18812v1","updated":"2024-09-27T15:04:39Z","published":"2024-09-27T15:04:39Z","title":"LLMs4Synthesis: Leveraging Large Language Models for Scientific\n Synthesis","summary":" In response to the growing complexity and volume of scientific literature,\nthis paper introduces the LLMs4Synthesis framework, designed to enhance the\ncapabilities of Large Language Models (LLMs) in generating high-quality\nscientific syntheses. This framework addresses the need for rapid, coherent,\nand contextually rich integration of scientific insights, leveraging both\nopen-source and proprietary LLMs. It also examines the effectiveness of LLMs in\nevaluating the integrity and reliability of these syntheses, alleviating\ninadequacies in current quantitative metrics. Our study contributes to this\nfield by developing a novel methodology for processing scientific papers,\ndefining new synthesis types, and establishing nine detailed quality criteria\nfor evaluating syntheses. The integration of LLMs with reinforcement learning\nand AI feedback is proposed to optimize synthesis quality, ensuring alignment\nwith established criteria. The LLMs4Synthesis framework and its components are\nmade available, promising to enhance both the generation and evaluation\nprocesses in scientific research synthesis.\n","authors":["Hamed Babaei Giglou","Jennifer D'Souza","Sören Auer"],"pdf_url":"https://arxiv.org/pdf/2409.18812v1.pdf","comment":"12 pages, 3 figures, Accepted to JCDL 2024 Research Track"},{"id":"http://arxiv.org/abs/2409.18798v1","updated":"2024-09-27T14:53:04Z","published":"2024-09-27T14:53:04Z","title":"Esports Debut as a Medal Event at 2023 Asian Games: Exploring Public\n Perceptions with BERTopic and GPT-4 Topic Fine-Tuning","summary":" This study examined the public opinions of esports at the 2023 Asian Games\nand value co-creation during the event using an LLM-enhanced BERTopic modeling\nanalysis. We identified five major themes representing public perceptions, as\nwell as how major stakeholders co-created value within and beyond the esports\necosystem. Key findings highlighted the strategic use of social media marketing\nto influence public opinion and promote esports events and brands, emphasizing\nthe importance of event logistics and infrastructure. Additionally, the study\nrevealed the co-creation value contributed by stakeholders outside the\ntraditional esports ecosystem, particularly in promoting national\nrepresentation and performance. Our findings supported the ongoing efforts to\nlegitimize esports as a sport, noting that mainstream recognition remains a\nchallenge. The inclusion of esports as a medal event showcased broader\nacceptance and helped mitigate negative public perceptions. Moreover,\ncontributions from non-traditional stakeholders underscored the value of\ncross-subcultural collaborations in esports.\n","authors":["Tyreal Yizhou Qian","Bo Yu","Weizhe Li","Chenglong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18796v1","updated":"2024-09-27T14:50:36Z","published":"2024-09-27T14:50:36Z","title":"Hierarchical Federated ADMM","summary":" In this paper, we depart from the widely-used gradient descent-based\nhierarchical federated learning (FL) algorithms to develop a novel hierarchical\nFL framework based on the alternating direction method of multipliers (ADMM).\nWithin this framework, we propose two novel FL algorithms, which both use ADMM\nin the top layer: one that employs ADMM in the lower layer and another that\nuses the conventional gradient descent-based approach. The proposed framework\nenhances privacy, and experiments demonstrate the superiority of the proposed\nalgorithms compared to the conventional algorithms in terms of learning\nconvergence and accuracy. Additionally, gradient descent on the lower layer\nperforms well even if the number of local steps is very limited, while ADMM on\nboth layers lead to better performance otherwise.\n","authors":["Seyed Mohammad Azimi-Abarghouyi","Nicola Bastianello","Karl H. Johansson","Viktoria Fodor"],"pdf_url":"https://arxiv.org/pdf/2409.18796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12393v2","updated":"2024-09-27T14:40:11Z","published":"2024-01-22T22:50:59Z","title":"A Learning-based Declarative Privacy-Preserving Framework for Federated\n Data Management","summary":" It is challenging to select the right privacy-preserving mechanism for\nfederated query processing over multiple private data silos. There exist\nnumerous privacy-preserving mechanisms, such as secure multi-party computing\n(SMC), approximate query processing with differential privacy (DP), combined\nSMC and DP, DP-based data obfuscation, and federated learning. These mechanisms\nmake different trade-offs among accuracy, privacy, execution efficiency, and\nstorage efficiency. In this work, we first introduce a new privacy-preserving\ntechnique that uses a deep learning model trained using the\nDifferentially-Private Stochastic Gradient Descent (DP-SGD) algorithm to\nreplace portions of actual data to answer a query. We then demonstrate a novel\ndeclarative privacy-preserving workflow that allows users to specify \"what\nprivate information to protect\" rather than \"how to protect\". Under the hood,\nthe system relies on a cost model to automatically choose privacy-preserving\nmechanisms as well as hyper-parameters. At the same time, the proposed workflow\nalso allows human experts to review and tune the selected privacy-preserving\nmechanism for audit/compliance, and optimization purposes.\n","authors":["Hong Guan","Summer Gautier","Rajan Hari Ambrish","Yancheng Wang","Chaowei Xiao","Yingzhen Yang","Jia Zou"],"pdf_url":"https://arxiv.org/pdf/2401.12393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18786v1","updated":"2024-09-27T14:34:54Z","published":"2024-09-27T14:34:54Z","title":"A Survey on the Honesty of Large Language Models","summary":" Honesty is a fundamental principle for aligning large language models (LLMs)\nwith human values, requiring these models to recognize what they know and don't\nknow and be able to faithfully express their knowledge. Despite promising,\ncurrent LLMs still exhibit significant dishonest behaviors, such as confidently\npresenting wrong answers or failing to express what they know. In addition,\nresearch on the honesty of LLMs also faces challenges, including varying\ndefinitions of honesty, difficulties in distinguishing between known and\nunknown knowledge, and a lack of comprehensive understanding of related\nresearch. To address these issues, we provide a survey on the honesty of LLMs,\ncovering its clarification, evaluation approaches, and strategies for\nimprovement. Moreover, we offer insights for future research, aiming to inspire\nfurther exploration in this important area.\n","authors":["Siheng Li","Cheng Yang","Taiqiang Wu","Chufan Shi","Yuji Zhang","Xinyu Zhu","Zesen Cheng","Deng Cai","Mo Yu","Lemao Liu","Jie Zhou","Yujiu Yang","Ngai Wong","Xixin Wu","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2409.18786v1.pdf","comment":"Project Page: https://github.com/SihengLi99/LLM-Honesty-Survey"},{"id":"http://arxiv.org/abs/2404.07164v2","updated":"2024-09-27T14:32:19Z","published":"2024-04-10T17:00:04Z","title":"PIM-Opt: Demystifying Distributed Optimization Algorithms on a\n Real-World Processing-In-Memory System","summary":" Modern Machine Learning (ML) training on large-scale datasets is a very\ntime-consuming workload. It relies on the optimization algorithm Stochastic\nGradient Descent (SGD) due to its effectiveness, simplicity, and generalization\nperformance. Processor-centric architectures (e.g., CPUs, GPUs) commonly used\nfor modern ML training workloads based on SGD are bottlenecked by data movement\nbetween the processor and memory units due to the poor data locality in\naccessing large datasets. As a result, processor-centric architectures suffer\nfrom low performance and high energy consumption while executing ML training\nworkloads. Processing-In-Memory (PIM) is a promising solution to alleviate the\ndata movement bottleneck by placing the computation mechanisms inside or near\nmemory.\n Our goal is to understand the capabilities of popular distributed SGD\nalgorithms on real-world PIM systems to accelerate data-intensive ML training\nworkloads. To this end, we 1) implement several representative centralized\nparallel SGD algorithms on the real-world UPMEM PIM system, 2) rigorously\nevaluate these algorithms for ML training on large-scale datasets in terms of\nperformance, accuracy, and scalability, 3) compare to conventional CPU and GPU\nbaselines, and 4) discuss implications for future PIM hardware and highlight\nthe need for a shift to an algorithm-hardware codesign.\n Our results demonstrate three major findings: 1) The UPMEM PIM system can be\na viable alternative to state-of-the-art CPUs and GPUs for many memory-bound ML\ntraining workloads, especially when operations and datatypes are natively\nsupported by PIM hardware, 2) it is important to carefully choose the\noptimization algorithms that best fit PIM, and 3) the UPMEM PIM system does not\nscale approximately linearly with the number of nodes for many data-intensive\nML training workloads. We open source all our code to facilitate future\nresearch.\n","authors":["Steve Rhyner","Haocong Luo","Juan Gómez-Luna","Mohammad Sadrosadati","Jiawei Jiang","Ataberk Olgun","Harshita Gupta","Ce Zhang","Onur Mutlu"],"pdf_url":"https://arxiv.org/pdf/2404.07164v2.pdf","comment":"\"PIM-Opt: Demystifying Distributed Optimization Algorithms on a\n Real-World Processing-In-Memory System\" in Proceedings of the 33rd\n International Conference on Parallel Architectures and Compilation Techniques\n (PACT), Long Beach, CA, USA, October 2024"},{"id":"http://arxiv.org/abs/2409.16125v2","updated":"2024-09-27T14:24:52Z","published":"2024-09-24T14:35:20Z","title":"Analyzing Probabilistic Methods for Evaluating Agent Capabilities","summary":" To mitigate risks from AI systems, we need to assess their capabilities\naccurately. This is especially difficult in cases where capabilities are only\nrarely displayed. Phuong et al. propose two methods that aim to obtain better\nestimates of the probability of an AI agent successfully completing a given\ntask. The milestone method decomposes tasks into subtasks, aiming to improve\noverall success rate estimation, while the expert best-of-N method leverages\nhuman guidance as a proxy for the model's independent performance.\n Our analysis of these methods as Monte Carlo estimators reveals that while\nboth effectively reduce variance compared to naive Monte Carlo sampling, they\nalso introduce bias. Experimental results demonstrate that the milestone method\nunderestimates true solve rates for many real-world tasks due to its\nconstraining assumptions. The expert best-of-N method exhibits even more severe\nunderestimation across all tasks, attributed to an inherently flawed\nre-weighting factor. To enhance the accuracy of capability estimates of AI\nagents on difficult tasks, we suggest future work should leverage the rich\nliterature on Monte Carlo Estimators.\n","authors":["Axel Højmark","Govind Pimpale","Arjun Panickssery","Marius Hobbhahn","Jérémy Scheurer"],"pdf_url":"https://arxiv.org/pdf/2409.16125v2.pdf","comment":"Updated wording in Figure 1 and 2"},{"id":"http://arxiv.org/abs/2409.18778v1","updated":"2024-09-27T14:24:16Z","published":"2024-09-27T14:24:16Z","title":"HardCore Generation: Generating Hard UNSAT Problems for Data\n Augmentation","summary":" Efficiently determining the satisfiability of a boolean equation -- known as\nthe SAT problem for brevity -- is crucial in various industrial problems.\nRecently, the advent of deep learning methods has introduced significant\npotential for enhancing SAT solving. However, a major barrier to the\nadvancement of this field has been the scarcity of large, realistic datasets.\nThe majority of current public datasets are either randomly generated or\nextremely limited, containing only a few examples from unrelated problem\nfamilies. These datasets are inadequate for meaningful training of deep\nlearning methods. In light of this, researchers have started exploring\ngenerative techniques to create data that more accurately reflect SAT problems\nencountered in practical situations. These methods have so far suffered from\neither the inability to produce challenging SAT problems or time-scalability\nobstacles. In this paper we address both by identifying and manipulating the\nkey contributors to a problem's ``hardness'', known as cores. Although some\nprevious work has addressed cores, the time costs are unacceptably high due to\nthe expense of traditional heuristic core detection techniques. We introduce a\nfast core detection procedure that uses a graph neural network. Our empirical\nresults demonstrate that we can efficiently generate problems that remain hard\nto solve and retain key attributes of the original example problems. We show\nvia experiment that the generated synthetic SAT problems can be used in a data\naugmentation setting to provide improved prediction of solver runtimes.\n","authors":["Joseph Cotnareanu","Zhanguang Zhang","Hui-Ling Zhen","Yingxue Zhang","Mark Coates"],"pdf_url":"https://arxiv.org/pdf/2409.18778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18769v1","updated":"2024-09-27T14:14:16Z","published":"2024-09-27T14:14:16Z","title":"State-of-the-Art Periorbital Distance Prediction and Disease\n Classification Using Periorbital Features","summary":" Periorbital distances and features around the eyes and lids hold valuable\ninformation for disease quantification and monitoring of surgical and medical\nintervention. These distances are commonly measured manually, a process that is\nboth subjective and highly time-consuming. Here, we set out to developed three\ndeep-learning methods for segmentation and periorbital distance prediction, and\nalso evaluate the utility of periorbital distances for disease classification.\nThe MAE of our deep learning predicted distances was less than or very close to\nthe error observed between trained human annotators. We compared our models to\nthe current state-of-the-art (SOTA) method for periorbital distance prediction\nand found that our methods outperformed SOTA on all of our datasets on all but\none periorbital measurement. We also show that robust segmentation can be\nachieved on diseased eyes using models trained on open-source, healthy eyes,\nand that periorbital distances have can be used as high-quality features in\ndownstream classification models. Leveraging segmentation networks as\nintermediary steps in classification has broad implications for increasing the\ngeneralizability of classification models in ophthalmic plastic and\ncraniofacial surgery by avoiding the out-of-distribution problem observed in\ntraditional convolutional neural networks.\n","authors":["George R. Nahass","Ghasem Yazdanpanah","Madison Cheung","Alex Palacios","Jeffery Peterson","Kevin Heinze","Sasha Hubschman","Chad A. Purnell","Pete Setabutr","Ann Q. Tran","Darvin Yi"],"pdf_url":"https://arxiv.org/pdf/2409.18769v1.pdf","comment":"16 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.18768v1","updated":"2024-09-27T14:12:49Z","published":"2024-09-27T14:12:49Z","title":"Learning from Demonstration with Implicit Nonlinear Dynamics Models","summary":" Learning from Demonstration (LfD) is a useful paradigm for training policies\nthat solve tasks involving complex motions. In practice, the successful\napplication of LfD requires overcoming error accumulation during policy\nexecution, i.e. the problem of drift due to errors compounding over time and\nthe consequent out-of-distribution behaviours. Existing works seek to address\nthis problem through scaling data collection, correcting policy errors with a\nhuman-in-the-loop, temporally ensembling policy predictions or through learning\nthe parameters of a dynamical system model. In this work, we propose and\nvalidate an alternative approach to overcoming this issue. Inspired by\nreservoir computing, we develop a novel neural network layer that includes a\nfixed nonlinear dynamical system with tunable dynamical properties. We validate\nthe efficacy of our neural network layer on the task of reproducing human\nhandwriting motions using the LASA Human Handwriting Dataset. Through empirical\nexperiments we demonstrate that incorporating our layer into existing neural\nnetwork architectures addresses the issue of compounding errors in LfD.\nFurthermore, we perform a comparative evaluation against existing approaches\nincluding a temporal ensemble of policy predictions and an Echo State Networks\n(ESNs) implementation. We find that our approach yields greater policy\nprecision and robustness on the handwriting task while also generalising to\nmultiple dynamics regimes and maintaining competitive latency scores.\n","authors":["Peter David Fagan","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.18768v1.pdf","comment":"21 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.09197v2","updated":"2024-09-27T14:01:16Z","published":"2024-07-12T11:53:40Z","title":"A Chatbot for Asylum-Seeking Migrants in Europe","summary":" We present ACME: A Chatbot for asylum-seeking Migrants in Europe. ACME relies\non computational argumentation and aims to help migrants identify the highest\nlevel of protection they can apply for. This would contribute to a more\nsustainable migration by reducing the load on territorial commissions, Courts,\nand humanitarian organizations supporting asylum applicants. We describe the\nbackground context, system architecture, underlying technologies, and a case\nstudy used to validate the tool with domain experts.\n","authors":["Bettina Fazzinga","Elena Palmieri","Margherita Vestoso","Luca Bolognini","Andrea Galassi","Filippo Furfaro","Paolo Torroni"],"pdf_url":"https://arxiv.org/pdf/2407.09197v2.pdf","comment":"Accepted for publication at IEEE International Conference on Tools\n with Artificial Intelligence (ICTAI) @IEEE"},{"id":"http://arxiv.org/abs/2409.18743v1","updated":"2024-09-27T13:33:52Z","published":"2024-09-27T13:33:52Z","title":"OpenObject-NAV: Open-Vocabulary Object-Oriented Navigation Based on\n Dynamic Carrier-Relationship Scene Graph","summary":" In everyday life, frequently used objects like cups often have unfixed\npositions and multiple instances within the same category, and their carriers\nfrequently change as well. As a result, it becomes challenging for a robot to\nefficiently navigate to a specific instance. To tackle this challenge, the\nrobot must capture and update scene changes and plans continuously. However,\ncurrent object navigation approaches primarily focus on semantic-level and lack\nthe ability to dynamically update scene representation. This paper captures the\nrelationships between frequently used objects and their static carriers. It\nconstructs an open-vocabulary Carrier-Relationship Scene Graph (CRSG) and\nupdates the carrying status during robot navigation to reflect the dynamic\nchanges of the scene. Based on the CRSG, we further propose an instance\nnavigation strategy that models the navigation process as a Markov Decision\nProcess. At each step, decisions are informed by Large Language Model's\ncommonsense knowledge and visual-language feature similarity. We designed a\nseries of long-sequence navigation tasks for frequently used everyday items in\nthe Habitat simulator. The results demonstrate that by updating the CRSG, the\nrobot can efficiently navigate to moved targets. Additionally, we deployed our\nalgorithm on a real robot and validated its practical effectiveness.\n","authors":["Yujie Tang","Meiling Wang","Yinan Deng","Zibo Zheng","Jiagui Zhong","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2409.18743v1.pdf","comment":"Project website: https://openobject-nav.github.io/"},{"id":"http://arxiv.org/abs/2409.18735v1","updated":"2024-09-27T13:27:15Z","published":"2024-09-27T13:27:15Z","title":"Autoregressive Policy Optimization for Constrained Allocation Tasks","summary":" Allocation tasks represent a class of problems where a limited amount of\nresources must be allocated to a set of entities at each time step. Prominent\nexamples of this task include portfolio optimization or distributing\ncomputational workloads across servers. Allocation tasks are typically bound by\nlinear constraints describing practical requirements that have to be strictly\nfulfilled at all times. In portfolio optimization, for example, investors may\nbe obligated to allocate less than 30\\% of the funds into a certain industrial\nsector in any investment period. Such constraints restrict the action space of\nallowed allocations in intricate ways, which makes learning a policy that\navoids constraint violations difficult. In this paper, we propose a new method\nfor constrained allocation tasks based on an autoregressive process to\nsequentially sample allocations for each entity. In addition, we introduce a\nnovel de-biasing mechanism to counter the initial bias caused by sequential\nsampling. We demonstrate the superior performance of our approach compared to a\nvariety of Constrained Reinforcement Learning (CRL) methods on three distinct\nconstrained allocation tasks: portfolio optimization, computational workload\ndistribution, and a synthetic allocation benchmark. Our code is available at:\nhttps://github.com/niklasdbs/paspo\n","authors":["David Winkel","Niklas Strauß","Maximilian Bernhard","Zongyue Li","Thomas Seidl","Matthias Schubert"],"pdf_url":"https://arxiv.org/pdf/2409.18735v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.14180v2","updated":"2024-09-27T13:12:04Z","published":"2024-08-26T11:08:44Z","title":"I2EBench: A Comprehensive Benchmark for Instruction-based Image Editing","summary":" Significant progress has been made in the field of Instruction-based Image\nEditing (IIE). However, evaluating these models poses a significant challenge.\nA crucial requirement in this field is the establishment of a comprehensive\nevaluation benchmark for accurately assessing editing results and providing\nvaluable insights for its further development. In response to this need, we\npropose I2EBench, a comprehensive benchmark designed to automatically evaluate\nthe quality of edited images produced by IIE models from multiple dimensions.\nI2EBench consists of 2,000+ images for editing, along with 4,000+ corresponding\noriginal and diverse instructions. It offers three distinctive characteristics:\n1) Comprehensive Evaluation Dimensions: I2EBench comprises 16 evaluation\ndimensions that cover both high-level and low-level aspects, providing a\ncomprehensive assessment of each IIE model. 2) Human Perception Alignment: To\nensure the alignment of our benchmark with human perception, we conducted an\nextensive user study for each evaluation dimension. 3) Valuable Research\nInsights: By analyzing the advantages and disadvantages of existing IIE models\nacross the 16 dimensions, we offer valuable research insights to guide future\ndevelopment in the field. We will open-source I2EBench, including all\ninstructions, input images, human annotations, edited images from all evaluated\nmethods, and a simple script for evaluating the results from new IIE models.\nThe code, dataset and generated images from all IIE models are provided in\ngithub: https://github.com/cocoshe/I2EBench.\n","authors":["Yiwei Ma","Jiayi Ji","Ke Ye","Weihuang Lin","Zhibin Wang","Yonghan Zheng","Qiang Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.14180v2.pdf","comment":"NeurIPS2024, 15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.12059v2","updated":"2024-09-27T13:07:26Z","published":"2024-09-18T15:32:48Z","title":"Dual-Layer Training and Decoding of Large Language Model with\n Simultaneously Thinking and Speaking","summary":" Large Language Model can reasonably understand and generate human expressions\nbut may lack of thorough thinking and reasoning mechanisms. Recently there have\nbeen several studies which enhance the thinking ability of language models but\nmost of them are not data-driven or training-based. In this paper, we are\nmotivated by the cognitive mechanism in the natural world, and design a novel\nmodel architecture called TaS which allows it to first consider the thoughts\nand then express the response based upon the query. We design several pipelines\nto annotate or generate the thought contents from prompt-response samples, then\nadd language heads in a middle layer which behaves as the thinking layer. We\ntrain the language model by the thoughts-augmented data and successfully let\nthe thinking layer automatically generate reasonable thoughts and finally\noutput more reasonable responses. Both qualitative examples and quantitative\nresults validate the effectiveness and performance of TaS. Our code is\navailable at https://anonymous.4open.science/r/TadE.\n","authors":["Ningyuan Xi","Xiaoyu Wang","Yetao Wu","Teng Chen","Qingqing Gu","Jinxian Qu","Zhonglin Jiang","Yong Chen","Luo Ji"],"pdf_url":"https://arxiv.org/pdf/2409.12059v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08821v2","updated":"2024-09-27T13:00:12Z","published":"2024-08-16T16:09:59Z","title":"EasyRec: Simple yet Effective Language Models for Recommendation","summary":" Deep neural networks have become a powerful technique for learning\nrepresentations from user-item interaction data in collaborative filtering (CF)\nfor recommender systems. However, many existing methods heavily rely on unique\nuser and item IDs, which limits their ability to perform well in practical\nzero-shot learning scenarios where sufficient training data may be unavailable.\nInspired by the success of language models (LMs) and their strong\ngeneralization capabilities, a crucial question arises: How can we harness the\npotential of language models to empower recommender systems and elevate its\ngeneralization capabilities to new heights? In this study, we propose EasyRec -\nan effective and easy-to-use approach that seamlessly integrates text-based\nsemantic understanding with collaborative signals. EasyRec employs a\ntext-behavior alignment framework, which combines contrastive learning with\ncollaborative language model tuning, to ensure a strong alignment between the\ntext-enhanced semantic space and the collaborative behavior information.\nExtensive empirical evaluations across diverse real-world datasets demonstrate\nthe superior performance of EasyRec compared to state-of-the-art alternative\nmodels, particularly in the challenging text-based zero-shot recommendation\nscenarios. Furthermore, the study highlights the potential of seamlessly\nintegrating EasyRec as a plug-and-play component into text-enhanced\ncollaborative filtering frameworks, thereby empowering existing recommender\nsystems to elevate their recommendation performance and adapt to the evolving\nuser preferences in dynamic environments. For better result reproducibility of\nour EasyRec framework, the model implementation details, source code, and\ndatasets are available at the link: https://github.com/HKUDS/EasyRec.\n","authors":["Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18715v1","updated":"2024-09-27T12:59:29Z","published":"2024-09-27T12:59:29Z","title":"Multi-modal Medical Image Fusion For Non-Small Cell Lung Cancer\n Classification","summary":" The early detection and nuanced subtype classification of non-small cell lung\ncancer (NSCLC), a predominant cause of cancer mortality worldwide, is a\ncritical and complex issue. In this paper, we introduce an innovative\nintegration of multi-modal data, synthesizing fused medical imaging (CT and PET\nscans) with clinical health records and genomic data. This unique fusion\nmethodology leverages advanced machine learning models, notably MedClip and\nBEiT, for sophisticated image feature extraction, setting a new standard in\ncomputational oncology. Our research surpasses existing approaches, as\nevidenced by a substantial enhancement in NSCLC detection and classification\nprecision. The results showcase notable improvements across key performance\nmetrics, including accuracy, precision, recall, and F1-score. Specifically, our\nleading multi-modal classifier model records an impressive accuracy of 94.04%.\nWe believe that our approach has the potential to transform NSCLC diagnostics,\nfacilitating earlier detection and more effective treatment planning and,\nultimately, leading to superior patient outcomes in lung cancer care.\n","authors":["Salma Hassan","Hamad Al Hammadi","Ibrahim Mohammed","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2409.18715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18708v1","updated":"2024-09-27T12:54:13Z","published":"2024-09-27T12:54:13Z","title":"Read Over the Lines: Attacking LLMs and Toxicity Detection Systems with\n ASCII Art to Mask Profanity","summary":" We introduce a novel family of adversarial attacks that exploit the inability\nof language models to interpret ASCII art. To evaluate these attacks, we\npropose the ToxASCII benchmark and develop two custom ASCII art fonts: one\nleveraging special tokens and another using text-filled letter shapes. Our\nattacks achieve a perfect 1.0 Attack Success Rate across ten models, including\nOpenAI's o1-preview and LLaMA 3.1.\n Warning: this paper contains examples of toxic language used for research\npurposes.\n","authors":["Sergey Berezin","Reza Farahbakhsh","Noel Crespi"],"pdf_url":"https://arxiv.org/pdf/2409.18708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18705v1","updated":"2024-09-27T12:47:36Z","published":"2024-09-27T12:47:36Z","title":"Speech Boosting: Low-Latency Live Speech Enhancement for TWS Earbuds","summary":" This paper introduces a speech enhancement solution tailored for true\nwireless stereo (TWS) earbuds on-device usage. The solution was specifically\ndesigned to support conversations in noisy environments, with active noise\ncancellation (ANC) activated. The primary challenges for speech enhancement\nmodels in this context arise from computational complexity that limits\non-device usage and latency that must be less than 3 ms to preserve a live\nconversation. To address these issues, we evaluated several crucial design\nelements, including the network architecture and domain, design of loss\nfunctions, pruning method, and hardware-specific optimization. Consequently, we\ndemonstrated substantial improvements in speech enhancement quality compared\nwith that in baseline models, while simultaneously reducing the computational\ncomplexity and algorithmic latency.\n","authors":["Hanbin Bae","Pavel Andreev","Azat Saginbaev","Nicholas Babaev","Won-Jun Lee","Hosang Sung","Hoon-Young Cho"],"pdf_url":"https://arxiv.org/pdf/2409.18705v1.pdf","comment":"Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2409.18704v1","updated":"2024-09-27T12:45:57Z","published":"2024-09-27T12:45:57Z","title":"Semantic Model Component Implementation for Model-driven Semantic\n Communications","summary":" The key feature of model-driven semantic communication is the propagation of\nthe model. The semantic model component (SMC) is designed to drive the\nintelligent model to transmit in the physical channel, allowing the\nintelligence to flow through the networks. According to the characteristics of\nneural networks with common and individual model parameters, this paper designs\nthe cross-source-domain and cross-task semantic component model. Considering\nthat the basic model is deployed on the edge node, the large server node\nupdates the edge node by transmitting only the semantic component model to the\nedge node so that the edge node can handle different sources and different\ntasks. In addition, this paper also discusses how channel noise affects the\nperformance of the model and proposes methods of injection noise and\nregularization to improve the noise resistance of the model. Experiments show\nthat SMCs use smaller model parameters to achieve cross-source, cross-task\nfunctionality while maintaining performance and improving the model's tolerance\nto noise. Finally, a component transfer-based unmanned vehicle tracking\nprototype was implemented to verify the feasibility of model components in\npractical applications.\n","authors":["Haotai Liang","Mengran Shi","Chen Dong","Xiaodong Xu","Long Liu","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15360v2","updated":"2024-09-27T12:36:58Z","published":"2024-09-18T02:35:41Z","title":"Reward-Robust RLHF in LLMs","summary":" As Large Language Models (LLMs) continue to progress toward more advanced\nforms of intelligence, Reinforcement Learning from Human Feedback (RLHF) is\nincreasingly seen as a key pathway toward achieving Artificial General\nIntelligence (AGI). However, the reliance on reward-model-based (RM-based)\nalignment methods introduces significant challenges due to the inherent\ninstability and imperfections of Reward Models (RMs), which can lead to\ncritical issues such as reward hacking and misalignment with human intentions.\nIn this paper, we introduce a reward-robust RLHF framework aimed at addressing\nthese fundamental challenges, paving the way for more reliable and resilient\nlearning in LLMs. Our approach introduces a novel optimization objective that\ncarefully balances performance and robustness by incorporating Bayesian Reward\nModel Ensembles (BRME) to model the uncertainty set of reward functions. This\nallows the framework to integrate both nominal performance and minimum reward\nsignals, ensuring more stable learning even with imperfect RMs. Empirical\nresults demonstrate that our framework consistently outperforms baselines\nacross diverse benchmarks, showing improved accuracy and long-term stability.\nWe also provide a theoretical analysis, demonstrating that reward-robust RLHF\napproaches the stability of constant reward settings, which proves to be\nacceptable even in a stochastic-case analysis. Together, these contributions\nhighlight the framework potential to enhance both the performance and stability\nof LLM alignment.\n","authors":["Yuzi Yan","Xingzhou Lou","Jialian Li","Yiping Zhang","Jian Xie","Chao Yu","Yu Wang","Dong Yan","Yuan Shen"],"pdf_url":"https://arxiv.org/pdf/2409.15360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18695v1","updated":"2024-09-27T12:33:57Z","published":"2024-09-27T12:33:57Z","title":"KALE-LM: Unleash The Power Of AI For Science Via Knowledge And Logic\n Enhanced Large Model","summary":" Artificial intelligence is gradually demonstrating its immense potential, and\nincreasing attention is being given to how AI can be harnessed to advance\nscientific research. In this vision paper, we present our perspectives on how\nAI can better assist scientific inquiry and explore corresponding technical\napproach. We have proposed and open-sourced a large model of our KALE-LM model\nseries, Llama3-KALE-LM-Chem-8B, which has achieved outstanding performance in\ntasks related to the field of chemistry. We hope that our work serves as a\nstrong starting point, helping to realize more intelligent AI and promoting the\nadvancement of human science and technology, as well as societal development.\n","authors":["Weichen Dai","Yezeng Chen","Zijie Dai","Zhijie Huang","Yubo Liu","Yixuan Pan","Baiyang Song","Chengli Zhong","Xinhe Li","Zeyu Wang","Zhuoying Feng","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.18695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18694v1","updated":"2024-09-27T12:28:47Z","published":"2024-09-27T12:28:47Z","title":"Learning from Pattern Completion: Self-supervised Controllable\n Generation","summary":" The human brain exhibits a strong ability to spontaneously associate\ndifferent visual attributes of the same or similar visual scene, such as\nassociating sketches and graffiti with real-world visual objects, usually\nwithout supervising information. In contrast, in the field of artificial\nintelligence, controllable generation methods like ControlNet heavily rely on\nannotated training datasets such as depth maps, semantic segmentation maps, and\nposes, which limits the method's scalability. Inspired by the neural mechanisms\nthat may contribute to the brain's associative power, specifically the cortical\nmodularization and hippocampal pattern completion, here we propose a\nself-supervised controllable generation (SCG) framework. Firstly, we introduce\nan equivariant constraint to promote inter-module independence and intra-module\ncorrelation in a modular autoencoder network, thereby achieving functional\nspecialization. Subsequently, based on these specialized modules, we employ a\nself-supervised pattern completion approach for controllable generation\ntraining. Experimental results demonstrate that the proposed modular\nautoencoder effectively achieves functional specialization, including the\nmodular processing of color, brightness, and edge detection, and exhibits\nbrain-like features including orientation selectivity, color antagonism, and\ncenter-surround receptive fields. Through self-supervised training, associative\ngeneration capabilities spontaneously emerge in SCG, demonstrating excellent\ngeneralization ability to various tasks such as associative generation on\npainting, sketches, and ancient graffiti. Compared to the previous\nrepresentative method ControlNet, our proposed approach not only demonstrates\nsuperior robustness in more challenging high-noise scenarios but also possesses\nmore promising scalability potential due to its self-supervised manner.\n","authors":["Zhiqiang Chen","Guofan Fan","Jinying Gao","Lei Ma","Bo Lei","Tiejun Huang","Shan Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18692v1","updated":"2024-09-27T12:28:18Z","published":"2024-09-27T12:28:18Z","title":"MG-Net: Learn to Customize QAOA with Circuit Depth Awareness","summary":" Quantum Approximate Optimization Algorithm (QAOA) and its variants exhibit\nimmense potential in tackling combinatorial optimization challenges. However,\ntheir practical realization confronts a dilemma: the requisite circuit depth\nfor satisfactory performance is problem-specific and often exceeds the maximum\ncapability of current quantum devices. To address this dilemma, here we first\nanalyze the convergence behavior of QAOA, uncovering the origins of this\ndilemma and elucidating the intricate relationship between the employed mixer\nHamiltonian, the specific problem at hand, and the permissible maximum circuit\ndepth. Harnessing this understanding, we introduce the Mixer Generator Network\n(MG-Net), a unified deep learning framework adept at dynamically formulating\noptimal mixer Hamiltonians tailored to distinct tasks and circuit depths.\nSystematic simulations, encompassing Ising models and weighted Max-Cut\ninstances with up to 64 qubits, substantiate our theoretical findings,\nhighlighting MG-Net's superior performance in terms of both approximation ratio\nand efficiency.\n","authors":["Yang Qian","Xinbiao Wang","Yuxuan Du","Yong Luo","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2409.18692v1.pdf","comment":"29 pages, 16 figures"},{"id":"http://arxiv.org/abs/2405.08027v3","updated":"2024-09-27T12:27:48Z","published":"2024-05-12T13:36:58Z","title":"Automating Data Annotation under Strategic Human Agents: Risks and\n Potential Solutions","summary":" As machine learning (ML) models are increasingly used in social domains to\nmake consequential decisions about humans, they often have the power to reshape\ndata distributions. Humans, as strategic agents, continuously adapt their\nbehaviors in response to the learning system. As populations change\ndynamically, ML systems may need frequent updates to ensure high performance.\nHowever, acquiring high-quality human-annotated samples can be highly\nchallenging and even infeasible in social domains. A common practice to address\nthis issue is using the model itself to annotate unlabeled data samples. This\npaper investigates the long-term impacts when ML models are retrained with\nmodel-annotated samples when they incorporate human strategic responses. We\nfirst formalize the interactions between strategic agents and the model and\nthen analyze how they evolve under such dynamic interactions. We find that\nagents are increasingly likely to receive positive decisions as the model gets\nretrained, whereas the proportion of agents with positive labels may decrease\nover time. We thus propose a refined retraining process to stabilize the\ndynamics. Last, we examine how algorithmic fairness can be affected by these\nretraining processes and find that enforcing common fairness constraints at\nevery round may not benefit the disadvantaged group in the long run.\nExperiments on (semi-)synthetic and real data validate the theoretical\nfindings.\n","authors":["Tian Xie","Xueru Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.08027v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14277v2","updated":"2024-09-27T12:18:34Z","published":"2024-06-20T12:59:27Z","title":"QPaug: Question and Passage Augmentation for Open-Domain Question\n Answering of LLMs","summary":" Retrieval-augmented generation (RAG) has received much attention for\nOpen-domain question-answering (ODQA) tasks as a means to compensate for the\nparametric knowledge of large language models (LLMs). While previous approaches\nfocused on processing retrieved passages to remove irrelevant context, they\nstill rely heavily on the quality of retrieved passages which can degrade if\nthe question is ambiguous or complex. In this paper, we propose a simple yet\nefficient method called question and passage augmentation (QPaug) via LLMs for\nopen-domain QA. QPaug first decomposes the original questions into\nmultiple-step sub-questions. By augmenting the original question with detailed\nsub-questions and planning, we are able to make the query more specific on what\nneeds to be retrieved, improving the retrieval performance. In addition, to\ncompensate for the case where the retrieved passages contain distracting\ninformation or divided opinions, we augment the retrieved passages with\nself-generated passages by LLMs to guide the answer extraction. Experimental\nresults show that QPaug outperforms the previous state-of-the-art and achieves\nsignificant performance gain over existing RAG methods. The source code is\navailable at \\url{https://github.com/kmswin1/QPaug}.\n","authors":["Minsang Kim","Cheoneum Park","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2406.14277v2.pdf","comment":"The 2024 Conference on Empirical Methods in Natural Language\n Processing (EMNLP), Findings"},{"id":"http://arxiv.org/abs/2409.17213v2","updated":"2024-09-27T12:12:44Z","published":"2024-09-25T17:38:39Z","title":"Plurals: A System for Guiding LLMs Via Simulated Social Ensembles","summary":" Recent debates raised concerns that language models may favor certain\nviewpoints. But what if the solution is not to aim for a 'view from nowhere'\nbut rather to leverage different viewpoints? We introduce Plurals, a system and\nPython library for pluralistic AI deliberation. Plurals consists of Agents\n(LLMs, optionally with personas) which deliberate within customizable\nStructures, with Moderators overseeing deliberation. Plurals is a generator of\nsimulated social ensembles. Plurals integrates with government datasets to\ncreate nationally representative personas, includes deliberation templates\ninspired by democratic deliberation theory, and allows users to customize both\ninformation-sharing structures and deliberation behavior within Structures. Six\ncase studies demonstrate fidelity to theoretical constructs and efficacy. Three\nrandomized experiments show simulated focus groups produced output resonant\nwith an online sample of the relevant audiences (chosen over zero-shot\ngeneration in 75% of trials). Plurals is both a paradigm and a concrete system\nfor pluralistic AI. The Plurals library is available at\nhttps://github.com/josh-ashkinaze/plurals and will be continually updated.\n","authors":["Joshua Ashkinaze","Emily Fry","Narendra Edara","Eric Gilbert","Ceren Budak"],"pdf_url":"https://arxiv.org/pdf/2409.17213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18680v1","updated":"2024-09-27T12:06:53Z","published":"2024-09-27T12:06:53Z","title":"Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large\n Language Models","summary":" Various audio-LLMs (ALLMs) have been explored recently for tackling different\naudio tasks simultaneously using a single, unified model. While existing\nevaluations of ALLMs primarily focus on single-audio tasks, real-world\napplications often involve processing multiple audio streams simultaneously. To\nbridge this gap, we propose the first multi-audio evaluation (MAE) benchmark\nthat consists of 20 datasets from 11 multi-audio tasks encompassing both speech\nand sound scenarios. Comprehensive experiments on MAE demonstrate that the\nexisting ALLMs, while being powerful in comprehending primary audio elements in\nindividual audio inputs, struggling to handle multi-audio scenarios. To this\nend, we propose a novel multi-audio-LLM (MALLM) to capture audio context among\nmultiple similar audios using discriminative learning on our proposed synthetic\ndata. The results demonstrate that the proposed MALLM outperforms all baselines\nand achieves high data efficiency using synthetic data without requiring human\nannotations. The proposed MALLM opens the door for ALLMs towards multi-audio\nprocessing era and brings us closer to replicating human auditory capabilities\nin machines.\n","authors":["Yiming Chen","Xianghu Yue","Xiaoxue Gao","Chen Zhang","Luis Fernando D'Haro","Robby T. Tan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2409.18680v1.pdf","comment":"EMNLP24 Findings"},{"id":"http://arxiv.org/abs/2409.18676v1","updated":"2024-09-27T12:03:15Z","published":"2024-09-27T12:03:15Z","title":"Toward Universal and Interpretable World Models for Open-ended Learning\n Agents","summary":" We introduce a generic, compositional and interpretable class of generative\nworld models that supports open-ended learning agents. This is a sparse class\nof Bayesian networks capable of approximating a broad range of stochastic\nprocesses, which provide agents with the ability to learn world models in a\nmanner that may be both interpretable and computationally scalable. This\napproach integrating Bayesian structure learning and intrinsically motivated\n(model-based) planning enables agents to actively develop and refine their\nworld models, which may lead to open-ended learning and more robust, adaptive\nbehavior.\n","authors":["Lancelot Da Costa"],"pdf_url":"https://arxiv.org/pdf/2409.18676v1.pdf","comment":"4 pages including appendix, 6 including appendix and references; 2\n figures"},{"id":"http://arxiv.org/abs/2409.18673v1","updated":"2024-09-27T11:59:00Z","published":"2024-09-27T11:59:00Z","title":"Exploiting Motion Prior for Accurate Pose Estimation of Dashboard\n Cameras","summary":" Dashboard cameras (dashcams) record millions of driving videos daily,\noffering a valuable potential data source for various applications, including\ndriving map production and updates. A necessary step for utilizing these\ndashcam data involves the estimation of camera poses. However, the low-quality\nimages captured by dashcams, characterized by motion blurs and dynamic objects,\npose challenges for existing image-matching methods in accurately estimating\ncamera poses. In this study, we propose a precise pose estimation method for\ndashcam images, leveraging the inherent camera motion prior. Typically, image\nsequences captured by dash cameras exhibit pronounced motion prior, such as\nforward movement or lateral turns, which serve as essential cues for\ncorrespondence estimation. Building upon this observation, we devise a pose\nregression module aimed at learning camera motion prior, subsequently\nintegrating these prior into both correspondences and pose estimation\nprocesses. The experiment shows that, in real dashcams dataset, our method is\n22% better than the baseline for pose estimation in AUC5\\textdegree, and it can\nestimate poses for 19% more images with less reprojection error in Structure\nfrom Motion (SfM).\n","authors":["Yipeng Lu","Yifan Zhao","Haiping Wang","Zhiwei Ruan","Yuan Liu","Zhen Dong","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16763v2","updated":"2024-09-27T11:49:58Z","published":"2024-02-26T17:30:34Z","title":"ELiSe: Efficient Learning of Sequences in Structured Recurrent Networks","summary":" Behavior can be described as a temporal sequence of actions driven by neural\nactivity. To learn complex sequential patterns in neural networks, memories of\npast activities need to persist on significantly longer timescales than the\nrelaxation times of single-neuron activity. While recurrent networks can\nproduce such long transients, training these networks is a challenge. Learning\nvia error propagation confers models such as FORCE, RTRL or BPTT a significant\nfunctional advantage, but at the expense of biological plausibility. While\nreservoir computing circumvents this issue by learning only the readout\nweights, it does not scale well with problem complexity. We propose that two\nprominent structural features of cortical networks can alleviate these issues:\nthe presence of a certain network scaffold at the onset of learning and the\nexistence of dendritic compartments for enhancing neuronal information storage\nand computation. Our resulting model for Efficient Learning of Sequences\n(ELiSe) builds on these features to acquire and replay complex non-Markovian\nspatio-temporal patterns using only local, always-on and phase-free synaptic\nplasticity. We showcase the capabilities of ELiSe in a mock-up of birdsong\nlearning, and demonstrate its flexibility with respect to parametrization, as\nwell as its robustness to external disturbances.\n","authors":["Laura Kriener","Kristin Völk","Ben von Hünerbein","Federico Benitez","Walter Senn","Mihai A. Petrovici"],"pdf_url":"https://arxiv.org/pdf/2402.16763v2.pdf","comment":"15 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.18661v1","updated":"2024-09-27T11:45:56Z","published":"2024-09-27T11:45:56Z","title":"Not the Silver Bullet: LLM-enhanced Programming Error Messages are\n Ineffective in Practice","summary":" The sudden emergence of large language models (LLMs) such as ChatGPT has had\na disruptive impact throughout the computing education community. LLMs have\nbeen shown to excel at producing correct code to CS1 and CS2 problems, and can\neven act as friendly assistants to students learning how to code. Recent work\nshows that LLMs demonstrate unequivocally superior results in being able to\nexplain and resolve compiler error messages -- for decades, one of the most\nfrustrating parts of learning how to code. However, LLM-generated error message\nexplanations have only been assessed by expert programmers in artificial\nconditions. This work sought to understand how novice programmers resolve\nprogramming error messages (PEMs) in a more realistic scenario. We ran a\nwithin-subjects study with $n$ = 106 participants in which students were tasked\nto fix six buggy C programs. For each program, participants were randomly\nassigned to fix the problem using either a stock compiler error message, an\nexpert-handwritten error message, or an error message explanation generated by\nGPT-4. Despite promising evidence on synthetic benchmarks, we found that GPT-4\ngenerated error messages outperformed conventional compiler error messages in\nonly 1 of the 6 tasks, measured by students' time-to-fix each problem.\nHandwritten explanations still outperform LLM and conventional error messages,\nboth on objective and subjective measures.\n","authors":["Eddie Antonio Santos","Brett A. Becker"],"pdf_url":"https://arxiv.org/pdf/2409.18661v1.pdf","comment":"To appear in the proceedings of the 2024 UK and Ireland Computing\n Education Research conference (UKICER '24)"},{"id":"http://arxiv.org/abs/2409.18660v1","updated":"2024-09-27T11:44:03Z","published":"2024-09-27T11:44:03Z","title":"Effects of AI Feedback on Learning, the Skill Gap, and Intellectual\n Diversity","summary":" Can human decision-makers learn from AI feedback? Using data on 52,000\ndecision-makers from a large online chess platform, we investigate how their AI\nuse affects three interrelated long-term outcomes: Learning, skill gap, and\ndiversity of decision strategies. First, we show that individuals are far more\nlikely to seek AI feedback in situations in which they experienced success\nrather than failure. This AI feedback seeking strategy turns out to be\ndetrimental to learning: Feedback on successes decreases future performance,\nwhile feedback on failures increases it. Second, higher-skilled decision-makers\nseek AI feedback more often and are far more likely to seek AI feedback after a\nfailure, and benefit more from AI feedback than lower-skilled individuals. As a\nresult, access to AI feedback increases, rather than decreases, the skill gap\nbetween high- and low-skilled individuals. Finally, we leverage 42 major\nplatform updates as natural experiments to show that access to AI feedback\ncauses a decrease in intellectual diversity of the population as individuals\ntend to specialize in the same areas. Together, those results indicate that\nlearning from AI feedback is not automatic and using AI correctly seems to be a\nskill itself. Furthermore, despite its individual-level benefits, access to AI\nfeedback can have significant population-level downsides including loss of\nintellectual diversity and an increasing skill gap.\n","authors":["Christoph Riedl","Eric Bogert"],"pdf_url":"https://arxiv.org/pdf/2409.18660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18653v1","updated":"2024-09-27T11:35:50Z","published":"2024-09-27T11:35:50Z","title":"When SAM2 Meets Video Camouflaged Object Segmentation: A Comprehensive\n Evaluation and Adaptation","summary":" This study investigates the application and performance of the Segment\nAnything Model 2 (SAM2) in the challenging task of video camouflaged object\nsegmentation (VCOS). VCOS involves detecting objects that blend seamlessly in\nthe surroundings for videos, due to similar colors and textures, poor light\nconditions, etc. Compared to the objects in normal scenes, camouflaged objects\nare much more difficult to detect. SAM2, a video foundation model, has shown\npotential in various tasks. But its effectiveness in dynamic camouflaged\nscenarios remains under-explored. This study presents a comprehensive study on\nSAM2's ability in VCOS. First, we assess SAM2's performance on camouflaged\nvideo datasets using different models and prompts (click, box, and mask).\nSecond, we explore the integration of SAM2 with existing multimodal large\nlanguage models (MLLMs) and VCOS methods. Third, we specifically adapt SAM2 by\nfine-tuning it on the video camouflaged dataset. Our comprehensive experiments\ndemonstrate that SAM2 has excellent zero-shot ability of detecting camouflaged\nobjects in videos. We also show that this ability could be further improved by\nspecifically adjusting SAM2's parameters for VCOS. The code will be available\nat https://github.com/zhoustan/SAM2-VCOS\n","authors":["Yuli Zhou","Guolei Sun","Yawei Li","Luca Benini","Ender Konukoglu"],"pdf_url":"https://arxiv.org/pdf/2409.18653v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2409.18642v1","updated":"2024-09-27T11:20:20Z","published":"2024-09-27T11:20:20Z","title":"Enhanced Convolution Neural Network with Optimized Pooling and\n Hyperparameter Tuning for Network Intrusion Detection","summary":" Network Intrusion Detection Systems (NIDS) are essential for protecting\ncomputer networks from malicious activities, including Denial of Service (DoS),\nProbing, User-to-Root (U2R), and Remote-to-Local (R2L) attacks. Without\neffective NIDS, networks are vulnerable to significant security breaches and\ndata loss. Machine learning techniques provide a promising approach to enhance\nNIDS by automating threat detection and improving accuracy. In this research,\nwe propose an Enhanced Convolutional Neural Network (EnCNN) for NIDS and\nevaluate its performance using the KDDCUP'99 dataset. Our methodology includes\ncomprehensive data preprocessing, exploratory data analysis (EDA), and feature\nengineering. We compare EnCNN with various machine learning algorithms,\nincluding Logistic Regression, Decision Trees, Support Vector Machines (SVM),\nand ensemble methods like Random Forest, AdaBoost, and Voting Ensemble. The\nresults show that EnCNN significantly improves detection accuracy, with a\nnotable 10% increase over state-of-art approaches. This demonstrates the\neffectiveness of EnCNN in real-time network intrusion detection, offering a\nrobust solution for identifying and mitigating security threats, and enhancing\noverall network resilience.\n","authors":["Ayush Kumar Sharma","Sourav Patel","Supriya Bharat Wakchaure","Abirami S"],"pdf_url":"https://arxiv.org/pdf/2409.18642v1.pdf","comment":"7 Pages , 2 figures , 4 Tables , Conference paper"},{"id":"http://arxiv.org/abs/2409.16937v2","updated":"2024-09-27T11:16:35Z","published":"2024-09-25T13:51:19Z","title":"Semi-Supervised Cognitive State Classification from Speech with\n Multi-View Pseudo-Labeling","summary":" The lack of labeled data is a common challenge in speech classification\ntasks, particularly those requiring extensive subjective assessment, such as\ncognitive state classification. In this work, we propose a Semi-Supervised\nLearning (SSL) framework, introducing a novel multi-view pseudo-labeling method\nthat leverages both acoustic and linguistic characteristics to select the most\nconfident data for training the classification model. Acoustically, unlabeled\ndata are compared to labeled data using the Frechet audio distance, calculated\nfrom embeddings generated by multiple audio encoders. Linguistically, large\nlanguage models are prompted to revise automatic speech recognition\ntranscriptions and predict labels based on our proposed task-specific\nknowledge. High-confidence data are identified when pseudo-labels from both\nsources align, while mismatches are treated as low-confidence data. A bimodal\nclassifier is then trained to iteratively label the low-confidence data until a\npredefined criterion is met. We evaluate our SSL framework on emotion\nrecognition and dementia detection tasks. Experimental results demonstrate that\nour method achieves competitive performance compared to fully supervised\nlearning using only 30% of the labeled data and significantly outperforms two\nselected baselines.\n","authors":["Yuanchao Li","Zixing Zhang","Jing Han","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2409.16937v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18633v1","updated":"2024-09-27T11:06:59Z","published":"2024-09-27T11:06:59Z","title":"Reducing Diversity to Generate Hierarchical Archetypes","summary":" The Artificial Intelligence field seldom address the development of a\nfundamental building piece: a framework, methodology or algorithm to\nautomatically build hierarchies of abstractions. This is a key requirement in\norder to build intelligent behaviour, as recent neuroscience studies clearly\nexpose. In this paper we present a primitive-based framework to automatically\ngenerate hierarchies of constructive archetypes, as a theory of how to generate\nhierarchies of abstractions. We assume the existence of a primitive with very\nspecific characteristics, and we develop our framework over it. We prove the\neffectiveness of our framework through mathematical definitions and proofs.\nFinally, we give a few insights about potential uses of our framework and the\nexpected results.\n","authors":["Alfredo Ibias","Hector Antona","Guillem Ramirez-Miranda","Enric Guinovart","Eduard Alarcon"],"pdf_url":"https://arxiv.org/pdf/2409.18633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18631v1","updated":"2024-09-27T10:58:25Z","published":"2024-09-27T10:58:25Z","title":"Quantum Algorithms for Drone Mission Planning","summary":" Mission planning often involves optimising the use of ISR (Intelligence,\nSurveillance and Reconnaissance) assets in order to achieve a set of mission\nobjectives within allowed parameters subject to constraints. The missions of\ninterest here, involve routing multiple UAVs visiting multiple targets,\nutilising sensors to capture data relating to each target. Finding such\nsolutions is often an NP-Hard problem and cannot be solved efficiently on\nclassical computers. Furthermore, during the mission new constraints and\nobjectives may arise, requiring a new solution to be computed within a short\ntime period. To achieve this we investigate near term quantum algorithms that\nhave the potential to offer speed-ups against current classical methods. We\ndemonstrate how a large family of these problems can be formulated as a Mixed\nInteger Linear Program (MILP) and then converted to a Quadratic Unconstrained\nBinary Optimisation (QUBO). The formulation provided is versatile and can be\nadapted for many different constraints with clear qubit scaling provided. We\ndiscuss the results of solving the QUBO formulation using commercial quantum\nannealers and compare the solutions to current edge classical solvers. We also\nanalyse the results from solving the QUBO using Quantum Approximate\nOptimisation Algorithms (QAOA) and discuss their results. Finally, we also\nprovide efficient methods to encode to the problem into the Variational Quantum\nEigensolver (VQE) formalism, where we have tailored the ansatz to the problem\nmaking efficient use of the qubits available.\n","authors":["Ethan Davies","Pranav Kalidindi"],"pdf_url":"https://arxiv.org/pdf/2409.18631v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18630v1","updated":"2024-09-27T10:58:18Z","published":"2024-09-27T10:58:18Z","title":"Entropy, concentration, and learning: a statistical mechanics primer","summary":" Artificial intelligence models trained through loss minimization have\ndemonstrated significant success, grounded in principles from fields like\ninformation theory and statistical physics. This work explores these\nestablished connections through the lens of statistical mechanics, starting\nfrom first-principles sample concentration behaviors that underpin AI and\nmachine learning. Our development of statistical mechanics for modeling\nhighlights the key role of exponential families, and quantities of statistics,\nphysics, and information theory.\n","authors":["Akshay Balsubramani"],"pdf_url":"https://arxiv.org/pdf/2409.18630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18628v1","updated":"2024-09-27T10:55:58Z","published":"2024-09-27T10:55:58Z","title":"Towards Integrating Epistemic Uncertainty Estimation into the\n Radiotherapy Workflow","summary":" The precision of contouring target structures and organs-at-risk (OAR) in\nradiotherapy planning is crucial for ensuring treatment efficacy and patient\nsafety. Recent advancements in deep learning (DL) have significantly improved\nOAR contouring performance, yet the reliability of these models, especially in\nthe presence of out-of-distribution (OOD) scenarios, remains a concern in\nclinical settings. This application study explores the integration of epistemic\nuncertainty estimation within the OAR contouring workflow to enable OOD\ndetection in clinically relevant scenarios, using specifically compiled data.\nFurthermore, we introduce an advanced statistical method for OOD detection to\nenhance the methodological framework of uncertainty estimation. Our empirical\nevaluation demonstrates that epistemic uncertainty estimation is effective in\nidentifying instances where model predictions are unreliable and may require an\nexpert review. Notably, our approach achieves an AUC-ROC of 0.95 for OOD\ndetection, with a specificity of 0.95 and a sensitivity of 0.92 for implant\ncases, underscoring its efficacy. This study addresses significant gaps in the\ncurrent research landscape, such as the lack of ground truth for uncertainty\nestimation and limited empirical evaluations. Additionally, it provides a\nclinically relevant application of epistemic uncertainty estimation in an\nFDA-approved and widely used clinical solution for OAR segmentation from\nVarian, a Siemens Healthineers company, highlighting its practical benefits.\n","authors":["Marvin Tom Teichmann","Manasi Datar","Lisa Kratzke","Fernando Vega","Florin C. Ghesu"],"pdf_url":"https://arxiv.org/pdf/2409.18628v1.pdf","comment":"Keywords: Epistemic Uncertainty - Out-of-Distribution Detection - CT\n Segmentation - OAR contouring - Radiotherapy"},{"id":"http://arxiv.org/abs/2409.18626v1","updated":"2024-09-27T10:55:22Z","published":"2024-09-27T10:55:22Z","title":"Refutation of Spectral Graph Theory Conjectures with Search Algorithms)","summary":" We are interested in the automatic refutation of spectral graph theory\nconjectures. Most existing works address this problem either with the\nexhaustive generation of graphs with a limited size or with deep reinforcement\nlearning. Exhaustive generation is limited by the size of the generated graphs\nand deep reinforcement learning takes hours or days to refute a conjecture. We\npropose to use search algorithms to address these shortcomings to find\npotentially large counter-examples to spectral graph theory conjectures in\nseconds. We apply a wide range of search algorithms to a selection of\nconjectures from Graffiti. Out of 13 already refuted conjectures from Graffiti,\nour algorithms are able to refute 12 in seconds. We also refute conjecture 197\nfrom Graffiti which was open until now.\n","authors":["Milo Roucairol","Tristan Cazenave"],"pdf_url":"https://arxiv.org/pdf/2409.18626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18624v1","updated":"2024-09-27T10:50:49Z","published":"2024-09-27T10:50:49Z","title":"Unsupervised Cognition","summary":" Unsupervised learning methods have a soft inspiration in cognition models. To\nthis day, the most successful unsupervised learning methods revolve around\nclustering samples in a mathematical space. In this paper we propose a\nstate-of-the-art primitive-based unsupervised learning approach for\ndecision-making inspired by novel cognition models. This representation-centric\napproach models the input space constructively as a distributed hierarchical\nstructure in an input-agnostic way. We compared our approach with current\nstate-of-the-art in unsupervised learning classification, and with current\nstate-of-the-art in cancer type classification. We show how our proposal\noutperforms previous state-of-the-art. We also evaluate some cognition-like\nproperties of our proposal where it not only outperforms the compared\nalgorithms (even supervised learning ones), but it also shows a different, more\ncognition-like, behaviour.\n","authors":["Alfredo Ibias","Hector Antona","Guillem Ramirez-Miranda","Enric Guinovart","Eduard Alarcon"],"pdf_url":"https://arxiv.org/pdf/2409.18624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18618v1","updated":"2024-09-27T10:35:45Z","published":"2024-09-27T10:35:45Z","title":"Model-based Preference Optimization in Abstractive Summarization without\n Human Feedback","summary":" In abstractive summarization, the challenge of producing concise and accurate\nsummaries arises from the vast amount of information contained in the source\ndocument. Consequently, although Large Language Models (LLMs) can generate\nfluent text, they often introduce inaccuracies by hallucinating content not\nfound in the original source. While supervised fine-tuning methods that\nmaximize likelihood contribute to this issue, they do not consistently enhance\nthe faithfulness of the summaries. Preference-based optimization methods, such\nas Direct Preference Optimization (DPO), can further refine the model to align\nwith human preferences. However, these methods still heavily depend on costly\nhuman feedback. In this work, we introduce a novel and straightforward approach\ncalled Model-based Preference Optimization (MPO) to fine-tune LLMs for improved\nsummarization abilities without any human feedback. By leveraging the model's\ninherent summarization capabilities, we create a preference dataset that is\nfully generated by the model using different decoding strategies. Our\nexperiments on standard summarization datasets and various metrics demonstrate\nthat our proposed MPO significantly enhances the quality of generated summaries\nwithout relying on human feedback.\n","authors":["Jaepill Choi","Kyubyung Chae","Jiwoo Song","Yohan Jo","Taesup Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18618v1.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2405.03882v2","updated":"2024-09-27T10:19:55Z","published":"2024-05-06T21:57:35Z","title":"Trio-ViT: Post-Training Quantization and Acceleration for Softmax-Free\n Efficient Vision Transformer","summary":" Motivated by the huge success of Transformers in the field of natural\nlanguage processing (NLP), Vision Transformers (ViTs) have been rapidly\ndeveloped and achieved remarkable performance in various computer vision tasks.\nHowever, their huge model sizes and intensive computations hinder ViTs'\ndeployment on embedded devices, calling for effective model compression\nmethods, such as quantization. Unfortunately, due to the existence of\nhardware-unfriendly and quantization-sensitive non-linear operations,\nparticularly {Softmax}, it is non-trivial to completely quantize all operations\nin ViTs, yielding either significant accuracy drops or non-negligible hardware\ncosts. In response to challenges associated with \\textit{standard ViTs}, we\nfocus our attention towards the quantization and acceleration for\n\\textit{efficient ViTs}, which not only eliminate the troublesome Softmax but\nalso integrate linear attention with low computational complexity, and propose\nTrio-ViT accordingly. Specifically, at the algorithm level, we develop a\n{tailored post-training quantization engine} taking the unique activation\ndistributions of Softmax-free efficient ViTs into full consideration, aiming to\nboost quantization accuracy. Furthermore, at the hardware level, we build an\naccelerator dedicated to the specific Convolution-Transformer hybrid\narchitecture of efficient ViTs, thereby enhancing hardware efficiency.\nExtensive experimental results consistently prove the effectiveness of our\nTrio-ViT framework. {Particularly, we can gain up to\n$\\uparrow$$\\mathbf{3.6}\\times$, $\\uparrow$$\\mathbf{5.0}\\times$, and\n$\\uparrow$$\\mathbf{7.3}\\times$ FPS under comparable accuracy over\nstate-of-the-art ViT accelerators, as well as $\\uparrow$$\\mathbf{6.0}\\times$,\n$\\uparrow$$\\mathbf{1.5}\\times$, and $\\uparrow$$\\mathbf{2.1}\\times$ DSP\nefficiency.} Codes are available at\n\\url{https://github.com/shihuihong214/Trio-ViT}.\n","authors":["Huihong Shi","Haikuo Shao","Wendong Mao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03882v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08424v2","updated":"2024-09-27T10:05:56Z","published":"2024-04-12T12:15:14Z","title":"Comparing Apples to Oranges: LLM-powered Multimodal Intention Prediction\n in an Object Categorization Task","summary":" Human intention-based systems enable robots to perceive and interpret user\nactions to interact with humans and adapt to their behavior proactively.\nTherefore, intention prediction is pivotal in creating a natural interaction\nwith social robots in human-designed environments. In this paper, we examine\nusing Large Language Models (LLMs) to infer human intention in a collaborative\nobject categorization task with a physical robot. We propose a novel multimodal\napproach that integrates user non-verbal cues, like hand gestures, body poses,\nand facial expressions, with environment states and user verbal cues to predict\nuser intentions in a hierarchical architecture. Our evaluation of five LLMs\nshows the potential for reasoning about verbal and non-verbal user cues,\nleveraging their context-understanding and real-world knowledge to support\nintention prediction while collaborating on a task with a social robot.\n","authors":["Hassan Ali","Philipp Allgeuer","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2404.08424v2.pdf","comment":"Accepted at ICSR 2024,14 pages,5 figures,2 tables; work was co-funded\n by Horizon Europe project TERAIS under Grant agreement number 101079338"},{"id":"http://arxiv.org/abs/2409.14378v2","updated":"2024-09-27T10:04:29Z","published":"2024-09-22T09:48:45Z","title":"Sparse Low-Ranked Self-Attention Transformer for Remaining Useful\n Lifetime Prediction of Optical Fiber Amplifiers","summary":" Optical fiber amplifiers are key elements in present optical networks.\nFailures of these components result in high financial loss of income of the\nnetwork operator as the communication traffic over an affected link is\ninterrupted. Applying Remaining useful lifetime (RUL) prediction in the context\nof Predictive Maintenance (PdM) to optical fiber amplifiers to predict upcoming\nsystem failures at an early stage, so that network outages can be minimized\nthrough planning of targeted maintenance actions, ensures reliability and\nsafety. Optical fiber amplifier are complex systems, that work under various\noperating conditions, which makes correct forecasting a difficult task.\nIncreased monitoring capabilities of systems results in datasets that\nfacilitate the application of data-driven RUL prediction methods. Deep learning\nmodels in particular have shown good performance, but generalization based on\ncomparatively small datasets for RUL prediction is difficult. In this paper, we\npropose Sparse Low-ranked self-Attention Transformer (SLAT) as a novel RUL\nprediction method. SLAT is based on an encoder-decoder architecture, wherein\ntwo parallel working encoders extract features for sensors and time steps. By\nutilizing the self-attention mechanism, long-term dependencies can be learned\nfrom long sequences. The implementation of sparsity in the attention matrix and\na low-rank parametrization reduce overfitting and increase generalization.\nExperimental application to optical fiber amplifiers exemplified on EDFA, as\nwell as a reference dataset from turbofan engines, shows that SLAT outperforms\nthe state-of-the-art methods.\n","authors":["Dominic Schneider","Lutz Rapp"],"pdf_url":"https://arxiv.org/pdf/2409.14378v2.pdf","comment":"9 pages, 7 figures, submitted to IEEE Transactions on Machine\n Learning in Communications and Networking (TMLCN)"},{"id":"http://arxiv.org/abs/2407.12789v2","updated":"2024-09-27T10:00:15Z","published":"2024-06-17T13:53:39Z","title":"Generalisation to unseen topologies: Towards control of biological\n neural network activity","summary":" Novel imaging and neurostimulation techniques open doors for advancements in\nclosed-loop control of activity in biological neural networks. This would allow\nfor applications in the investigation of activity propagation, and for\ndiagnosis and treatment of pathological behaviour. Due to the partially\nobservable characteristics of activity propagation, through networks in which\nedges can not be observed, and the dynamic nature of neuronal systems, there is\na need for adaptive, generalisable control. In this paper, we introduce an\nenvironment that procedurally generates neuronal networks with different\ntopologies to investigate this generalisation problem. Additionally, an\nexisting transformer-based architecture is adjusted to evaluate the\ngeneralisation performance of a deep RL agent in the presented partially\nobservable environment. The agent demonstrates the capability to generalise\ncontrol from a limited number of training networks to unseen test networks.\n","authors":["Laurens Engwegen","Daan Brinks","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2407.12789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18597v1","updated":"2024-09-27T09:56:20Z","published":"2024-09-27T09:56:20Z","title":"TemporalPaD: a reinforcement-learning framework for temporal feature\n representation and dimension reduction","summary":" Recent advancements in feature representation and dimension reduction have\nhighlighted their crucial role in enhancing the efficacy of predictive\nmodeling. This work introduces TemporalPaD, a novel end-to-end deep learning\nframework designed for temporal pattern datasets. TemporalPaD integrates\nreinforcement learning (RL) with neural networks to achieve concurrent feature\nrepresentation and feature reduction. The framework consists of three\ncooperative modules: a Policy Module, a Representation Module, and a\nClassification Module, structured based on the Actor-Critic (AC) framework. The\nPolicy Module, responsible for dimensionality reduction through RL, functions\nas the actor, while the Representation Module for feature extraction and the\nClassification Module collectively serve as the critic. We comprehensively\nevaluate TemporalPaD using 29 UCI datasets, a well-known benchmark for\nvalidating feature reduction algorithms, through 10 independent tests and\n10-fold cross-validation. Additionally, given that TemporalPaD is specifically\ndesigned for time series data, we apply it to a real-world DNA classification\nproblem involving enhancer category and enhancer strength. The results\ndemonstrate that TemporalPaD is an efficient and effective framework for\nachieving feature reduction, applicable to both structured data and sequence\ndatasets. The source code of the proposed TemporalPaD is freely available as\nsupplementary material to this article and at\nhttp://www.healthinformaticslab.org/supp/.\n","authors":["Xuechen Mu","Zhenyu Huang","Kewei Li","Haotian Zhang","Xiuli Wang","Yusi Fan","Kai Zhang","Fengfeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.18597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18596v1","updated":"2024-09-27T09:56:02Z","published":"2024-09-27T09:56:02Z","title":"ASAG2024: A Combined Benchmark for Short Answer Grading","summary":" Open-ended questions test a more thorough understanding than closed-ended\nquestions and are often a preferred assessment method. However, open-ended\nquestions are tedious to grade and subject to personal bias. Therefore, there\nhave been efforts to speed up the grading process through automation. Short\nAnswer Grading (SAG) systems aim to automatically score students' answers.\nDespite growth in SAG methods and capabilities, there exists no comprehensive\nshort-answer grading benchmark across different subjects, grading scales, and\ndistributions. Thus, it is hard to assess the capabilities of current automated\ngrading methods in terms of their generalizability. In this preliminary work,\nwe introduce the combined ASAG2024 benchmark to facilitate the comparison of\nautomated grading systems. Combining seven commonly used short-answer grading\ndatasets in a common structure and grading scale. For our benchmark, we\nevaluate a set of recent SAG methods, revealing that while LLM-based approaches\nreach new high scores, they still are far from reaching human performance. This\nopens up avenues for future research on human-machine SAG systems.\n","authors":["Gérôme Meyer","Philip Breuer","Jonathan Fürst"],"pdf_url":"https://arxiv.org/pdf/2409.18596v1.pdf","comment":"Accepted at SIGCSE-Virtual 2024"},{"id":"http://arxiv.org/abs/2409.18594v1","updated":"2024-09-27T09:53:48Z","published":"2024-09-27T09:53:48Z","title":"\"Oh LLM, I'm Asking Thee, Please Give Me a Decision Tree\": Zero-Shot\n Decision Tree Induction and Embedding with Large Language Models","summary":" Large language models (LLMs) provide powerful means to leverage prior\nknowledge for predictive modeling when data is limited. In this work, we\ndemonstrate how LLMs can use their compressed world knowledge to generate\nintrinsically interpretable machine learning models, i.e., decision trees,\nwithout any training data. We find that these zero-shot decision trees can\nsurpass data-driven trees on some small-sized tabular datasets and that\nembeddings derived from these trees perform on par with data-driven tree-based\nembeddings on average. Our knowledge-driven decision tree induction and\nembedding approaches therefore serve as strong new baselines for data-driven\nmachine learning methods in the low-data regime.\n","authors":["Ricardo Knauer","Mario Koddenbrock","Raphael Wallsberger","Nicholas M. Brisson","Georg N. Duda","Deborah Falla","David W. Evans","Erik Rodner"],"pdf_url":"https://arxiv.org/pdf/2409.18594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14842v2","updated":"2024-09-27T09:52:57Z","published":"2024-09-23T09:20:19Z","title":"HW-TSC's Submission to the CCMT 2024 Machine Translation Tasks","summary":" This paper presents the submission of Huawei Translation Services Center\n(HW-TSC) to machine translation tasks of the 20th China Conference on Machine\nTranslation (CCMT 2024). We participate in the bilingual machine translation\ntask and multi-domain machine translation task. For these two translation\ntasks, we use training strategies such as regularized dropout, bidirectional\ntraining, data diversification, forward translation, back translation,\nalternated training, curriculum learning, and transductive ensemble learning to\ntrain neural machine translation (NMT) models based on the deep Transformer-big\narchitecture. Furthermore, to explore whether large language model (LLM) can\nhelp improve the translation quality of NMT systems, we use supervised\nfine-tuning to train llama2-13b as an Automatic post-editing (APE) model to\nimprove the translation results of the NMT model on the multi-domain machine\ntranslation task. By using these plyometric strategies, our submission achieves\na competitive result in the final evaluation.\n","authors":["Zhanglin Wu","Yuanchang Luo","Daimeng Wei","Jiawei Zheng","Bin Wei","Zongyao Li","Hengchao Shang","Jiaxin Guo","Shaojun Li","Weidong Zhang","Ning Xie","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2409.14842v2.pdf","comment":"14 pages, 2 figures, 6 Tables, CCMT2024. arXiv admin note:\n substantial text overlap with arXiv:2409.14800"},{"id":"http://arxiv.org/abs/2409.18586v1","updated":"2024-09-27T09:45:21Z","published":"2024-09-27T09:45:21Z","title":"Analysis of Truncated Singular Value Decomposition for Koopman\n Operator-Based Lane Change Model","summary":" Understanding and modeling complex dynamic systems is crucial for enhancing\nvehicle performance and safety, especially in the context of autonomous\ndriving. Recently, popular methods such as Koopman operators and their\napproximators, known as Extended Dynamic Mode Decomposition (EDMD), have\nemerged for their effectiveness in transforming strongly nonlinear system\nbehavior into linear representations. This allows them to be integrated with\nconventional linear controllers. To achieve this, Singular Value Decomposition\n(SVD), specifically truncated SVD, is employed to approximate Koopman operators\nfrom extensive datasets efficiently. This study evaluates different basis\nfunctions used in EDMD and ranks for truncated SVD for representing lane change\nbehavior models, aiming to balance computational efficiency with information\nloss. The findings, however, suggest that the technique of truncated SVD does\nnot necessarily achieve substantial reductions in computational training time\nand results in significant information loss.\n","authors":["Chinnawut Nantabut"],"pdf_url":"https://arxiv.org/pdf/2409.18586v1.pdf","comment":"Submitted to the 21st International Conference on Informatics in\n Control, Automation and Robotics (ICINCO 2024)"},{"id":"http://arxiv.org/abs/2212.01635v3","updated":"2024-09-27T09:45:07Z","published":"2022-12-03T15:31:34Z","title":"An Empirical Study of AI Techniques in Mobile Applications","summary":" The integration of artificial intelligence (AI) into mobile applications has\nsignificantly transformed various domains, enhancing user experiences and\nproviding personalized services through advanced machine learning (ML) and deep\nlearning (DL) technologies. AI-driven mobile apps typically refer to\napplications that leverage ML/DL technologies to perform key tasks such as\nimage recognition and natural language processing. In this paper, we conducted\nthe most extensive empirical study on AI applications, exploring on-device ML\napps, on-device DL apps, and AI service-supported (cloud-based) apps. Our study\nencompasses 56,682 real-world AI applications, focusing on three crucial\nperspectives: 1) Application analysis, where we analyze the popularity of AI\napps and investigate the update states of AI apps; 2) Framework and model\nanalysis, where we analyze AI framework usage and AI model protection; 3) User\nanalysis, where we examine user privacy protection and user review attitudes.\nOur study has strong implications for AI app developers, users, and AI R\\&D. On\none hand, our findings highlight the growing trend of AI integration in mobile\napplications, demonstrating the widespread adoption of various AI frameworks\nand models. On the other hand, our findings emphasize the need for robust model\nprotection to enhance app security. Additionally, our study highlights the\nimportance of user privacy and presents user attitudes towards the AI\ntechnologies utilized in current AI apps. We provide our AI app dataset\n(currently the most extensive AI app dataset) as an open-source resource for\nfuture research on AI technologies utilized in mobile applications.\n","authors":["Yinghua Li","Xueqi Dang","Haoye Tian","Tiezhu Sun","Zhijie Wang","Lei Ma","Jacques Klein","Tegawendé F. Bissyandé"],"pdf_url":"https://arxiv.org/pdf/2212.01635v3.pdf","comment":"This paper is accepted by the Journal of Systems and Software (JSS)\n 2024"},{"id":"http://arxiv.org/abs/2309.15039v2","updated":"2024-09-27T09:40:34Z","published":"2023-09-26T16:15:54Z","title":"Can-SAVE: Mass Cancer Risk Prediction via Survival Analysis Variables\n and EHR","summary":" Specific medical cancer screening methods are often costly, time-consuming,\nand weakly applicable on a large scale. Advanced Artificial Intelligence (AI)\nmethods greatly help cancer detection but require specific or deep medical\ndata. These aspects prevent the mass implementation of cancer screening\nmethods. For this reason, it is a disruptive change for healthcare to apply AI\nmethods for mass personalized assessment of the cancer risk among patients\nbased on the existing Electronic Health Records (EHR) volume. This paper\npresents a novel Can-SAVE cancer risk assessment method combining a survival\nanalysis approach with a gradient-boosting algorithm. It is highly accessible\nand resource-efficient, utilizing only a sequence of high-level medical events.\nWe tested the proposed method in a long-term retrospective experiment covering\nmore than 1.1 million people and four regions of Russia. The Can-SAVE method\nsignificantly exceeds the baselines by the Average Precision metric of\n22.8%$\\pm$2.7% vs 15.1%$\\pm$2.6%. The extensive ablation study also confirmed\nthe proposed method's dominant performance. The experiment supervised by\noncologists shows a reliable cancer patient detection rate of up to 84 out of\n1000 selected. Such results surpass the medical screening strategies estimates;\nthe typical age-specific Number Needed to Screen is only 9 out of 1000 (for\ncolorectal cancer). Overall, our experiments show a 4.7-6.4 times improvement\nin cancer detection rate (TOP@1k) compared to the traditional healthcare risk\nestimation approach.\n","authors":["Petr Philonenko","Vladimir Kokh","Pavel Blinov"],"pdf_url":"https://arxiv.org/pdf/2309.15039v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18578v1","updated":"2024-09-27T09:28:27Z","published":"2024-09-27T09:28:27Z","title":"An Enhanced Federated Prototype Learning Method under Domain Shift","summary":" Federated Learning (FL) allows collaborative machine learning training\nwithout sharing private data. Numerous studies have shown that one significant\nfactor affecting the performance of federated learning models is the\nheterogeneity of data across different clients, especially when the data is\nsampled from various domains. A recent paper introduces variance-aware\ndual-level prototype clustering and uses a novel $\\alpha$-sparsity prototype\nloss, which increases intra-class similarity and reduces inter-class\nsimilarity. To ensure that the features converge within specific clusters, we\nintroduce an improved algorithm, Federated Prototype Learning with Convergent\nClusters, abbreviated as FedPLCC. To increase inter-class distances, we weight\neach prototype with the size of the cluster it represents. To reduce\nintra-class distances, considering that prototypes with larger distances might\ncome from different domains, we select only a certain proportion of prototypes\nfor the loss function calculation. Evaluations on the Digit-5, Office-10, and\nDomainNet datasets show that our method performs better than existing\napproaches.\n","authors":["Liang Kuang","Kuangpu Guo","Jian Liang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18578v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.02332v2","updated":"2024-09-27T09:21:03Z","published":"2024-04-26T06:22:43Z","title":"Efficient Exploration of Image Classifier Failures with Bayesian\n Optimization and Text-to-Image Models","summary":" Image classifiers should be used with caution in the real world. Performance\nevaluated on a validation set may not reflect performance in the real world. In\nparticular, classifiers may perform well for conditions that are frequently\nencountered during training, but poorly for other infrequent conditions. In\nthis study, we hypothesize that recent advances in text-to-image generative\nmodels make them valuable for benchmarking computer vision models such as image\nclassifiers: they can generate images conditioned by textual prompts that cause\nclassifier failures, allowing failure conditions to be described with textual\nattributes. However, their generation cost becomes an issue when a large number\nof synthetic images need to be generated, which is the case when many different\nattribute combinations need to be tested. We propose an image classifier\nbenchmarking method as an iterative process that alternates image generation,\nclassifier evaluation, and attribute selection. This method efficiently\nexplores the attributes that ultimately lead to poor behavior detection.\n","authors":["Adrien LeCoz","Houssem Ouertatani","Stéphane Herbin","Faouzi Adjed"],"pdf_url":"https://arxiv.org/pdf/2405.02332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18568v1","updated":"2024-09-27T09:11:52Z","published":"2024-09-27T09:11:52Z","title":"Experimental Evaluation of Machine Learning Models for Goal-oriented\n Customer Service Chatbot with Pipeline Architecture","summary":" Integrating machine learning (ML) into customer service chatbots enhances\ntheir ability to understand and respond to user queries, ultimately improving\nservice performance. However, they may appear artificial to some users and\naffecting customer experience. Hence, meticulous evaluation of ML models for\neach pipeline component is crucial for optimizing performance, though\ndifferences in functionalities can lead to unfair comparisons. In this paper,\nwe present a tailored experimental evaluation approach for goal-oriented\ncustomer service chatbots with pipeline architecture, focusing on three key\ncomponents: Natural Language Understanding (NLU), dialogue management (DM), and\nNatural Language Generation (NLG). Our methodology emphasizes individual\nassessment to determine optimal ML models. Specifically, we focus on optimizing\nhyperparameters and evaluating candidate models for NLU (utilizing BERT and\nLSTM), DM (employing DQN and DDQN), and NLG (leveraging GPT-2 and DialoGPT).\nThe results show that for the NLU component, BERT excelled in intent detection\nwhereas LSTM was superior for slot filling. For the DM component, the DDQN\nmodel outperformed DQN by achieving fewer turns, higher rewards, as well as\ngreater success rates. For NLG, the large language model GPT-2 surpassed\nDialoGPT in BLEU, METEOR, and ROUGE metrics. These findings aim to provide a\nbenchmark for future research in developing and optimizing customer service\nchatbots, offering valuable insights into model performance and optimal\nhyperparameters.\n","authors":["Nurul Ain Nabilah Mohd Isa","Siti Nuraishah Agos Jawaddi","Azlan Ismail"],"pdf_url":"https://arxiv.org/pdf/2409.18568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18553v1","updated":"2024-09-27T08:45:55Z","published":"2024-09-27T08:45:55Z","title":"Efficient Noise Mitigation for Enhancing Inference Accuracy in DNNs on\n Mixed-Signal Accelerators","summary":" In this paper, we propose a framework to enhance the robustness of the neural\nmodels by mitigating the effects of process-induced and aging-related\nvariations of analog computing components on the accuracy of the analog neural\nnetworks. We model these variations as the noise affecting the precision of the\nactivations and introduce a denoising block inserted between selected layers of\na pre-trained model. We demonstrate that training the denoising block\nsignificantly increases the model's robustness against various noise levels. To\nminimize the overhead associated with adding these blocks, we present an\nexploration algorithm to identify optimal insertion points for the denoising\nblocks. Additionally, we propose a specialized architecture to efficiently\nexecute the denoising blocks, which can be integrated into mixed-signal\naccelerators. We evaluate the effectiveness of our approach using Deep Neural\nNetwork (DNN) models trained on the ImageNet and CIFAR-10 datasets. The results\nshow that on average, by accepting 2.03% parameter count overhead, the accuracy\ndrop due to the variations reduces from 31.7% to 1.15%.\n","authors":["Seyedarmin Azizi","Mohammad Erfan Sadeghi","Mehdi Kamal","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2409.18553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18548v1","updated":"2024-09-27T08:34:42Z","published":"2024-09-27T08:34:42Z","title":"Research on Predicting Public Opinion Event Heat Levels Based on Large\n Language Models","summary":" In recent years, with the rapid development of large language models, serval\nmodels such as GPT-4o have demonstrated extraordinary capabilities, surpassing\nhuman performance in various language tasks. As a result, many researchers have\nbegun exploring their potential applications in the field of public opinion\nanalysis. This study proposes a novel large-language-models-based method for\npublic opinion event heat level prediction. First, we preprocessed and\nclassified 62,836 Chinese hot event data collected between July 2022 and\nDecember 2023. Then, based on each event's online dissemination heat index, we\nused the MiniBatchKMeans algorithm to automatically cluster the events and\ncategorize them into four heat levels (ranging from low heat to very high\nheat). Next, we randomly selected 250 events from each heat level, totalling\n1,000 events, to build the evaluation dataset. During the evaluation process,\nwe employed various large language models to assess their accuracy in\npredicting event heat levels in two scenarios: without reference cases and with\nsimilar case references. The results showed that GPT-4o and DeepseekV2\nperformed the best in the latter case, achieving prediction accuracies of 41.4%\nand 41.5%, respectively. Although the overall prediction accuracy remains\nrelatively low, it is worth noting that for low-heat (Level 1) events, the\nprediction accuracies of these two models reached 73.6% and 70.4%,\nrespectively. Additionally, the prediction accuracy showed a downward trend\nfrom Level 1 to Level 4, which correlates with the uneven distribution of data\nacross the heat levels in the actual dataset. This suggests that with the more\nrobust dataset, public opinion event heat level prediction based on large\nlanguage models will have significant research potential for the future.\n","authors":["Yi Ren","Tianyi Zhang","Weibin Li","DuoMu Zhou","Chenhao Qin","FangCheng Dong"],"pdf_url":"https://arxiv.org/pdf/2409.18548v1.pdf","comment":"conference"},{"id":"http://arxiv.org/abs/2409.18545v1","updated":"2024-09-27T08:27:36Z","published":"2024-09-27T08:27:36Z","title":"An Epistemic Human-Aware Task Planner which Anticipates Human Beliefs\n and Decisions","summary":" We present a substantial extension of our Human-Aware Task Planning\nframework, tailored for scenarios with intermittent shared execution\nexperiences and significant belief divergence between humans and robots,\nparticularly due to the uncontrollable nature of humans. Our objective is to\nbuild a robot policy that accounts for uncontrollable human behaviors, thus\nenabling the anticipation of possible advancements achieved by the robot when\nthe execution is not shared, e.g. when humans are briefly absent from the\nshared environment to complete a subtask. But, this anticipation is considered\nfrom the perspective of humans who have access to an estimated model for the\nrobot. To this end, we propose a novel planning framework and build a solver\nbased on AND-OR search, which integrates knowledge reasoning, including\nsituation assessment by perspective taking. Our approach dynamically models and\nmanages the expansion and contraction of potential advances while precisely\nkeeping track of when (and when not) agents share the task execution\nexperience. The planner systematically assesses the situation and ignores\nworlds that it has reason to think are impossible for humans. Overall, our new\nsolver can estimate the distinct beliefs of the human and the robot along\npotential courses of action, enabling the synthesis of plans where the robot\nselects the right moment for communication, i.e. informing, or replying to an\ninquiry, or defers ontic actions until the execution experiences can be shared.\nPreliminary experiments in two domains, one novel and one adapted, demonstrate\nthe effectiveness of the framework.\n","authors":["Shashank Shekhar","Anthony Favier","Rachid Alami"],"pdf_url":"https://arxiv.org/pdf/2409.18545v1.pdf","comment":"15 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.18542v1","updated":"2024-09-27T08:21:31Z","published":"2024-09-27T08:21:31Z","title":"MIMII-Gen: Generative Modeling Approach for Simulated Evaluation of\n Anomalous Sound Detection System","summary":" Insufficient recordings and the scarcity of anomalies present significant\nchallenges in developing and validating robust anomaly detection systems for\nmachine sounds. To address these limitations, we propose a novel approach for\ngenerating diverse anomalies in machine sound using a latent diffusion-based\nmodel that integrates an encoder-decoder framework. Our method utilizes the\nFlan-T5 model to encode captions derived from audio file metadata, enabling\nconditional generation through a carefully designed U-Net architecture. This\napproach aids our model in generating audio signals within the EnCodec latent\nspace, ensuring high contextual relevance and quality. We objectively evaluated\nthe quality of our generated sounds using the Fr\\'echet Audio Distance (FAD)\nscore and other metrics, demonstrating that our approach surpasses existing\nmodels in generating reliable machine audio that closely resembles actual\nabnormal conditions. The evaluation of the anomaly detection system using our\ngenerated data revealed a strong correlation, with the area under the curve\n(AUC) score differing by 4.8\\% from the original, validating the effectiveness\nof our generated data. These results demonstrate the potential of our approach\nto enhance the evaluation and robustness of anomaly detection systems across\nvaried and previously unseen conditions. Audio samples can be found at\n\\url{https://hpworkhub.github.io/MIMII-Gen.github.io/}.\n","authors":["Harsh Purohit","Tomoya Nishida","Kota Dohi","Takashi Endo","Yohei Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2409.18542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18541v1","updated":"2024-09-27T08:20:59Z","published":"2024-09-27T08:20:59Z","title":"Align$^2$LLaVA: Cascaded Human and Large Language Model Preference\n Alignment for Multi-modal Instruction Curation","summary":" Recent advances in Multi-modal Large Language Models (MLLMs), such as\nLLaVA-series models, are driven by massive machine-generated\ninstruction-following data tuning. Such automatic instruction collection\npipelines, however, inadvertently introduce significant variability in data\nquality. This paper introduces a novel instruction curation algorithm, derived\nfrom two unique perspectives, human and LLM preference alignment, to compress\nthis vast corpus of machine-generated multimodal instructions to a compact and\nhigh-quality form: (i) For human preference alignment, we have collected a\nmachine-generated multimodal instruction dataset and established a\ncomprehensive set of both subjective and objective criteria to guide the data\nquality assessment critically from human experts. By doing so, a reward model\nwas trained on the annotated dataset to internalize the nuanced human\nunderstanding of instruction alignment. (ii) For LLM preference alignment,\ngiven the instruction selected by the reward model, we propose leveraging the\ninner LLM used in MLLM to align the writing style of visual instructions with\nthat of the inner LLM itself, resulting in LLM-aligned instruction improvement.\nExtensive experiments demonstrate that we can maintain or even improve model\nperformance by compressing synthetic multimodal instructions by up to 90%.\nImpressively, by aggressively reducing the total training sample size from 158k\nto 14k (9$\\times$ smaller), our model consistently outperforms its full-size\ndataset counterpart across various MLLM benchmarks. Our project is available at\nhttps://github.com/DCDmllm/Align2LLaVA.\n","authors":["Hongzhe Huang","Zhewen Yu","Jiang Liu","Li Cai","Dian Jiao","Wenqiao Zhang","Siliang Tang","Juncheng Li","Hao Jiang","Haoyuan Li","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2409.18541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03553v3","updated":"2024-09-27T08:16:28Z","published":"2024-05-06T15:20:30Z","title":"AlphaMath Almost Zero: Process Supervision without Process","summary":" Although recent advancements in large language models (LLMs) have\nsignificantly improved their performance on various tasks, they still face\nchallenges with complex and symbolic multi-step reasoning, particularly in\nmathematical reasoning. To bolster the mathematical reasoning capabilities of\nLLMs, most existing efforts concentrate on seeking assistance from either\ndomain experts or GPT-4 for high-quality process-supervised data, which is not\nonly expensive but also labor-intensive. In our study, we propose an innovative\nframework, AlphaMath, that bypasses the need for process annotations (from\nhumans or GPTs) by leveraging Monte Carlo Tree Search (MCTS). This framework\nfocuses on unleashing the potential of a well-pretrained LLM to autonomously\nenhance its mathematical reasoning. Specifically, we integrate a value model\nwith the LLM, automatically generating both process supervision and step-level\nevaluation signals in MCTS. Furthermore, we propose an efficient inference\nstrategy, step-level beam search, where the value model is crafted to assist\nthe policy model (i.e., LLM) in navigating more effective reasoning paths,\nrather than solely relying on prior probabilities. The experimental results on\nboth in-domain and out-of-domain datasets demonstrate that even without GPT-4\nor human-annotated process supervision, our AlphaMath framework achieves\ncomparable or superior results to previous state-of-the-art methods.\n","authors":["Guoxin Chen","Minpeng Liao","Chengxi Li","Kai Fan"],"pdf_url":"https://arxiv.org/pdf/2405.03553v3.pdf","comment":"Camera ready version for NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.10858v2","updated":"2024-09-27T08:03:07Z","published":"2024-06-16T09:06:17Z","title":"Step-level Value Preference Optimization for Mathematical Reasoning","summary":" Direct Preference Optimization (DPO) using an implicit reward model has\nproven to be an effective alternative to reinforcement learning from human\nfeedback (RLHF) for fine-tuning preference aligned large language models\n(LLMs). However, the overall preference annotations of responses do not fully\ncapture the fine-grained quality of model outputs in complex multi-step\nreasoning tasks, such as mathematical reasoning. To address this limitation, we\nintroduce a novel algorithm called Step-level Value Preference Optimization\n(SVPO). Our approach employs Monte Carlo Tree Search (MCTS) to automatically\nannotate step-level preferences for multi-step reasoning. Furthermore, from the\nperspective of learning-to-rank, we train an explicit value model to replicate\nthe behavior of the implicit reward model, complementing standard preference\noptimization. This value model enables the LLM to generate higher reward\nresponses with minimal cost during inference. Experimental results demonstrate\nthat our method achieves state-of-the-art performance on both in-domain and\nout-of-domain mathematical reasoning benchmarks. Our code is available at\n\\url{https://github.com/MARIO-Math-Reasoning/Super_MARIO}.\n","authors":["Guoxin Chen","Minpeng Liao","Chengxi Li","Kai Fan"],"pdf_url":"https://arxiv.org/pdf/2406.10858v2.pdf","comment":"Camera ready version for EMNLP2024-Findings"},{"id":"http://arxiv.org/abs/2308.05978v3","updated":"2024-09-27T07:47:07Z","published":"2023-08-11T07:25:12Z","title":"CyberForce: A Federated Reinforcement Learning Framework for Malware\n Mitigation","summary":" Recent research has shown that the integration of Reinforcement Learning (RL)\nwith Moving Target Defense (MTD) can enhance cybersecurity in\nInternet-of-Things (IoT) devices. Nevertheless, the practicality of existing\nwork is hindered by data privacy concerns associated with centralized data\nprocessing in RL, and the unsatisfactory time needed to learn right MTD\ntechniques that are effective against a rising number of heterogeneous zero-day\nattacks. Thus, this work presents CyberForce, a framework that combines\nFederated and Reinforcement Learning (FRL) to collaboratively and privately\nlearn suitable MTD techniques for mitigating zero-day attacks. CyberForce\nintegrates device fingerprinting and anomaly detection to reward or penalize\nMTD mechanisms chosen by an FRL-based agent. The framework has been deployed\nand evaluated in a scenario consisting of ten physical devices of a real IoT\nplatform affected by heterogeneous malware samples. A pool of experiments has\ndemonstrated that CyberForce learns the MTD technique mitigating each attack\nfaster than existing RL-based centralized approaches. In addition, when various\ndevices are exposed to different attacks, CyberForce benefits from knowledge\ntransfer, leading to enhanced performance and reduced learning time in\ncomparison to recent works. Finally, different aggregation algorithms used\nduring the agent learning process provide CyberForce with notable robustness to\nmalicious attacks.\n","authors":["Chao Feng","Alberto Huertas Celdran","Pedro Miguel Sanchez Sanchez","Jan Kreischer","Jan von der Assen","Gerome Bovet","Gregorio Martinez Perez","Burkhard Stiller"],"pdf_url":"https://arxiv.org/pdf/2308.05978v3.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.18512v1","updated":"2024-09-27T07:46:52Z","published":"2024-09-27T07:46:52Z","title":"EmoPro: A Prompt Selection Strategy for Emotional Expression in LM-based\n Speech Synthesis","summary":" Recent advancements in speech synthesis models, trained on extensive\ndatasets, have demonstrated remarkable zero-shot capabilities. These models can\ncontrol content, timbre, and emotion in generated speech based on prompt\ninputs. Despite these advancements, the choice of prompts significantly impacts\nthe output quality, yet most existing selection schemes do not adequately\naddress the control of emotional intensity. To address this question, this\npaper proposes a two-stage prompt selection strategy EmoPro, which is\nspecifically designed for emotionally controllable speech synthesis. This\nstrategy focuses on selecting highly expressive and high-quality prompts by\nevaluating them from four perspectives: emotional expression strength, speech\nquality, text-emotion consistency, and model generation performance.\nExperimental results show that prompts selected using the proposed method\nresult in more emotionally expressive and engaging synthesized speech compared\nto those obtained through baseline. Audio samples and codes will be available\nat https://whyrrrrun.github.io/EmoPro/.\n","authors":["Haoyu Wang","Chunyu Qiang","Tianrui Wang","Cheng Gong","Qiuyu Liu","Yu Jiang","Xiaobao Wang","Chenyang Wang","Chen Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18499v1","updated":"2024-09-27T07:32:42Z","published":"2024-09-27T07:32:42Z","title":"Fairness-aware Multiobjective Evolutionary Learning","summary":" Multiobjective evolutionary learning (MOEL) has demonstrated its advantages\nof training fairer machine learning models considering a predefined set of\nconflicting objectives, including accuracy and different fairness measures.\nRecent works propose to construct a representative subset of fairness measures\nas optimisation objectives of MOEL throughout model training. However, the\ndetermination of a representative measure set relies on dataset, prior\nknowledge and requires substantial computational costs. What's more, those\nrepresentative measures may differ across different model training processes.\nInstead of using a static predefined set determined before model training, this\npaper proposes to dynamically and adaptively determine a representative measure\nset online during model training. The dynamically determined representative set\nis then used as optimising objectives of the MOEL framework and can vary with\ntime. Extensive experimental results on 12 well-known benchmark datasets\ndemonstrate that our proposed framework achieves outstanding performance\ncompared to state-of-the-art approaches for mitigating unfairness in terms of\naccuracy as well as 25 fairness measures although only a few of them were\ndynamically selected and used as optimisation objectives. The results indicate\nthe importance of setting optimisation objectives dynamically during training.\n","authors":["Qingquan Zhang","Jialin Liu","Xin Yao"],"pdf_url":"https://arxiv.org/pdf/2409.18499v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.15256v4","updated":"2024-09-27T07:16:23Z","published":"2024-04-23T17:42:45Z","title":"TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and\n Proprioception Estimation","summary":" Legged navigation is typically examined within open-world, off-road, and\nchallenging environments. In these scenarios, estimating external disturbances\nrequires a complex synthesis of multi-modal information. This underlines a\nmajor limitation in existing works that primarily focus on avoiding obstacles.\nIn this work, we propose TOP-Nav, a novel legged navigation framework that\nintegrates a comprehensive path planner with Terrain awareness, Obstacle\navoidance and close-loop Proprioception. TOP-Nav underscores the synergies\nbetween vision and proprioception in both path and motion planning. Within the\npath planner, we present and integrate a terrain estimator that enables the\nrobot to select waypoints on terrains with higher traversability while\neffectively avoiding obstacles. In the motion planning level, we not only\nimplement a locomotion controller to track the navigation commands, but also\nconstruct a proprioception advisor to provide motion evaluations for the path\nplanner. Based on the close-loop motion feedback, we make online corrections\nfor the vision-based terrain and obstacle estimations. Consequently, TOP-Nav\nachieves open-world navigation that the robot can handle terrains or\ndisturbances beyond the distribution of prior knowledge and overcomes\nconstraints imposed by visual conditions. Building upon extensive experiments\nconducted in both simulation and real-world environments, TOP-Nav demonstrates\nsuperior performance in open-world navigation compared to existing methods.\n","authors":["Junli Ren","Yikai Liu","Yingru Dai","Junfeng Long","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.15256v4.pdf","comment":"Published on CoRL 2024"},{"id":"http://arxiv.org/abs/2408.07841v2","updated":"2024-09-27T07:02:12Z","published":"2024-08-14T22:43:52Z","title":"SustainDC -- Benchmarking for Sustainable Data Center Control","summary":" Machine learning has driven an exponential increase in computational demand,\nleading to massive data centers that consume significant amounts of energy and\ncontribute to climate change. This makes sustainable data center control a\npriority. In this paper, we introduce SustainDC, a set of Python environments\nfor benchmarking multi-agent reinforcement learning (MARL) algorithms for data\ncenters (DC). SustainDC supports custom DC configurations and tasks such as\nworkload scheduling, cooling optimization, and auxiliary battery management,\nwith multiple agents managing these operations while accounting for the effects\nof each other. We evaluate various MARL algorithms on SustainDC, showing their\nperformance across diverse DC designs, locations, weather conditions, grid\ncarbon intensity, and workload requirements. Our results highlight significant\nopportunities for improvement of data center operations using MARL algorithms.\nGiven the increasing use of DC due to AI, SustainDC provides a crucial platform\nfor the development and benchmarking of advanced algorithms essential for\nachieving sustainable computing and addressing other heterogeneous real-world\nchallenges.\n","authors":["Avisek Naug","Antonio Guillen","Ricardo Luna","Vineet Gundecha","Desik Rengarajan","Sahand Ghorbanpour","Sajad Mousavi","Ashwin Ramesh Babu","Dejan Markovikj","Lekhapriya D Kashyap","Soumyendu Sarkar"],"pdf_url":"https://arxiv.org/pdf/2408.07841v2.pdf","comment":"Under review at Advances in Neural Information Processing Systems\n 2024 (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2408.10566v4","updated":"2024-09-27T06:32:01Z","published":"2024-08-20T06:05:52Z","title":"Overcoming Growth-Induced Forgetting in Task-Agnostic Continual Learning","summary":" In continual learning (CL), model growth enhances adaptability over new data,\nimproving knowledge retention for more tasks. However, improper model growth\ncan lead to severe degradation of previously learned knowledge, an issue we\nname as growth-induced forgetting (GIFt), especially in task-agnostic CL using\nentire grown model for inference. Existing works, despite adopting model growth\nand random initialization for better adaptability, often fail to recognize the\npresence of GIFt caused by improper model growth. This oversight limits\ncomprehensive control of forgetting and hinders full utilization of model\ngrowth. We are the first in CL to identify this issue and conduct an in-depth\nstudy on root cause of GIFt, where layer expansion stands out among model\ngrowth strategies, widening layers without affecting model functionality. Yet,\ndirect adoption of layer expansion presents challenges. It lacks data-driven\ncontrol and initialization of expanded parameters to balance adaptability and\nknowledge retention. This paper presents a novel SparseGrow approach to\novercome the issue of GIFt while enhancing adaptability over new data.\nSparseGrow employs data-driven sparse layer expansion to control efficient\nparameter usage during growth, reducing GIFt from excessive growth and\nfunctionality changes. It also combines sparse growth with on-data\ninitialization at training late-stage to create partially 0-valued expansions\nthat fit learned distribution, enhancing retention and adaptability. To further\nminimize forgetting, freezing is applied by calculating the sparse mask,\nallowing data-driven preservation of important parameters. Through experiments\nacross datasets with various settings, cases, and task numbers, we demonstrate\nthe necessity of layer expansion and showcase the effectiveness of SparseGrow\nin overcoming GIFt, highlighting its adaptability and knowledge retention for\nincremental tasks.\n","authors":["Yuqing Zhao","Divya Saxena","Jiannong Cao","Xiaoyun Liu","Changlin Song"],"pdf_url":"https://arxiv.org/pdf/2408.10566v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18475v1","updated":"2024-09-27T06:31:03Z","published":"2024-09-27T06:31:03Z","title":"Data Analysis in the Era of Generative AI","summary":" This paper explores the potential of AI-powered tools to reshape data\nanalysis, focusing on design considerations and challenges. We explore how the\nemergence of large language and multimodal models offers new opportunities to\nenhance various stages of data analysis workflow by translating high-level user\nintentions into executable code, charts, and insights. We then examine\nhuman-centered design principles that facilitate intuitive interactions, build\nuser trust, and streamline the AI-assisted analysis workflow across multiple\napps. Finally, we discuss the research challenges that impede the development\nof these AI-based systems such as enhancing model capabilities, evaluating and\nbenchmarking, and understanding end-user needs.\n","authors":["Jeevana Priya Inala","Chenglong Wang","Steven Drucker","Gonzalo Ramos","Victor Dibia","Nathalie Riche","Dave Brown","Dan Marshall","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2409.18475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12842v3","updated":"2024-09-27T06:25:33Z","published":"2024-02-20T09:10:08Z","title":"PromptKD: Distilling Student-Friendly Knowledge for Generative Language\n Models via Prompt Tuning","summary":" Recent advancements in large language models (LLMs) have raised concerns\nabout inference costs, increasing the need for research into model compression.\nWhile knowledge distillation (KD) is a prominent method for this, research on\nKD for generative language models like LLMs is relatively sparse, and the\napproach of distilling student-friendly knowledge, which has shown promising\nperformance in KD for classification models, remains unexplored in generative\nlanguage models. To explore this approach, we propose PromptKD, a simple yet\neffective method that utilizes prompt tuning - for the first time in KD - to\nenable generative language models to transfer student-friendly knowledge.\nUnlike previous works in classification that require fine-tuning the entire\nteacher model for extracting student-friendly knowledge, PromptKD achieves\nsimilar effects by adding a small number of prompt tokens and tuning only the\nprompt with student guidance. Extensive experiments on instruction-following\ndatasets show that PromptKD achieves state-of-the-art performance while adding\nonly 0.0007% of the teacher's parameters as prompts. Further analysis suggests\nthat distilling student-friendly knowledge alleviates exposure bias effectively\nthroughout the entire training process, leading to performance enhancements.\n","authors":["Gyeongman Kim","Doohyuk Jang","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2402.12842v3.pdf","comment":"EMNLP 2024 Findings. Our project page: https://promptkd.github.io"},{"id":"http://arxiv.org/abs/2409.18461v1","updated":"2024-09-27T05:49:48Z","published":"2024-09-27T05:49:48Z","title":"Towards Diverse Device Heterogeneous Federated Learning via Task\n Arithmetic Knowledge Integration","summary":" Federated Learning has emerged as a promising paradigm for collaborative\nmachine learning, while preserving user data privacy. Despite its potential,\nstandard FL lacks support for diverse heterogeneous device prototypes, which\nvary significantly in model and dataset sizes -- from small IoT devices to\nlarge workstations. This limitation is only partially addressed by existing\nknowledge distillation techniques, which often fail to transfer knowledge\neffectively across a broad spectrum of device prototypes with varied\ncapabilities. This failure primarily stems from two issues: the dilution of\ninformative logits from more capable devices by those from less capable ones,\nand the use of a single integrated logits as the distillation target across all\ndevices, which neglects their individual learning capacities and and the unique\ncontributions of each. To address these challenges, we introduce TAKFL, a novel\nKD-based framework that treats the knowledge transfer from each device\nprototype's ensemble as a separate task, independently distilling each to\npreserve its unique contributions and avoid dilution. TAKFL also incorporates a\nKD-based self-regularization technique to mitigate the issues related to the\nnoisy and unsupervised ensemble distillation process. To integrate the\nseparately distilled knowledge, we introduce an adaptive task arithmetic\nknowledge integration process, allowing each student model to customize the\nknowledge integration for optimal performance. Additionally, we present\ntheoretical results demonstrating the effectiveness of task arithmetic in\ntransferring knowledge across heterogeneous devices with varying capacities.\nComprehensive evaluations of our method across both CV and NLP tasks\ndemonstrate that TAKFL achieves SOTA results in a variety of datasets and\nsettings, significantly outperforming existing KD-based methods. Code is\nreleased at https://github.com/MMorafah/TAKFL\n","authors":["Mahdi Morafah","Vyacheslav Kungurtsev","Hojin Chang","Chen Chen","Bill Lin"],"pdf_url":"https://arxiv.org/pdf/2409.18461v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18455v1","updated":"2024-09-27T05:31:04Z","published":"2024-09-27T05:31:04Z","title":"Review of Digital Asset Development with Graph Neural Network Unlearning","summary":" In the rapidly evolving landscape of digital assets, the imperative for\nrobust data privacy and compliance with regulatory frameworks has intensified.\nThis paper investigates the critical role of Graph Neural Networks (GNNs) in\nthe management of digital assets and introduces innovative unlearning\ntechniques specifically tailored to GNN architectures. We categorize unlearning\nstrategies into two primary classes: data-driven approximation, which\nmanipulates the graph structure to isolate and remove the influence of specific\nnodes, and model-driven approximation, which modifies the internal parameters\nand architecture of the GNN itself. By examining recent advancements in these\nunlearning methodologies, we highlight their applicability in various use\ncases, including fraud detection, risk assessment, token relationship\nprediction, and decentralized governance. We discuss the challenges inherent in\nbalancing model performance with the requirements for data unlearning,\nparticularly in the context of real-time financial applications. Furthermore,\nwe propose a hybrid approach that combines the strengths of both unlearning\nstrategies to enhance the efficiency and effectiveness of GNNs in digital asset\necosystems. Ultimately, this paper aims to provide a comprehensive framework\nfor understanding and implementing GNN unlearning techniques, paving the way\nfor secure and compliant deployment of machine learning in the digital asset\ndomain.\n","authors":["Zara Lisbon"],"pdf_url":"https://arxiv.org/pdf/2409.18455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18454v1","updated":"2024-09-27T05:29:31Z","published":"2024-09-27T05:29:31Z","title":"Leveraging Long-Context Large Language Models for Multi-Document\n Understanding and Summarization in Enterprise Applications","summary":" The rapid increase in unstructured data across various fields has made\nmulti-document comprehension and summarization a critical task. Traditional\napproaches often fail to capture relevant context, maintain logical\nconsistency, and extract essential information from lengthy documents. This\npaper explores the use of Long-context Large Language Models (LLMs) for\nmulti-document summarization, demonstrating their exceptional capacity to grasp\nextensive connections, provide cohesive summaries, and adapt to various\nindustry domains and integration with enterprise applications/systems. The\npaper discusses the workflow of multi-document summarization for effectively\ndeploying long-context LLMs, supported by case studies in legal applications,\nenterprise functions such as HR, finance, and sourcing, as well as in the\nmedical and news domains. These case studies show notable enhancements in both\nefficiency and accuracy. Technical obstacles, such as dataset diversity, model\nscalability, and ethical considerations like bias mitigation and factual\naccuracy, are carefully analyzed. Prospective research avenues are suggested to\naugment the functionalities and applications of long-context LLMs, establishing\nthem as pivotal tools for transforming information processing across diverse\nsectors and enterprise applications.\n","authors":["Aditi Godbole","Jabin Geevarghese George","Smita Shandilya"],"pdf_url":"https://arxiv.org/pdf/2409.18454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09858v2","updated":"2024-09-27T04:50:00Z","published":"2024-09-15T20:41:18Z","title":"A Survey of Out-of-distribution Generalization for Graph Machine\n Learning from a Causal View","summary":" Graph machine learning (GML) has been successfully applied across a wide\nrange of tasks. Nonetheless, GML faces significant challenges in generalizing\nover out-of-distribution (OOD) data, which raises concerns about its wider\napplicability. Recent advancements have underscored the crucial role of\ncausality-driven approaches in overcoming these generalization challenges.\nDistinct from traditional GML methods that primarily rely on statistical\ndependencies, causality-focused strategies delve into the underlying causal\nmechanisms of data generation and model prediction, thus significantly\nimproving the generalization of GML across different environments. This paper\noffers a thorough review of recent progress in causality-involved GML\ngeneralization. We elucidate the fundamental concepts of employing causality to\nenhance graph model generalization and categorize the various approaches,\nproviding detailed descriptions of their methodologies and the connections\namong them. Furthermore, we explore the incorporation of causality in other\nrelated important areas of trustworthy GML, such as explanation, fairness, and\nrobustness. Concluding with a discussion on potential future research\ndirections, this review seeks to articulate the continuing development and\nfuture potential of causality in enhancing the trustworthiness of graph machine\nlearning.\n","authors":["Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2409.09858v2.pdf","comment":"15 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.18444v1","updated":"2024-09-27T04:45:06Z","published":"2024-09-27T04:45:06Z","title":"Cost-Aware Dynamic Cloud Workflow Scheduling using Self-Attention and\n Evolutionary Reinforcement Learning","summary":" The Cost-aware Dynamic Multi-Workflow Scheduling (CDMWS) in the cloud is a\nkind of cloud workflow management problem, which aims to assign virtual machine\n(VM) instances to execute tasks in workflows so as to minimize the total costs,\nincluding both the penalties for violating Service Level Agreement (SLA) and\nthe VM rental fees. Powered by deep neural networks, Reinforcement Learning\n(RL) methods can construct effective scheduling policies for solving CDMWS\nproblems. Traditional policy networks in RL often use basic feedforward\narchitectures to separately determine the suitability of assigning any VM\ninstances, without considering all VMs simultaneously to learn their global\ninformation. This paper proposes a novel self-attention policy network for\ncloud workflow scheduling (SPN-CWS) that captures global information from all\nVMs. We also develop an Evolution Strategy-based RL (ERL) system to train\nSPN-CWS reliably and effectively. The trained SPN-CWS can effectively process\nall candidate VM instances simultaneously to identify the most suitable VM\ninstance to execute every workflow task. Comprehensive experiments show that\nour method can noticeably outperform several state-of-the-art algorithms on\nmultiple benchmark CDMWS problems.\n","authors":["Ya Shen","Gang Chen","Hui Ma","Mengjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18444v1.pdf","comment":"This paper has been accepted by ICSOC (International Conference on\n Service-Oriented Computing) 2024"},{"id":"http://arxiv.org/abs/2409.18439v1","updated":"2024-09-27T04:28:19Z","published":"2024-09-27T04:28:19Z","title":"State-free Reinforcement Learning","summary":" In this work, we study the \\textit{state-free RL} problem, where the\nalgorithm does not have the states information before interacting with the\nenvironment. Specifically, denote the reachable state set by ${S}^\\Pi := \\{\ns|\\max_{\\pi\\in \\Pi}q^{P, \\pi}(s)>0 \\}$, we design an algorithm which requires\nno information on the state space $S$ while having a regret that is completely\nindependent of ${S}$ and only depend on ${S}^\\Pi$. We view this as a concrete\nfirst step towards \\textit{parameter-free RL}, with the goal of designing RL\nalgorithms that require no hyper-parameter tuning.\n","authors":["Mingyu Chen","Aldo Pacchiano","Xuezhou Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18438v1","updated":"2024-09-27T04:21:02Z","published":"2024-09-27T04:21:02Z","title":"Physics Augmented Tuple Transformer for Autism Severity Level Detection","summary":" Early diagnosis of Autism Spectrum Disorder (ASD) is an effective and\nfavorable step towards enhancing the health and well-being of children with\nASD. Manual ASD diagnosis testing is labor-intensive, complex, and prone to\nhuman error due to several factors contaminating the results. This paper\nproposes a novel framework that exploits the laws of physics for ASD severity\nrecognition. The proposed physics-informed neural network architecture encodes\nthe behaviour of the subject extracted by observing a part of the\nskeleton-based motion trajectory in a higher dimensional latent space. Two\ndecoders, namely physics-based and non-physics-based decoder, use this latent\nembedding and predict the future motion patterns. The physics branch leverages\nthe laws of physics that apply to a skeleton sequence in the prediction process\nwhile the non-physics-based branch is optimised to minimise the difference\nbetween the predicted and actual motion of the subject. A classifier also\nleverages the same latent space embeddings to recognise the ASD severity. This\ndual generative objective explicitly forces the network to compare the actual\nbehaviour of the subject with the general normal behaviour of children that are\ngoverned by the laws of physics, aiding the ASD recognition task. The proposed\nmethod attains state-of-the-art performance on multiple ASD diagnosis\nbenchmarks. To illustrate the utility of the proposed framework beyond the task\nASD diagnosis, we conduct a third experiment using a publicly available\nbenchmark for the task of fall prediction and demonstrate the superiority of\nour model.\n","authors":["Chinthaka Ranasingha","Harshala Gammulle","Tharindu Fernando","Sridha Sridharan","Clinton Fookes"],"pdf_url":"https://arxiv.org/pdf/2409.18438v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2408.10571v3","updated":"2024-09-27T04:04:24Z","published":"2024-08-20T06:17:56Z","title":"Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models","summary":" Diffusion models have revolutionized customized text-to-image generation,\nallowing for efficient synthesis of photos from personal data with textual\ndescriptions. However, these advancements bring forth risks including privacy\nbreaches and unauthorized replication of artworks. Previous researches\nprimarily center around using prompt-specific methods to generate adversarial\nexamples to protect personal images, yet the effectiveness of existing methods\nis hindered by constrained adaptability to different prompts. In this paper, we\nintroduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for\ncustomized diffusion models. PAP first models the prompt distribution using a\nLaplace Approximation, and then produces prompt-agnostic perturbations by\nmaximizing a disturbance expectation based on the modeled distribution. This\napproach effectively tackles the prompt-agnostic attacks, leading to improved\ndefense stability. Extensive experiments in face privacy and artistic style\nprotection, demonstrate the superior generalization of PAP in comparison to\nexisting techniques. Our project page is available at\nhttps://github.com/vancyland/Prompt-Agnostic-Adversarial-Perturbation-for-Customized-Diffusion-Models.github.io.\n","authors":["Cong Wan","Yuhang He","Xiang Song","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10571v3.pdf","comment":"Accepted by NIPS 2024"},{"id":"http://arxiv.org/abs/2409.18435v1","updated":"2024-09-27T03:57:54Z","published":"2024-09-27T03:57:54Z","title":"Multi-agent Reinforcement Learning for Dynamic Dispatching in Material\n Handling Systems","summary":" This paper proposes a multi-agent reinforcement learning (MARL) approach to\nlearn dynamic dispatching strategies, which is crucial for optimizing\nthroughput in material handling systems across diverse industries. To benchmark\nour method, we developed a material handling environment that reflects the\ncomplexities of an actual system, such as various activities at different\nlocations, physical constraints, and inherent uncertainties. To enhance\nexploration during learning, we propose a method to integrate domain knowledge\nin the form of existing dynamic dispatching heuristics. Our experimental\nresults show that our method can outperform heuristics by up to 7.4 percent in\nterms of median throughput. Additionally, we analyze the effect of different\narchitectures on MARL performance when training multiple agents with different\nfunctions. We also demonstrate that the MARL agents performance can be further\nimproved by using the first iteration of MARL agents as heuristics to train a\nsecond iteration of MARL agents. This work demonstrates the potential of\napplying MARL to learn effective dynamic dispatching strategies that may be\ndeployed in real-world systems to improve business outcomes.\n","authors":["Xian Yeow Lee","Haiyan Wang","Daisuke Katsumata","Takaharu Matsui","Chetan Gupta"],"pdf_url":"https://arxiv.org/pdf/2409.18435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18433v1","updated":"2024-09-27T03:49:56Z","published":"2024-09-27T03:49:56Z","title":"Easy2Hard-Bench: Standardized Difficulty Labels for Profiling LLM\n Performance and Generalization","summary":" While generalization over tasks from easy to hard is crucial to profile\nlanguage models (LLMs), the datasets with fine-grained difficulty annotations\nfor each problem across a broad range of complexity are still blank. Aiming to\naddress this limitation, we present Easy2Hard-Bench, a consistently formatted\ncollection of 6 benchmark datasets spanning various domains, such as\nmathematics and programming problems, chess puzzles, and reasoning questions.\nEach problem within these datasets is annotated with numerical difficulty\nscores. To systematically estimate problem difficulties, we collect abundant\nperformance data on attempts to each problem by humans in the real world or\nLLMs on the prominent leaderboard. Leveraging the rich performance data, we\napply well-established difficulty ranking systems, such as Item Response Theory\n(IRT) and Glicko-2 models, to uniformly assign numerical difficulty scores to\nproblems. Moreover, datasets in Easy2Hard-Bench distinguish themselves from\nprevious collections by a higher proportion of challenging problems. Through\nextensive experiments with six state-of-the-art LLMs, we provide a\ncomprehensive analysis of their performance and generalization capabilities\nacross varying levels of difficulty, with the aim of inspiring future research\nin LLM generalization. The datasets are available at\nhttps://huggingface.co/datasets/furonghuang-lab/Easy2Hard-Bench.\n","authors":["Mucong Ding","Chenghao Deng","Jocelyn Choo","Zichu Wu","Aakriti Agrawal","Avi Schwarzschild","Tianyi Zhou","Tom Goldstein","John Langford","Anima Anandkumar","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2409.18433v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2407.04737v2","updated":"2024-09-27T03:22:07Z","published":"2024-07-02T06:12:55Z","title":"Hierarchical Decoupling Capacitor Optimization for Power Distribution\n Network of 2.5D ICs with Co-Analysis of Frequency and Time Domains Based on\n Deep Reinforcement Learning","summary":" With the growing need for higher memory bandwidth and computation density,\n2.5D design, which involves integrating multiple chiplets onto an interposer,\nemerges as a promising solution. However, this integration introduces\nsignificant challenges due to increasing data rates and a large number of I/Os,\nnecessitating advanced optimization of the power distribution networks (PDNs)\nboth on-chip and on-interposer to mitigate the small signal noise and\nsimultaneous switching noise (SSN). Traditional PDN optimization strategies in\n2.5D systems primarily focus on reducing impedance by integrating decoupling\ncapacitors (decaps) to lessen small signal noises. Unfortunately, relying\nsolely on frequency-domain analysis has been proven inadequate for addressing\ncoupled SSN, as indicated by our experimental results. In this work, we\nintroduce a novel two-phase optimization flow using deep reinforcement learning\nto tackle both the on-chip small signal noise and SSN. Initially, we optimize\nthe impedance in the frequency domain to maintain the small signal noise within\nacceptable limits while avoiding over-design. Subsequently, in the time domain,\nwe refine the PDN to minimize the voltage violation integral (VVI), a more\naccurate measure of SSN severity. To the best of our knowledge, this is the\nfirst dual-domain optimization strategy that simultaneously addresses both the\nsmall signal noise and SSN propagation through strategic decap placement in\non-chip and on-interposer PDNs, offering a significant step forward in the\ndesign of robust PDNs for 2.5D integrated systems.\n","authors":["Yuanyuan Duan","Haiyang Feng","Zhiping Yu","Hanming Wu","Leilai Shao","Xiaolei Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.04737v2.pdf","comment":"The data needs to be experimentally revalidated, and the experimental\n details require further optimization"},{"id":"http://arxiv.org/abs/2409.18418v1","updated":"2024-09-27T03:17:01Z","published":"2024-09-27T03:17:01Z","title":"A3: Active Adversarial Alignment for Source-Free Domain Adaptation","summary":" Unsupervised domain adaptation (UDA) aims to transfer knowledge from a\nlabeled source domain to an unlabeled target domain. Recent works have focused\non source-free UDA, where only target data is available. This is challenging as\nmodels rely on noisy pseudo-labels and struggle with distribution shifts. We\npropose Active Adversarial Alignment (A3), a novel framework combining\nself-supervised learning, adversarial training, and active learning for robust\nsource-free UDA. A3 actively samples informative and diverse data using an\nacquisition function for training. It adapts models via adversarial losses and\nconsistency regularization, aligning distributions without source data access.\nA3 advances source-free UDA through its synergistic integration of active and\nadversarial learning for effective domain alignment and noise reduction.\n","authors":["Chrisantus Eze","Christopher Crick"],"pdf_url":"https://arxiv.org/pdf/2409.18418v1.pdf","comment":"Accepted at ICMLA 2024"},{"id":"http://arxiv.org/abs/2409.18417v1","updated":"2024-09-27T03:15:07Z","published":"2024-09-27T03:15:07Z","title":"VickreyFeedback: Cost-efficient Data Construction for Reinforcement\n Learning from Human Feedback","summary":" This paper addresses the cost-efficiency aspect of Reinforcement Learning\nfrom Human Feedback (RLHF). RLHF leverages datasets of human preferences over\noutputs of large language models (LLM) to instill human expectations into LLMs.\nWhile preference annotation comes with a monetized cost, the economic utility\nof a preference dataset has not been considered by far. What exacerbates this\nsituation is that given complex intransitive or cyclic relationships in\npreference datasets, existing algorithms for fine-tuning LLMs are still far\nfrom capturing comprehensive preferences. This raises severe cost-efficiency\nconcerns in production environments, where preference data accumulate over\ntime. In this paper, we see the fine-tuning of LLMs as a monetized economy and\nintroduce an auction mechanism to improve the efficiency of the preference data\ncollection in dollar terms. We show that introducing an auction mechanism can\nplay an essential role in enhancing the cost-efficiency of RLHF while\nmaintaining satisfactory model performance. Experimental results demonstrate\nthat our proposed auction-based protocol is cost-efficient for fine-tuning LLMs\nby concentrating on high-quality feedback.\n","authors":["Guoxi Zhang","Jiuding Duan"],"pdf_url":"https://arxiv.org/pdf/2409.18417v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18412v1","updated":"2024-09-27T03:00:29Z","published":"2024-09-27T03:00:29Z","title":"SciDFM: A Large Language Model with Mixture-of-Experts for Science","summary":" Recently, there has been a significant upsurge of interest in leveraging\nlarge language models (LLMs) to assist scientific discovery. However, most LLMs\nonly focus on general science, while they lack domain-specific knowledge, such\nas chemical molecules and amino acid sequences. To bridge these gaps, we\nintroduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and\nis able to conduct college-level scientific reasoning and understand molecules\nand amino acid sequences. We collect a large-scale training corpus containing\nnumerous scientific papers and books from different disciplines as well as data\nfrom domain-specific databases. We further fine-tune the pre-trained model on\nlots of instruction data to improve performances on downstream benchmarks. From\nexperiment results, we show that SciDFM achieves strong performance on general\nscientific benchmarks such as SciEval and SciQ, and it reaches a SOTA\nperformance on domain-specific benchmarks among models of similar size. We\nfurther analyze the expert layers and show that the results of expert selection\nvary with data from different disciplines. To benefit the broader research\ncommunity, we open-source SciDFM at\nhttps://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0.\n","authors":["Liangtai Sun","Danyu Luo","Da Ma","Zihan Zhao","Baocai Chen","Zhennan Shen","Su Zhu","Lu Chen","Xin Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18412v1.pdf","comment":"12 pages, 1 figure, 9 tables. Technical Report, Under Review"},{"id":"http://arxiv.org/abs/2409.18411v1","updated":"2024-09-27T02:58:46Z","published":"2024-09-27T02:58:46Z","title":"BoT-Drive: Hierarchical Behavior and Trajectory Planning for Autonomous\n Driving using POMDPs","summary":" Uncertainties in dynamic road environments pose significant challenges for\nbehavior and trajectory planning in autonomous driving. This paper introduces\nBoT-Drive, a planning algorithm that addresses uncertainties at both behavior\nand trajectory levels within a Partially Observable Markov Decision Process\n(POMDP) framework. BoT-Drive employs driver models to characterize unknown\nbehavioral intentions and utilizes their model parameters to infer hidden\ndriving styles. By also treating driver models as decision-making actions for\nthe autonomous vehicle, BoT-Drive effectively tackles the exponential\ncomplexity inherent in POMDPs. To enhance safety and robustness, the planner\nfurther applies importance sampling to refine the driving trajectory\nconditioned on the planned high-level behavior. Evaluation on real-world data\nshows that BoT-Drive consistently outperforms both existing planning methods\nand learning-based methods in regular and complex urban driving scenes,\ndemonstrating significant improvements in driving safety and reliability.\n","authors":["Xuanjin Jin","Chendong Zeng","Shengfa Zhu","Chunxiao Liu","Panpan Cai"],"pdf_url":"https://arxiv.org/pdf/2409.18411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00234v5","updated":"2024-09-27T02:55:06Z","published":"2022-12-31T15:57:09Z","title":"A Survey on In-context Learning","summary":" With the increasing capabilities of large language models (LLMs), in-context\nlearning (ICL) has emerged as a new paradigm for natural language processing\n(NLP), where LLMs make predictions based on contexts augmented with a few\nexamples. It has been a significant trend to explore ICL to evaluate and\nextrapolate the ability of LLMs. In this paper, we aim to survey and summarize\nthe progress and challenges of ICL. We first present a formal definition of ICL\nand clarify its correlation to related studies. Then, we organize and discuss\nadvanced techniques, including training strategies, prompt designing\nstrategies, and related analysis. Additionally, we explore various ICL\napplication scenarios, such as data engineering and knowledge updating.\nFinally, we address the challenges of ICL and suggest potential directions for\nfurther research. We hope that our work can encourage more research on\nuncovering how ICL works and improving ICL.\n","authors":["Qingxiu Dong","Lei Li","Damai Dai","Ce Zheng","Jingyuan Ma","Rui Li","Heming Xia","Jingjing Xu","Zhiyong Wu","Tianyu Liu","Baobao Chang","Xu Sun","Lei Li","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2301.00234v5.pdf","comment":"Update"},{"id":"http://arxiv.org/abs/2409.18401v1","updated":"2024-09-27T02:32:42Z","published":"2024-09-27T02:32:42Z","title":"GenesisTex2: Stable, Consistent and High-Quality Text-to-Texture\n Generation","summary":" Large-scale text-guided image diffusion models have shown astonishing results\nin text-to-image (T2I) generation. However, applying these models to synthesize\ntextures for 3D geometries remains challenging due to the domain gap between 2D\nimages and textures on a 3D surface. Early works that used a\nprojecting-and-inpainting approach managed to preserve generation diversity but\noften resulted in noticeable artifacts and style inconsistencies. While recent\nmethods have attempted to address these inconsistencies, they often introduce\nother issues, such as blurring, over-saturation, or over-smoothing. To overcome\nthese challenges, we propose a novel text-to-texture synthesis framework that\nleverages pretrained diffusion models. We first introduce a local attention\nreweighing mechanism in the self-attention layers to guide the model in\nconcentrating on spatial-correlated patches across different views, thereby\nenhancing local details while preserving cross-view consistency. Additionally,\nwe propose a novel latent space merge pipeline, which further ensures\nconsistency across different viewpoints without sacrificing too much diversity.\nOur method significantly outperforms existing state-of-the-art techniques\nregarding texture consistency and visual quality, while delivering results much\nfaster than distillation-based methods. Importantly, our framework does not\nrequire additional training or fine-tuning, making it highly adaptable to a\nwide range of models available on public platforms.\n","authors":["Jiawei Lu","Yingpeng Zhang","Zengjun Zhao","He Wang","Kun Zhou","Tianjia Shao"],"pdf_url":"https://arxiv.org/pdf/2409.18401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18399v1","updated":"2024-09-27T02:29:02Z","published":"2024-09-27T02:29:02Z","title":"Multimodal Trajectory Prediction for Autonomous Driving on Unstructured\n Roads using Deep Convolutional Network","summary":" Recently, the application of autonomous driving in open-pit mining has\ngarnered increasing attention for achieving safe and efficient mineral\ntransportation. Compared to urban structured roads, unstructured roads in\nmining sites have uneven boundaries and lack clearly defined lane markings.\nThis leads to a lack of sufficient constraint information for predicting the\ntrajectories of other human-driven vehicles, resulting in higher uncertainty in\ntrajectory prediction problems. A method is proposed to predict multiple\npossible trajectories and their probabilities of the target vehicle. The\nsurrounding environment and historical trajectories of the target vehicle are\nencoded as a rasterized image, which is used as input to our deep convolutional\nnetwork to predict the target vehicle's multiple possible trajectories. The\nmethod underwent offline testing on a dataset specifically designed for\nautonomous driving scenarios in open-pit mining and was compared and evaluated\nagainst physics-based method. The open-source code and data are available at\nhttps://github.com/LLsxyc/mine_motion_prediction.git\n","authors":["Lei Li","Zhifa Chen","Jian Wang","Bin Zhou","Guizhen Yu","Xiaoxuan Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18399v1.pdf","comment":"11 pages,6 figures"},{"id":"http://arxiv.org/abs/2409.18395v1","updated":"2024-09-27T02:25:29Z","published":"2024-09-27T02:25:29Z","title":"Code Vulnerability Repair with Large Language Model using Context-Aware\n Prompt Tuning","summary":" Large Language Models (LLMs) have shown significant challenges in detecting\nand repairing vulnerable code, particularly when dealing with vulnerabilities\ninvolving multiple aspects, such as variables, code flows, and code structures.\nIn this study, we utilize GitHub Copilot as the LLM and focus on buffer\noverflow vulnerabilities. Our experiments reveal a notable gap in Copilot's\nabilities when dealing with buffer overflow vulnerabilities, with a 76%\nvulnerability detection rate but only a 15% vulnerability repair rate. To\naddress this issue, we propose context-aware prompt tuning techniques designed\nto enhance LLM performance in repairing buffer overflow. By injecting a\nsequence of domain knowledge about the vulnerability, including various\nsecurity and code contexts, we demonstrate that Copilot's successful repair\nrate increases to 63%, representing more than four times the improvement\ncompared to repairs without domain knowledge.\n","authors":["Arshiya Khan","Guannan Liu","Xing Gao"],"pdf_url":"https://arxiv.org/pdf/2409.18395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03958v2","updated":"2024-09-27T02:13:42Z","published":"2024-05-07T02:45:28Z","title":"Simple Drop-in LoRA Conditioning on Attention Layers Will Improve Your\n Diffusion Model","summary":" Current state-of-the-art diffusion models employ U-Net architectures\ncontaining convolutional and (qkv) self-attention layers. The U-Net processes\nimages while being conditioned on the time embedding input for each sampling\nstep and the class or caption embedding input corresponding to the desired\nconditional generation. Such conditioning involves scale-and-shift operations\nto the convolutional layers but does not directly affect the attention layers.\nWhile these standard architectural choices are certainly effective, not\nconditioning the attention layers feels arbitrary and potentially suboptimal.\nIn this work, we show that simply adding LoRA conditioning to the attention\nlayers without changing or tuning the other parts of the U-Net architecture\nimproves the image generation quality. For example, a drop-in addition of LoRA\nconditioning to EDM diffusion model yields FID scores of 1.91/1.75 for\nunconditional and class-conditional CIFAR-10 generation, improving upon the\nbaseline of 1.97/1.79.\n","authors":["Joo Young Choi","Jaesung R. Park","Inkyu Park","Jaewoong Cho","Albert No","Ernest K. Ryu"],"pdf_url":"https://arxiv.org/pdf/2405.03958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18390v1","updated":"2024-09-27T02:12:56Z","published":"2024-09-27T02:12:56Z","title":"Speech to Reality: On-Demand Production using Natural Language, 3D\n Generative AI, and Discrete Robotic Assembly","summary":" We present a system that transforms speech into physical objects by combining\n3D generative Artificial Intelligence with robotic assembly. The system\nleverages natural language input to make design and manufacturing more\naccessible, enabling individuals without expertise in 3D modeling or robotic\nprogramming to create physical objects. We propose utilizing discrete robotic\nassembly of lattice-based voxel components to address the challenges of using\ngenerative AI outputs in physical production, such as design variability,\nfabrication speed, structural integrity, and material waste. The system\ninterprets speech to generate 3D objects, discretizes them into voxel\ncomponents, computes an optimized assembly sequence, and generates a robotic\ntoolpath. The results are demonstrated through the assembly of various objects,\nranging from chairs to shelves, which are prompted via speech and realized\nwithin 5 minutes using a 6-axis robotic arm.\n","authors":["Alexander Htet Kyaw","Se Hwan Jeon","Miana Smith","Neil Gershenfeld"],"pdf_url":"https://arxiv.org/pdf/2409.18390v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible. An updated version will replace this version"},{"id":"http://arxiv.org/abs/2409.18385v1","updated":"2024-09-27T02:01:05Z","published":"2024-09-27T02:01:05Z","title":"Robo-CSK-Organizer: Commonsense Knowledge to Organize Detected Objects\n for Multipurpose Robots","summary":" This paper presents a system called Robo-CSK-Organizer that infuses\ncommonsense knowledge from a classical knowledge based to enhance the context\nrecognition capabilities of robots so as to facilitate the organization of\ndetected objects by classifying them in a task-relevant manner. It is\nparticularly useful in multipurpose robotics. Unlike systems relying solely on\ndeep learning tools such as ChatGPT, the Robo-CSK-Organizer system stands out\nin multiple avenues as follows. It resolves ambiguities well, and maintains\nconsistency in object placement. Moreover, it adapts to diverse task-based\nclassifications. Furthermore, it contributes to explainable AI, hence helping\nto improve trust and human-robot collaboration. Controlled experiments\nperformed in our work, simulating domestic robotics settings, make\nRobo-CSK-Organizer demonstrate superior performance while placing objects in\ncontextually relevant locations. This work highlights the capacity of an\nAI-based system to conduct commonsense-guided decision-making in robotics\ncloser to the thresholds of human cognition. Hence, Robo-CSK-Organizer makes\npositive impacts on AI and robotics.\n","authors":["Rafael Hidalgo","Jesse Parron","Aparna S. Varde","Weitian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10727v4","updated":"2024-09-27T01:40:17Z","published":"2023-04-21T03:45:59Z","title":"RoCOCO: Robustness Benchmark of MS-COCO to Stress-test Image-Text\n Matching Models","summary":" With the extensive use of vision-language models in various downstream tasks,\nevaluating their robustness is crucial. In this paper, we propose a benchmark\nfor assessing the robustness of vision-language models. We believe that a\nrobust model should properly understand both linguistic and visual semantics\nand be resilient to explicit variations. In pursuit of this goal, we create new\nvariants of texts and images in the MS-COCO test set and re-evaluate the\nstate-of-the-art (SOTA) models with the new data. Specifically, we alter the\nmeaning of text by replacing a word, and generate visually altered images that\nmaintain some visual context while introducing noticeable pixel changes through\nimage mixing techniques.Our evaluations on the proposed benchmark reveal\nsubstantial performance degradation in many SOTA models (e.g., Image-to-Text\nRecall@1: 81.9\\% $\\rightarrow$ 48.4\\% in BLIP, 66.1\\% $\\rightarrow$ 37.6\\% in\nVSE$\\infty$), with the models often favoring the altered texts/images over the\noriginal ones. This indicates the current vision-language models struggle with\nsubtle changes and often fail to understand the overall context of texts and\nimages. Based on these findings, we propose semantic contrastive loss and\nvisual contrastive loss to learn more robust embedding. Datasets and code are\navailable at {\\url{https://github.com/pseulki/rococo}}.\n","authors":["Seulki Park","Daeho Um","Hajung Yoon","Sanghyuk Chun","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2304.10727v4.pdf","comment":"Accepted to ECCV Synthetic Data for Computer Vision Workshop (Oral)"},{"id":"http://arxiv.org/abs/2409.18374v1","updated":"2024-09-27T01:25:22Z","published":"2024-09-27T01:25:22Z","title":"Adaptive Learning of the Latent Space of Wasserstein Generative\n Adversarial Networks","summary":" Generative models based on latent variables, such as generative adversarial\nnetworks (GANs) and variational auto-encoders (VAEs), have gained lots of\ninterests due to their impressive performance in many fields. However, many\ndata such as natural images usually do not populate the ambient Euclidean space\nbut instead reside in a lower-dimensional manifold. Thus an inappropriate\nchoice of the latent dimension fails to uncover the structure of the data,\npossibly resulting in mismatch of latent representations and poor generative\nqualities. Towards addressing these problems, we propose a novel framework\ncalled the latent Wasserstein GAN (LWGAN) that fuses the Wasserstein\nauto-encoder and the Wasserstein GAN so that the intrinsic dimension of the\ndata manifold can be adaptively learned by a modified informative latent\ndistribution. We prove that there exist an encoder network and a generator\nnetwork in such a way that the intrinsic dimension of the learned encoding\ndistribution is equal to the dimension of the data manifold. We theoretically\nestablish that our estimated intrinsic dimension is a consistent estimate of\nthe true dimension of the data manifold. Meanwhile, we provide an upper bound\non the generalization error of LWGAN, implying that we force the synthetic data\ndistribution to be similar to the real data distribution from a population\nperspective. Comprehensive empirical experiments verify our framework and show\nthat LWGAN is able to identify the correct intrinsic dimension under several\nscenarios, and simultaneously generate high-quality synthetic data by sampling\nfrom the learned latent distribution.\n","authors":["Yixuan Qiu","Qingyi Gao","Xiao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18364v1","updated":"2024-09-27T00:49:08Z","published":"2024-09-27T00:49:08Z","title":"Multi-hypotheses Conditioned Point Cloud Diffusion for 3D Human\n Reconstruction from Occluded Images","summary":" 3D human shape reconstruction under severe occlusion due to human-object or\nhuman-human interaction is a challenging problem. Parametric models i.e.,\nSMPL(-X), which are based on the statistics across human shapes, can represent\nwhole human body shapes but are limited to minimally-clothed human shapes.\nImplicit-function-based methods extract features from the parametric models to\nemploy prior knowledge of human bodies and can capture geometric details such\nas clothing and hair. However, they often struggle to handle misaligned\nparametric models and inpaint occluded regions given a single RGB image. In\nthis work, we propose a novel pipeline, MHCDIFF, Multi-hypotheses Conditioned\nPoint Cloud Diffusion, composed of point cloud diffusion conditioned on\nprobabilistic distributions for pixel-aligned detailed 3D human reconstruction\nunder occlusion. Compared to previous implicit-function-based methods, the\npoint cloud diffusion model can capture the global consistent features to\ngenerate the occluded regions, and the denoising process corrects the\nmisaligned SMPL meshes. The core of MHCDIFF is extracting local features from\nmultiple hypothesized SMPL(-X) meshes and aggregating the set of features to\ncondition the diffusion model. In the experiments on CAPE and MultiHuman\ndatasets, the proposed method outperforms various SOTA methods based on SMPL,\nimplicit functions, point cloud diffusion, and their combined, under synthetic\nand real occlusions.\n","authors":["Donghwan Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18364v1.pdf","comment":"17 pages, 7 figures, accepted NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18351v1","updated":"2024-09-27T00:05:01Z","published":"2024-09-27T00:05:01Z","title":"Tracking Software Security Topics","summary":" Software security incidents occur everyday and thousands of software security\nreports are announced each month. Thus, it is difficult for software security\nresearchers, engineers, and other stakeholders to follow software security\ntopics of their interests in real-time. In this paper, we propose, SOSK, a\nnovel tool for this problem. SOSK allows a user to import a collection of\nsoftware security reports. It pre-processes and extracts the most important\nkeywords from the textual description of the reports. Based on the similarity\nof embedding vectors of keywords, SOSK can expand and/or refine a keyword set\nfrom a much smaller set of user-provided keywords. Thus, SOSK allows users to\ndefine any topic of their interests and retrieve security reports relevant to\nthat topic effectively. Our preliminary evaluation shows that SOSK can expand\nkeywords and retrieve reports relevant to user requests.\n","authors":["Phong Minh Vu","Tung Thanh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.18351v1.pdf","comment":null}],"Computation and Language":[{"id":"http://arxiv.org/abs/2409.17827v2","updated":"2024-09-27T16:07:54Z","published":"2024-09-26T13:26:46Z","title":"BeanCounter: A low-toxicity, large-scale, and open dataset of\n business-oriented text","summary":" Many of the recent breakthroughs in language modeling have resulted from\nscaling effectively the same model architecture to larger datasets. In this\nvein, recent work has highlighted performance gains from increasing training\ndataset size and quality, suggesting a need for novel sources of large-scale\ndatasets. In this work, we introduce BeanCounter, a public dataset consisting\nof more than 159B tokens extracted from businesses' disclosures. We show that\nthis data is indeed novel: less than 0.1% of BeanCounter appears in Common\nCrawl-based datasets and it is an order of magnitude larger than datasets\nrelying on similar sources. Given the data's provenance, we hypothesize that\nBeanCounter is comparatively more factual and less toxic than web-based\ndatasets. Exploring this hypothesis, we find that many demographic identities\noccur with similar prevalence in BeanCounter but with significantly less toxic\ncontext relative to other datasets. To demonstrate the utility of BeanCounter,\nwe evaluate and compare two LLMs continually pre-trained on BeanCounter with\ntheir base models. We find an 18-33% reduction in toxic generation and improved\nperformance within the finance domain for the continually pretrained models.\nCollectively, our work suggests that BeanCounter is a novel source of\nlow-toxicity and high-quality domain-specific data with sufficient scale to\ntrain multi-billion parameter LLMs.\n","authors":["Siyan Wang","Bradford Levy"],"pdf_url":"https://arxiv.org/pdf/2409.17827v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17745v2","updated":"2024-09-27T08:19:29Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v2.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2403.01432v4","updated":"2024-09-27T13:47:33Z","published":"2024-03-03T08:07:55Z","title":"Fine Tuning vs. Retrieval Augmented Generation for Less Popular\n Knowledge","summary":" Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting\nstrong performance across diverse tasks and domains. However, it has been\nobserved that the performance diminishes when dealing with less-popular or\nlow-frequency concepts and entities, for example in domain specific\napplications. The two prominent approaches to enhance the performance of LMs on\nlow-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning\n(FT) over synthetic data. This paper explores and evaluates the impact of RAG\nand FT on customizing LMs in handling low-frequency entities on question\nanswering tasks. We conduct extensive experiments on twelve LMs of varying size\nand type and different fine tuning, data augmentation, and retrieval models.\nOur findings indicate that while FT boosts the performance across entities of\nvarying popularity, RAG surpasses FT by a large margin particularly for least\npopular factual knowledge. Additionally, the success of both RAG and FT\napproaches is amplified by improving retrieval and data augmentation\ntechniques. Fine tuning, while beneficial for small LMs, requires extensive\nresources. To address this issue, we propose the new Stimulus RAG approach that\nsurpasses the effectiveness of fine tuning based approaches, thereby\neliminating the need for the costly data augmentation and fine tuning step for\nenriching LMs with less popular factual knowledge.\n","authors":["Heydar Soudani","Evangelos Kanoulas","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2403.01432v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06802v3","updated":"2024-09-27T06:13:06Z","published":"2024-05-10T20:29:25Z","title":"Summarizing Radiology Reports Findings into Impressions","summary":" Patient hand-off and triage are two fundamental problems in health care.\nOften doctors must painstakingly summarize complex findings to efficiently\ncommunicate with specialists and quickly make decisions on which patients have\nthe most urgent cases. In pursuit of these challenges, we present (1) a model\nwith state-of-art radiology report summarization performance using (2) a novel\nmethod for augmenting medical data, and (3) an analysis of the model\nlimitations and radiology knowledge gain. We also provide a data processing\npipeline for future models developed on the the MIMIC CXR dataset. Our best\nperforming model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100\nROUGE-L F1, which outperformed specialized checkpoints with more sophisticated\nattention mechanisms. We investigate these aspects in this work.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v3.pdf","comment":"This version reverts to the original preprint, following the advice\n from the Artificial Intelligence in Health editorial office. The published\n version is peer-reviewed and available in the journal (see external DOI). The\n preprint remains unchanged to maintain version transparency, as noted in the\n further disclosure section of the published article"},{"id":"http://arxiv.org/abs/2409.17545v2","updated":"2024-09-27T06:48:08Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keep the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v2.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17391v2","updated":"2024-09-27T02:18:22Z","published":"2024-09-25T22:08:31Z","title":"Scaling Behavior for Large Language Models regarding Numeral Systems: An\n Example using Pythia","summary":" Though Large Language Models (LLMs) have shown remarkable abilities in\nmathematics reasoning, they are still struggling with performing numeric\noperations accurately, such as addition and multiplication. Numbers can be\ntokenized into tokens in various ways by different LLMs and affect the numeric\noperations performance. Currently, there are two representatives: 1) Tokenize\ninto $1$-digit, and 2) Tokenize into $1\\sim 3$ digit. The difference is roughly\nequivalent to using different numeral systems (namely base $10$ or base\n$10^{3}$). In light of this, we study the scaling behavior of different numeral\nsystems in the context of transformer-based large language models. We\nempirically show that a base $10$ system is consistently more data-efficient\nthan a base $10^{2}$ or $10^{3}$ system across training data scale, model sizes\nunder from-scratch training settings, while different number systems have very\nsimilar fine-tuning performances. We attribute this to higher token frequencies\nof a base $10$ system. Additionally, we reveal extrapolation behavior patterns\non addition and multiplication. We identify that base $100$ and base $1000$\nsystems struggle on token-level discernment and token-level operations. We also\nsheds light on the mechanism learnt by the models.\n","authors":["Zhejian Zhou","Jiayu Wang","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17391v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17353v2","updated":"2024-09-27T01:42:54Z","published":"2024-09-25T20:59:12Z","title":"Internalizing ASR with Implicit Chain of Thought for Efficient\n Speech-to-Speech Conversational LLM","summary":" Current speech-based LLMs are predominantly trained on extensive ASR and TTS\ndatasets, excelling in tasks related to these domains. However, their ability\nto handle direct speech-to-speech conversations remains notably constrained.\nThese models often rely on an ASR-to-TTS chain-of-thought pipeline, converting\nspeech into text for processing before generating audio responses, which\nintroduces latency and loses audio features. We propose a method that\nimplicitly internalizes ASR chain of thought into a speech LLM, enhancing its\nnative speech understanding capabilities. Our approach reduces latency and\nimproves the model's native understanding of speech, paving the way for more\nefficient and natural real-time audio interactions. We also release a\nlarge-scale synthetic conversational dataset to facilitate further research.\n","authors":["Robin Shing-Hei Yuen","Timothy Tin-Long Tse","Jian Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.17353v2.pdf","comment":"Corrected style from final to preprint"},{"id":"http://arxiv.org/abs/2409.17213v2","updated":"2024-09-27T12:12:44Z","published":"2024-09-25T17:38:39Z","title":"Plurals: A System for Guiding LLMs Via Simulated Social Ensembles","summary":" Recent debates raised concerns that language models may favor certain\nviewpoints. But what if the solution is not to aim for a 'view from nowhere'\nbut rather to leverage different viewpoints? We introduce Plurals, a system and\nPython library for pluralistic AI deliberation. Plurals consists of Agents\n(LLMs, optionally with personas) which deliberate within customizable\nStructures, with Moderators overseeing deliberation. Plurals is a generator of\nsimulated social ensembles. Plurals integrates with government datasets to\ncreate nationally representative personas, includes deliberation templates\ninspired by democratic deliberation theory, and allows users to customize both\ninformation-sharing structures and deliberation behavior within Structures. Six\ncase studies demonstrate fidelity to theoretical constructs and efficacy. Three\nrandomized experiments show simulated focus groups produced output resonant\nwith an online sample of the relevant audiences (chosen over zero-shot\ngeneration in 75% of trials). Plurals is both a paradigm and a concrete system\nfor pluralistic AI. The Plurals library is available at\nhttps://github.com/josh-ashkinaze/plurals and will be continually updated.\n","authors":["Joshua Ashkinaze","Emily Fry","Narendra Edara","Eric Gilbert","Ceren Budak"],"pdf_url":"https://arxiv.org/pdf/2409.17213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16937v2","updated":"2024-09-27T11:16:35Z","published":"2024-09-25T13:51:19Z","title":"Semi-Supervised Cognitive State Classification from Speech with\n Multi-View Pseudo-Labeling","summary":" The lack of labeled data is a common challenge in speech classification\ntasks, particularly those requiring extensive subjective assessment, such as\ncognitive state classification. In this work, we propose a Semi-Supervised\nLearning (SSL) framework, introducing a novel multi-view pseudo-labeling method\nthat leverages both acoustic and linguistic characteristics to select the most\nconfident data for training the classification model. Acoustically, unlabeled\ndata are compared to labeled data using the Frechet audio distance, calculated\nfrom embeddings generated by multiple audio encoders. Linguistically, large\nlanguage models are prompted to revise automatic speech recognition\ntranscriptions and predict labels based on our proposed task-specific\nknowledge. High-confidence data are identified when pseudo-labels from both\nsources align, while mismatches are treated as low-confidence data. A bimodal\nclassifier is then trained to iteratively label the low-confidence data until a\npredefined criterion is met. We evaluate our SSL framework on emotion\nrecognition and dementia detection tasks. Experimental results demonstrate that\nour method achieves competitive performance compared to fully supervised\nlearning using only 30% of the labeled data and significantly outperforms two\nselected baselines.\n","authors":["Yuanchao Li","Zixing Zhang","Jing Han","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2409.16937v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18957v1","updated":"2024-09-27T17:58:50Z","published":"2024-09-27T17:58:50Z","title":"LML: Language Model Learning a Dataset for Data-Augmented Prediction","summary":" This paper introduces a new approach to using Large Language Models (LLMs)\nfor classification tasks, which are typically handled using Machine Learning\n(ML) models. Unlike ML models that rely heavily on data cleaning and feature\nengineering, this method streamlines the process using LLMs. This paper\nproposes a new concept called \"Language Model Learning (LML)\" powered by a new\nmethod called \"Data-Augmented Prediction (DAP)\". The classification is\nperformed by LLMs using a method similar to humans manually exploring and\nunderstanding the data and deciding classifications using data as a reference.\nTraining data is summarized and evaluated to determine the features that lead\nto the classification of each label the most. In the process of DAP, the system\nuses the data summary to automatically create a query, which is used to\nretrieve relevant rows from the dataset. A classification is generated by the\nLLM using data summary and relevant rows, ensuring satisfactory accuracy even\nwith complex data. Usage of data summary and similar data in DAP ensures\ncontext-aware decision-making. The proposed method uses the words \"Act as an\nExplainable Machine Learning Model\" in the prompt to enhance the\ninterpretability of the predictions by allowing users to review the logic\nbehind each prediction. In some test cases, the system scored an accuracy above\n90%, proving the effectiveness of the system and its potential to outperform\nconventional ML models in various scenarios. The code is available at\nhttps://github.com/Pro-GenAI/LML-DAP\n","authors":["Praneeth Vadlapati"],"pdf_url":"https://arxiv.org/pdf/2409.18957v1.pdf","comment":"First version"},{"id":"http://arxiv.org/abs/2409.18943v1","updated":"2024-09-27T17:44:58Z","published":"2024-09-27T17:44:58Z","title":"Ruler: A Model-Agnostic Method to Control Generated Length for Large\n Language Models","summary":" The instruction-following ability of large language models enables humans to\ninteract with AI agents in a natural way. However, when required to generate\nresponses of a specific length, large language models often struggle to meet\nusers' needs due to their inherent difficulty in accurately perceiving\nnumerical constraints. To explore the ability of large language models to\ncontrol the length of generated responses, we propose the Target Length\nGeneration Task (TLG) and design two metrics, Precise Match (PM) and Flexible\nMatch (FM) to evaluate the model's performance in adhering to specified\nresponse lengths. Furthermore, we introduce a novel, model-agnostic approach\ncalled Ruler, which employs Meta Length Tokens (MLTs) to enhance the\ninstruction-following ability of large language models under length-constrained\ninstructions. Specifically, Ruler equips LLMs with the ability to generate\nresponses of a specified length based on length constraints within the\ninstructions. Moreover, Ruler can automatically generate appropriate MLT when\nlength constraints are not explicitly provided, demonstrating excellent\nversatility and generalization. Comprehensive experiments show the\neffectiveness of Ruler across different LLMs on Target Length Generation Task,\ne.g., at All Level 27.97 average gain on PM, 29.57 average gain on FM. In\naddition, we conduct extensive ablation experiments to further substantiate the\nefficacy and generalization of Ruler. Our code and data is available at\nhttps://github.com/Geaming2002/Ruler.\n","authors":["Jiaming Li","Lei Zhang","Yunshui Li","Ziqiang Liu","yuelin bai","Run Luo","Longze Chen","Min Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18924v1","updated":"2024-09-27T17:17:15Z","published":"2024-09-27T17:17:15Z","title":"AIPatient: Simulating Patients with EHRs and LLM Powered Agentic\n Workflow","summary":" Simulated patient systems play a crucial role in modern medical education and\nresearch, providing safe, integrative learning environments and enabling\nclinical decision-making simulations. Large Language Models (LLM) could advance\nsimulated patient systems by replicating medical conditions and patient-doctor\ninteractions with high fidelity and low cost. However, ensuring the\neffectiveness and trustworthiness of these systems remains a challenge, as they\nrequire a large, diverse, and precise patient knowledgebase, along with a\nrobust and stable knowledge diffusion to users. Here, we developed AIPatient,\nan advanced simulated patient system with AIPatient Knowledge Graph (AIPatient\nKG) as the input and the Reasoning Retrieval-Augmented Generation (Reasoning\nRAG) agentic workflow as the generation backbone. AIPatient KG samples data\nfrom Electronic Health Records (EHRs) in the Medical Information Mart for\nIntensive Care (MIMIC)-III database, producing a clinically diverse and\nrelevant cohort of 1,495 patients with high knowledgebase validity (F1 0.89).\nReasoning RAG leverages six LLM powered agents spanning tasks including\nretrieval, KG query generation, abstraction, checker, rewrite, and\nsummarization. This agentic framework reaches an overall accuracy of 94.15% in\nEHR-based medical Question Answering (QA), outperforming benchmarks that use\neither no agent or only partial agent integration. Our system also presents\nhigh readability (median Flesch Reading Ease 77.23; median Flesch Kincaid Grade\n5.6), robustness (ANOVA F-value 0.6126, p<0.1), and stability (ANOVA F-value\n0.782, p<0.1). The promising performance of the AIPatient system highlights its\npotential to support a wide range of applications, including medical education,\nmodel evaluation, and system integration.\n","authors":["Huizi Yu","Jiayan Zhou","Lingyao Li","Shan Chen","Jack Gallifant","Anye Shi","Xiang Li","Wenyue Hua","Mingyu Jin","Guang Chen","Yang Zhou","Zhao Li","Trisha Gupte","Ming-Li Chen","Zahra Azizi","Yongfeng Zhang","Themistocles L. Assimes","Xin Ma","Danielle S. Bitterman","Lin Lu","Lizhou Fan"],"pdf_url":"https://arxiv.org/pdf/2409.18924v1.pdf","comment":"42 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.18911v1","updated":"2024-09-27T16:54:36Z","published":"2024-09-27T16:54:36Z","title":"Soft Measures for Extracting Causal Collective Intelligence","summary":" Understanding and modeling collective intelligence is essential for\naddressing complex social systems. Directed graphs called fuzzy cognitive maps\n(FCMs) offer a powerful tool for encoding causal mental models, but extracting\nhigh-integrity FCMs from text is challenging. This study presents an approach\nusing large language models (LLMs) to automate FCM extraction. We introduce\nnovel graph-based similarity measures and evaluate them by correlating their\noutputs with human judgments through the Elo rating system. Results show\npositive correlations with human evaluations, but even the best-performing\nmeasure exhibits limitations in capturing FCM nuances. Fine-tuning LLMs\nimproves performance, but existing measures still fall short. This study\nhighlights the need for soft similarity measures tailored to FCM extraction,\nadvancing collective intelligence modeling with NLP.\n","authors":["Maryam Berijanian","Spencer Dork","Kuldeep Singh","Michael Riley Millikan","Ashlin Riggs","Aadarsh Swaminathan","Sarah L. Gibbs","Scott E. Friedman","Nathan Brugnone"],"pdf_url":"https://arxiv.org/pdf/2409.18911v1.pdf","comment":"Camera-ready version accepted for publication in the EMNLP 2024\n Workshop NLP4Science"},{"id":"http://arxiv.org/abs/2409.18892v1","updated":"2024-09-27T16:29:12Z","published":"2024-09-27T16:29:12Z","title":"IDGen: Item Discrimination Induced Prompt Generation for LLM Evaluation","summary":" As Large Language Models (LLMs) grow increasingly adept at managing complex\ntasks, the evaluation set must keep pace with these advancements to ensure it\nremains sufficiently discriminative. Item Discrimination (ID) theory, which is\nwidely used in educational assessment, measures the ability of individual test\nitems to differentiate between high and low performers. Inspired by this\ntheory, we propose an ID-induced prompt synthesis framework for evaluating LLMs\nto ensure the evaluation set can continually update and refine according to\nmodel abilities. Our data synthesis framework prioritizes both breadth and\nspecificity. It can generate prompts that comprehensively evaluate the\ncapabilities of LLMs while revealing meaningful performance differences between\nmodels, allowing for effective discrimination of their relative strengths and\nweaknesses across various tasks and domains. To produce high-quality data, we\nincorporate a self-correct mechanism into our generalization framework, and\ndevelop two models to predict prompt discrimination and difficulty score to\nfacilitate our data synthesis framework, contributing valuable tools to\nevaluation data synthesis research. We apply our generated data to evaluate\nfive SOTA models. Our data achieves an average score of 51.92, accompanied by a\nvariance of 10.06. By contrast, previous works (i.e., SELF-INSTRUCT and\nWizardLM) obtain an average score exceeding 67, with a variance below 3.2. The\nresults demonstrate that the data generated by our framework is more\nchallenging and discriminative compared to previous works. We will release a\ndataset of over 3,000 carefully crafted prompts to facilitate evaluation\nresearch of LLMs.\n","authors":["Fan Lin","Shuyi Xie","Yong Dai","Wenlin Yao","Tianjiao Lang","Zishan Xu","Zhichao Hu","Xiao Xiao","Yuhong Liu","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18892v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.15657v3","updated":"2024-09-27T16:24:50Z","published":"2024-09-24T01:40:24Z","title":"M$^2$PT: Multimodal Prompt Tuning for Zero-shot Instruction Learning","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable performance\nacross a wide range of domains, with increasing emphasis on enhancing their\nzero-shot generalization capabilities for unseen tasks across various\nmodalities. Instruction tuning has emerged as an effective strategy for\nachieving zero-shot generalization by finetuning pretrained models on diverse\nmultimodal tasks. As the scale of MLLMs continues to grow, parameter-efficient\nfinetuning becomes increasingly critical. However, most existing\nparameter-efficient approaches focus only on single modalities and often\noverlook the multimodal characteristics during finetuning. In this work, we\nintroduce a novel Multimodal Prompt Tuning (M$^2$PT) approach for efficient\ninstruction tuning of MLLMs. M$^2$PT effectively integrates visual and textual\nprompts into the vision encoder and language processor respectively during\nfinetuning, facilitating the extraction and alignment of features across\nmodalities. Empirical results on various multimodal evaluation datasets\ndemonstrate the superior performance of our approach compared to several\nstate-of-the-art baselines. A comprehensive set of ablation studies validates\nthe effectiveness of our prompt design and the efficiency of our approach.\n","authors":["Taowen Wang","Yiyang Liu","James Chenhao Liang","junhan zhao","Yiming Cui","Yuning Mao","Shaoliang Nie","Jiahao Liu","Fuli Feng","Zenglin Xu","Cheng Han","Lifu Huang","Qifan Wang","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2409.15657v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18878v1","updated":"2024-09-27T16:13:38Z","published":"2024-09-27T16:13:38Z","title":"Suicide Phenotyping from Clinical Notes in Safety-Net Psychiatric\n Hospital Using Multi-Label Classification with Pre-Trained Language Models","summary":" Accurate identification and categorization of suicidal events can yield\nbetter suicide precautions, reducing operational burden, and improving care\nquality in high-acuity psychiatric settings. Pre-trained language models offer\npromise for identifying suicidality from unstructured clinical narratives. We\nevaluated the performance of four BERT-based models using two fine-tuning\nstrategies (multiple single-label and single multi-label) for detecting\ncoexisting suicidal events from 500 annotated psychiatric evaluation notes. The\nnotes were labeled for suicidal ideation (SI), suicide attempts (SA), exposure\nto suicide (ES), and non-suicidal self-injury (NSSI). RoBERTa outperformed\nother models using binary relevance (acc=0.86, F1=0.78). MentalBERT (F1=0.74)\nalso exceeded BioClinicalBERT (F1=0.72). RoBERTa fine-tuned with a single\nmulti-label classifier further improved performance (acc=0.88, F1=0.81),\nhighlighting that models pre-trained on domain-relevant data and the single\nmulti-label classification strategy enhance efficiency and performance.\n Keywords: EHR-based Phynotyping; Natural Language Processing; Secondary Use\nof EHR Data; Suicide Classification; BERT-based Model; Psychiatry; Mental\nHealth\n","authors":["Zehan Li","Yan Hu","Scott Lane","Salih Selek","Lokesh Shahani","Rodrigo Machado-Vieira","Jair Soares","Hua Xu","Hongfang Liu","Ming Huang"],"pdf_url":"https://arxiv.org/pdf/2409.18878v1.pdf","comment":"submitted to AMIA Informatics Summit 2025 as a conference paper"},{"id":"http://arxiv.org/abs/2409.03291v2","updated":"2024-09-27T16:04:40Z","published":"2024-09-05T06:55:13Z","title":"LLM Detectors Still Fall Short of Real World: Case of LLM-Generated\n Short News-Like Posts","summary":" With the emergence of widely available powerful LLMs, disinformation\ngenerated by large Language Models (LLMs) has become a major concern.\nHistorically, LLM detectors have been touted as a solution, but their\neffectiveness in the real world is still to be proven. In this paper, we focus\non an important setting in information operations -- short news-like posts\ngenerated by moderately sophisticated attackers.\n We demonstrate that existing LLM detectors, whether zero-shot or\npurpose-trained, are not ready for real-world use in that setting. All tested\nzero-shot detectors perform inconsistently with prior benchmarks and are highly\nvulnerable to sampling temperature increase, a trivial attack absent from\nrecent benchmarks. A purpose-trained detector generalizing across LLMs and\nunseen attacks can be developed, but it fails to generalize to new\nhuman-written texts.\n We argue that the former indicates domain-specific benchmarking is needed,\nwhile the latter suggests a trade-off between the adversarial evasion\nresilience and overfitting to the reference human text, with both needing\nevaluation in benchmarks and currently absent. We believe this suggests a\nre-consideration of current LLM detector benchmarking approaches and provides a\ndynamically extensible benchmark to allow it\n(https://github.com/Reliable-Information-Lab-HEVS/benchmark_llm_texts_detection).\n","authors":["Henrique Da Silva Gameiro","Andrei Kucharavy","Ljiljana Dolamic"],"pdf_url":"https://arxiv.org/pdf/2409.03291v2.pdf","comment":"20 pages, 7 tables, 13 figures, under consideration for EMNLP"},{"id":"http://arxiv.org/abs/2409.18868v1","updated":"2024-09-27T16:04:06Z","published":"2024-09-27T16:04:06Z","title":"Individuation in Neural Models with and without Visual Grounding","summary":" We show differences between a language-and-vision model CLIP and two\ntext-only models - FastText and SBERT - when it comes to the encoding of\nindividuation information. We study latent representations that CLIP provides\nfor substrates, granular aggregates, and various numbers of objects. We\ndemonstrate that CLIP embeddings capture quantitative differences in\nindividuation better than models trained on text-only data. Moreover, the\nindividuation hierarchy we deduce from the CLIP embeddings agrees with the\nhierarchies proposed in linguistics and cognitive science.\n","authors":["Alexey Tikhonov","Lisa Bylinina","Ivan P. Yamshchikov"],"pdf_url":"https://arxiv.org/pdf/2409.18868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17730v2","updated":"2024-09-27T15:27:58Z","published":"2024-04-26T23:30:07Z","title":"Bridging the Social & Technical Divide in Augmentative and Alternative\n Communication (AAC) Applications for Autistic Adults","summary":" Natural Language Processing (NLP) techniques are being used more frequently\nto improve high-tech Augmentative and Alternative Communication (AAC), but many\nof these techniques are integrated without the inclusion of the users'\nperspectives. Autistic adults are particularly neglected in the design of AAC\ntools. We conducted in-depth interviews with 12 autistic adults to find the\npain points of current AAC and determine what technological advances they might\nfind helpful. We found that in addition to technological issues, there are many\nsocietal issues as well. We found 9 different categories of themes from our\ninterviews: input flexibility, output flexibility, selecting or adapting AAC\nfor a good fit, when to start or swap AAC, benefits, access as an adult,\nstumbling blocks for continued use, social concerns, and control of\ncommunication. In this paper, we go through these categories in depth and then\nsuggest possible guidelines for developers, NLP researchers, and policy makers.\n","authors":["Lara J. Martin","Malathy Nagalakshmi"],"pdf_url":"https://arxiv.org/pdf/2404.17730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19898v2","updated":"2024-09-27T15:17:53Z","published":"2024-06-28T13:06:31Z","title":"Paraphrase Types Elicit Prompt Engineering Capabilities","summary":" Much of the success of modern language models depends on finding a suitable\nprompt to instruct the model. Until now, it has been largely unknown how\nvariations in the linguistic expression of prompts affect these models. This\nstudy systematically and empirically evaluates which linguistic features\ninfluence models through paraphrase types, i.e., different linguistic changes\nat particular positions. We measure behavioral changes for five models across\n120 tasks and six families of paraphrases (i.e., morphology, syntax, lexicon,\nlexico-syntax, discourse, and others). We also control for other prompt\nengineering factors (e.g., prompt length, lexical diversity, and proximity to\ntraining data). Our results show a potential for language models to improve\ntasks when their prompts are adapted in specific paraphrase types (e.g., 6.7%\nmedian gain in Mixtral 8x7B; 5.5% in LLaMA 3 8B). In particular, changes in\nmorphology and lexicon, i.e., the vocabulary used, showed promise in improving\nprompts. These findings contribute to developing more robust language models\ncapable of handling variability in linguistic expression.\n","authors":["Jan Philip Wahle","Terry Ruas","Yang Xu","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2406.19898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18819v1","updated":"2024-09-27T15:15:50Z","published":"2024-09-27T15:15:50Z","title":"Local Transcription Models in Home Care Nursing in Switzerland: an\n Interdisciplinary Case Study","summary":" Latest advances in the field of natural language processing (NLP) enable new\nuse cases for different domains, including the medical sector. In particular,\ntranscription can be used to support automation in the nursing documentation\nprocess and give nurses more time to interact with the patients. However,\ndifferent challenges including (a) data privacy, (b) local languages and\ndialects, and (c) domain-specific vocabulary need to be addressed. In this case\nstudy, we investigate the case of home care nursing documentation in\nSwitzerland. We assessed different transcription tools and models, and\nconducted several experiments with OpenAI Whisper, involving different\nvariations of German (i.e., dialects, foreign accent) and manually curated\nexample texts by a domain expert of home care nursing. Our results indicate\nthat even the used out-of-the-box model performs sufficiently well to be a good\nstarting point for future research in the field.\n","authors":["Jeremy Kramer","Tetiana Kravchenko","Beatrice Kaufmann","Friederike J. S. Thilo","Mascha Kurpicz-Briki"],"pdf_url":"https://arxiv.org/pdf/2409.18819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18812v1","updated":"2024-09-27T15:04:39Z","published":"2024-09-27T15:04:39Z","title":"LLMs4Synthesis: Leveraging Large Language Models for Scientific\n Synthesis","summary":" In response to the growing complexity and volume of scientific literature,\nthis paper introduces the LLMs4Synthesis framework, designed to enhance the\ncapabilities of Large Language Models (LLMs) in generating high-quality\nscientific syntheses. This framework addresses the need for rapid, coherent,\nand contextually rich integration of scientific insights, leveraging both\nopen-source and proprietary LLMs. It also examines the effectiveness of LLMs in\nevaluating the integrity and reliability of these syntheses, alleviating\ninadequacies in current quantitative metrics. Our study contributes to this\nfield by developing a novel methodology for processing scientific papers,\ndefining new synthesis types, and establishing nine detailed quality criteria\nfor evaluating syntheses. The integration of LLMs with reinforcement learning\nand AI feedback is proposed to optimize synthesis quality, ensuring alignment\nwith established criteria. The LLMs4Synthesis framework and its components are\nmade available, promising to enhance both the generation and evaluation\nprocesses in scientific research synthesis.\n","authors":["Hamed Babaei Giglou","Jennifer D'Souza","Sören Auer"],"pdf_url":"https://arxiv.org/pdf/2409.18812v1.pdf","comment":"12 pages, 3 figures, Accepted to JCDL 2024 Research Track"},{"id":"http://arxiv.org/abs/2402.01115v5","updated":"2024-09-27T15:04:24Z","published":"2024-02-02T03:15:13Z","title":"Interpretation of Intracardiac Electrograms Through Textual\n Representations","summary":" Understanding the irregular electrical activity of atrial fibrillation (AFib)\nhas been a key challenge in electrocardiography. For serious cases of AFib,\ncatheter ablations are performed to collect intracardiac electrograms (EGMs).\nEGMs offer intricately detailed and localized electrical activity of the heart\nand are an ideal modality for interpretable cardiac studies. Recent\nadvancements in artificial intelligence (AI) has allowed some works to utilize\ndeep learning frameworks to interpret EGMs during AFib. Additionally, language\nmodels (LMs) have shown exceptional performance in being able to generalize to\nunseen domains, especially in healthcare. In this study, we are the first to\nleverage pretrained LMs for finetuning of EGM interpolation and AFib\nclassification via masked language modeling. We formulate the EGM as a textual\nsequence and present competitive performances on AFib classification compared\nagainst other representations. Lastly, we provide a comprehensive\ninterpretability study to provide a multi-perspective intuition of the model's\nbehavior, which could greatly benefit the clinical use.\n","authors":["William Jongwon Han","Diana Gomez","Avi Alok","Chaojing Duan","Michael A. Rosenberg","Douglas Weber","Emerson Liu","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01115v5.pdf","comment":"17 pages, 7 figures; Accepted to CHIL 2024"},{"id":"http://arxiv.org/abs/2404.16807v2","updated":"2024-09-27T14:50:59Z","published":"2024-04-25T17:52:39Z","title":"Improving Diversity of Commonsense Generation by Large Language Models\n via In-Context Learning","summary":" Generative Commonsense Reasoning (GCR) requires a model to reason about a\nsituation using commonsense knowledge, while generating coherent sentences.\nAlthough the quality of the generated sentences is crucial, the diversity of\nthe generation is equally important because it reflects the model's ability to\nuse a range of commonsense knowledge facts. Large Language Models (LLMs) have\nshown proficiency in enhancing the generation quality across various tasks\nthrough in-context learning (ICL) using given examples without the need for any\nfine-tuning. However, the diversity aspect in LLM outputs has not been\nsystematically studied before. To address this, we propose a simple method that\ndiversifies the LLM generations, while preserving their quality. Experimental\nresults on three benchmark GCR datasets show that our method achieves an ideal\nbalance between the quality and diversity. Moreover, the sentences generated by\nour proposed method can be used as training data to improve diversity in\nexisting commonsense generators.\n","authors":["Tianhui Zhang","Bei Peng","Danushka Bollegala"],"pdf_url":"https://arxiv.org/pdf/2404.16807v2.pdf","comment":"EMNLP 2024 Findings, Camera-ready version"},{"id":"http://arxiv.org/abs/2409.18786v1","updated":"2024-09-27T14:34:54Z","published":"2024-09-27T14:34:54Z","title":"A Survey on the Honesty of Large Language Models","summary":" Honesty is a fundamental principle for aligning large language models (LLMs)\nwith human values, requiring these models to recognize what they know and don't\nknow and be able to faithfully express their knowledge. Despite promising,\ncurrent LLMs still exhibit significant dishonest behaviors, such as confidently\npresenting wrong answers or failing to express what they know. In addition,\nresearch on the honesty of LLMs also faces challenges, including varying\ndefinitions of honesty, difficulties in distinguishing between known and\nunknown knowledge, and a lack of comprehensive understanding of related\nresearch. To address these issues, we provide a survey on the honesty of LLMs,\ncovering its clarification, evaluation approaches, and strategies for\nimprovement. Moreover, we offer insights for future research, aiming to inspire\nfurther exploration in this important area.\n","authors":["Siheng Li","Cheng Yang","Taiqiang Wu","Chufan Shi","Yuji Zhang","Xinyu Zhu","Zesen Cheng","Deng Cai","Mo Yu","Lemao Liu","Jie Zhou","Yujiu Yang","Ngai Wong","Xixin Wu","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2409.18786v1.pdf","comment":"Project Page: https://github.com/SihengLi99/LLM-Honesty-Survey"},{"id":"http://arxiv.org/abs/2311.13833v2","updated":"2024-09-27T14:04:35Z","published":"2023-11-23T07:33:38Z","title":"Lego: Learning to Disentangle and Invert Personalized Concepts Beyond\n Object Appearance in Text-to-Image Diffusion Models","summary":" Text-to-Image (T2I) models excel at synthesizing concepts such as nouns,\nappearances, and styles. To enable customized content creation based on a few\nexample images of a concept, methods such as Textual Inversion and DreamBooth\ninvert the desired concept and enable synthesizing it in new scenes. However,\ninverting personalized concepts that go beyond object appearance and style\n(adjectives and verbs) through natural language remains a challenge. Two key\ncharacteristics of these concepts contribute to the limitations of current\ninversion methods. 1) Adjectives and verbs are entangled with nouns (subject)\nand can hinder appearance-based inversion methods, where the subject appearance\nleaks into the concept embedding, and 2) describing such concepts often extends\nbeyond single word embeddings.\n In this study, we introduce Lego, a textual inversion method designed to\ninvert subject-entangled concepts from a few example images. Lego disentangles\nconcepts from their associated subjects using a simple yet effective Subject\nSeparation step and employs a Context Loss that guides the inversion of\nsingle/multi-embedding concepts. In a thorough user study, Lego-generated\nconcepts were preferred over 70% of the time when compared to the baseline in\nterms of authentically generating concepts according to a reference.\nAdditionally, visual question answering using an LLM suggested Lego-generated\nconcepts are better aligned with the text description of the concept.\n","authors":["Saman Motamed","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.13833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18764v1","updated":"2024-09-27T14:02:48Z","published":"2024-09-27T14:02:48Z","title":"Charting the Future: Using Chart Question-Answering for Scalable\n Evaluation of LLM-Driven Data Visualizations","summary":" We propose a novel framework that leverages Visual Question Answering (VQA)\nmodels to automate the evaluation of LLM-generated data visualizations.\nTraditional evaluation methods often rely on human judgment, which is costly\nand unscalable, or focus solely on data accuracy, neglecting the effectiveness\nof visual communication. By employing VQA models, we assess data representation\nquality and the general communicative clarity of charts. Experiments were\nconducted using two leading VQA benchmark datasets, ChartQA and PlotQA, with\nvisualizations generated by OpenAI's GPT-3.5 Turbo and Meta's Llama 3.1\n70B-Instruct models. Our results indicate that LLM-generated charts do not\nmatch the accuracy of the original non-LLM-generated charts based on VQA\nperformance measures. Moreover, while our results demonstrate that few-shot\nprompting significantly boosts the accuracy of chart generation, considerable\nprogress remains to be made before LLMs can fully match the precision of\nhuman-generated graphs. This underscores the importance of our work, which\nexpedites the research process by enabling rapid iteration without the need for\nhuman annotation, thus accelerating advancements in this field.\n","authors":["James Ford","Xingmeng Zhao","Dan Schumacher","Anthony Rios"],"pdf_url":"https://arxiv.org/pdf/2409.18764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09197v2","updated":"2024-09-27T14:01:16Z","published":"2024-07-12T11:53:40Z","title":"A Chatbot for Asylum-Seeking Migrants in Europe","summary":" We present ACME: A Chatbot for asylum-seeking Migrants in Europe. ACME relies\non computational argumentation and aims to help migrants identify the highest\nlevel of protection they can apply for. This would contribute to a more\nsustainable migration by reducing the load on territorial commissions, Courts,\nand humanitarian organizations supporting asylum applicants. We describe the\nbackground context, system architecture, underlying technologies, and a case\nstudy used to validate the tool with domain experts.\n","authors":["Bettina Fazzinga","Elena Palmieri","Margherita Vestoso","Luca Bolognini","Andrea Galassi","Filippo Furfaro","Paolo Torroni"],"pdf_url":"https://arxiv.org/pdf/2407.09197v2.pdf","comment":"Accepted for publication at IEEE International Conference on Tools\n with Artificial Intelligence (ICTAI) @IEEE"},{"id":"http://arxiv.org/abs/2405.20611v3","updated":"2024-09-27T13:29:00Z","published":"2024-05-31T03:57:19Z","title":"Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in\n Lifted Compiled Code","summary":" Detecting vulnerabilities within compiled binaries is challenging due to lost\nhigh-level code structures and other factors such as architectural\ndependencies, compilers, and optimization options. To address these obstacles,\nthis research explores vulnerability detection using natural language\nprocessing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn\nsemantics from intermediate representation (LLVM IR) code. Long short-term\nmemory (LSTM) neural networks were trained on embeddings from encoders created\nusing approximately 48k LLVM functions from the Juliet dataset. This study is\npioneering in its comparison of word2vec models with multiple bidirectional\ntransformers (BERT, RoBERTa) embeddings built using LLVM code to train neural\nnetworks to detect vulnerabilities in compiled binaries. Word2vec Skip-Gram\nmodels achieved 92% validation accuracy in detecting vulnerabilities,\noutperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This\nsuggests that complex contextual embeddings may not provide advantages over\nsimpler word2vec models for this task when a limited number (e.g. 48K) of data\nsamples are used to train the bidirectional transformer-based models. The\ncomparative results provide novel insights into selecting optimal embeddings\nfor learning compiler-independent semantic code representations to advance\nmachine learning detection of vulnerabilities in compiled binaries.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2405.20611v3.pdf","comment":"Updated with improvements"},{"id":"http://arxiv.org/abs/2406.07736v2","updated":"2024-09-27T13:20:18Z","published":"2024-06-11T21:46:03Z","title":"MultiPragEval: Multilingual Pragmatic Evaluation of Large Language\n Models","summary":" As the capabilities of Large Language Models (LLMs) expand, it becomes\nincreasingly important to evaluate them beyond basic knowledge assessment,\nfocusing on higher-level language understanding. This study introduces\nMultiPragEval, the first multilingual pragmatic evaluation of LLMs, designed\nfor English, German, Korean, and Chinese. Comprising 1200 question units\ncategorized according to Grice's Cooperative Principle and its four\nconversational maxims, MultiPragEval enables an in-depth assessment of LLMs'\ncontextual awareness and their ability to infer implied meanings. Our findings\ndemonstrate that Claude3-Opus significantly outperforms other models in all\ntested languages, establishing a state-of-the-art in the field. Among\nopen-source models, Solar-10.7B and Qwen1.5-14B emerge as strong competitors.\nBy analyzing pragmatic inference, we provide valuable insights into the\ncapabilities essential for advanced language comprehension in AI systems.\n","authors":["Dojun Park","Jiwoo Lee","Seohyun Park","Hyeyun Jeong","Youngeun Koo","Soonha Hwang","Seonwoo Park","Sungeun Lee"],"pdf_url":"https://arxiv.org/pdf/2406.07736v2.pdf","comment":"The 2nd GenBench workshop on generalisation (benchmarking) in NLP -\n EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18724v1","updated":"2024-09-27T13:19:19Z","published":"2024-09-27T13:19:19Z","title":"Cross-Domain Keyword Extraction with Keyness Patterns","summary":" Domain dependence and annotation subjectivity pose challenges for supervised\nkeyword extraction. Based on the premises that second-order keyness patterns\nare existent at the community level and learnable from annotated keyword\nextraction datasets, this paper proposes a supervised ranking approach to\nkeyword extraction that ranks keywords with keyness patterns consisting of\nindependent features (such as sublanguage domain and term length) and three\ncategories of dependent features -- heuristic features, specificity features,\nand representavity features. The approach uses two convolutional-neural-network\nbased models to learn keyness patterns from keyword datasets and overcomes\nannotation subjectivity by training the two models with bootstrap sampling\nstrategy. Experiments demonstrate that the approach not only achieves\nstate-of-the-art performance on ten keyword datasets in general supervised\nkeyword extraction with an average top-10-F-measure of 0.316 , but also robust\ncross-domain performance with an average top-10-F-measure of 0.346 on four\ndatasets that are excluded in the training process. Such cross-domain\nrobustness is attributed to the fact that community-level keyness patterns are\nlimited in number and temperately independent of language domains, the\ndistinction between independent features and dependent features, and the\nsampling training strategy that balances excess risk and lack of negative\ntraining data.\n","authors":["Dongmei Zhou","Xuri Tang"],"pdf_url":"https://arxiv.org/pdf/2409.18724v1.pdf","comment":"26 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.03471v3","updated":"2024-09-27T13:12:23Z","published":"2024-04-04T14:24:06Z","title":"The Impact of Unstated Norms in Bias Analysis of Language Models","summary":" Bias in large language models (LLMs) has many forms, from overt\ndiscrimination to implicit stereotypes. Counterfactual bias evaluation is a\nwidely used approach to quantifying bias and often relies on template-based\nprobes that explicitly state group membership. It measures whether the outcome\nof a task, performed by an LLM, is invariant to a change of group membership.\nIn this work, we find that template-based probes can lead to unrealistic bias\nmeasurements. For example, LLMs appear to mistakenly cast text associated with\nWhite race as negative at higher rates than other groups. We hypothesize that\nthis arises artificially via a mismatch between commonly unstated norms, in the\nform of markedness, in the pretraining text of LLMs (e.g., Black president vs.\npresident) and templates used for bias measurement (e.g., Black president vs.\nWhite president). The findings highlight the potential misleading impact of\nvarying group membership through explicit mention in counterfactual bias\nquantification.\n","authors":["Farnaz Kohankhaki","D. B. Emerson","Jacob-Junqi Tian","Laleh Seyyed-Kalantari","Faiza Khan Khattak"],"pdf_url":"https://arxiv.org/pdf/2404.03471v3.pdf","comment":"23 Pages, 5 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2409.12059v2","updated":"2024-09-27T13:07:26Z","published":"2024-09-18T15:32:48Z","title":"Dual-Layer Training and Decoding of Large Language Model with\n Simultaneously Thinking and Speaking","summary":" Large Language Model can reasonably understand and generate human expressions\nbut may lack of thorough thinking and reasoning mechanisms. Recently there have\nbeen several studies which enhance the thinking ability of language models but\nmost of them are not data-driven or training-based. In this paper, we are\nmotivated by the cognitive mechanism in the natural world, and design a novel\nmodel architecture called TaS which allows it to first consider the thoughts\nand then express the response based upon the query. We design several pipelines\nto annotate or generate the thought contents from prompt-response samples, then\nadd language heads in a middle layer which behaves as the thinking layer. We\ntrain the language model by the thoughts-augmented data and successfully let\nthe thinking layer automatically generate reasonable thoughts and finally\noutput more reasonable responses. Both qualitative examples and quantitative\nresults validate the effectiveness and performance of TaS. Our code is\navailable at https://anonymous.4open.science/r/TadE.\n","authors":["Ningyuan Xi","Xiaoyu Wang","Yetao Wu","Teng Chen","Qingqing Gu","Jinxian Qu","Zhonglin Jiang","Yong Chen","Luo Ji"],"pdf_url":"https://arxiv.org/pdf/2409.12059v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18708v1","updated":"2024-09-27T12:54:13Z","published":"2024-09-27T12:54:13Z","title":"Read Over the Lines: Attacking LLMs and Toxicity Detection Systems with\n ASCII Art to Mask Profanity","summary":" We introduce a novel family of adversarial attacks that exploit the inability\nof language models to interpret ASCII art. To evaluate these attacks, we\npropose the ToxASCII benchmark and develop two custom ASCII art fonts: one\nleveraging special tokens and another using text-filled letter shapes. Our\nattacks achieve a perfect 1.0 Attack Success Rate across ten models, including\nOpenAI's o1-preview and LLaMA 3.1.\n Warning: this paper contains examples of toxic language used for research\npurposes.\n","authors":["Sergey Berezin","Reza Farahbakhsh","Noel Crespi"],"pdf_url":"https://arxiv.org/pdf/2409.18708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15360v2","updated":"2024-09-27T12:36:58Z","published":"2024-09-18T02:35:41Z","title":"Reward-Robust RLHF in LLMs","summary":" As Large Language Models (LLMs) continue to progress toward more advanced\nforms of intelligence, Reinforcement Learning from Human Feedback (RLHF) is\nincreasingly seen as a key pathway toward achieving Artificial General\nIntelligence (AGI). However, the reliance on reward-model-based (RM-based)\nalignment methods introduces significant challenges due to the inherent\ninstability and imperfections of Reward Models (RMs), which can lead to\ncritical issues such as reward hacking and misalignment with human intentions.\nIn this paper, we introduce a reward-robust RLHF framework aimed at addressing\nthese fundamental challenges, paving the way for more reliable and resilient\nlearning in LLMs. Our approach introduces a novel optimization objective that\ncarefully balances performance and robustness by incorporating Bayesian Reward\nModel Ensembles (BRME) to model the uncertainty set of reward functions. This\nallows the framework to integrate both nominal performance and minimum reward\nsignals, ensuring more stable learning even with imperfect RMs. Empirical\nresults demonstrate that our framework consistently outperforms baselines\nacross diverse benchmarks, showing improved accuracy and long-term stability.\nWe also provide a theoretical analysis, demonstrating that reward-robust RLHF\napproaches the stability of constant reward settings, which proves to be\nacceptable even in a stochastic-case analysis. Together, these contributions\nhighlight the framework potential to enhance both the performance and stability\nof LLM alignment.\n","authors":["Yuzi Yan","Xingzhou Lou","Jialian Li","Yiping Zhang","Jian Xie","Chao Yu","Yu Wang","Dong Yan","Yuan Shen"],"pdf_url":"https://arxiv.org/pdf/2409.15360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18695v1","updated":"2024-09-27T12:33:57Z","published":"2024-09-27T12:33:57Z","title":"KALE-LM: Unleash The Power Of AI For Science Via Knowledge And Logic\n Enhanced Large Model","summary":" Artificial intelligence is gradually demonstrating its immense potential, and\nincreasing attention is being given to how AI can be harnessed to advance\nscientific research. In this vision paper, we present our perspectives on how\nAI can better assist scientific inquiry and explore corresponding technical\napproach. We have proposed and open-sourced a large model of our KALE-LM model\nseries, Llama3-KALE-LM-Chem-8B, which has achieved outstanding performance in\ntasks related to the field of chemistry. We hope that our work serves as a\nstrong starting point, helping to realize more intelligent AI and promoting the\nadvancement of human science and technology, as well as societal development.\n","authors":["Weichen Dai","Yezeng Chen","Zijie Dai","Zhijie Huang","Yubo Liu","Yixuan Pan","Baiyang Song","Chengli Zhong","Xinhe Li","Zeyu Wang","Zhuoying Feng","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.18695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14224v2","updated":"2024-09-27T12:31:24Z","published":"2024-07-19T11:48:36Z","title":"Hierarchical Windowed Graph Attention Network and a Large Scale Dataset\n for Isolated Indian Sign Language Recognition","summary":" Automatic Sign Language (SL) recognition is an important task in the computer\nvision community. To build a robust SL recognition system, we need a\nconsiderable amount of data which is lacking particularly in Indian sign\nlanguage (ISL). In this paper, we introduce a large-scale isolated ISL dataset\nand a novel SL recognition model based on skeleton graph structure. The dataset\ncovers 2002 daily used common words in the deaf community recorded by 20 (10\nmale and 10 female) deaf adult signers (contains 40033 videos). We propose a SL\nrecognition model namely Hierarchical Windowed Graph Attention Network (HWGAT)\nby utilizing the human upper body skeleton graph. The HWGAT tries to capture\ndistinctive motions by giving attention to different body parts induced by the\nhuman skeleton graph. The utility of the proposed dataset and the usefulness of\nour model are evaluated through extensive experiments. We pre-trained the\nproposed model on the presented dataset and fine-tuned it across different sign\nlanguage datasets further boosting the performance of 1.10, 0.46, 0.78, and\n6.84 percentage points on INCLUDE, LSA64, AUTSL and WLASL respectively compared\nto the existing state-of-the-art keypoints-based models.\n","authors":["Suvajit Patra","Arkadip Maitra","Megha Tiwari","K. Kumaran","Swathy Prabhu","Swami Punyeshwarananda","Soumitra Samanta"],"pdf_url":"https://arxiv.org/pdf/2407.14224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00010v2","updated":"2024-09-27T12:19:43Z","published":"2024-05-18T14:06:53Z","title":"EnterpriseEM: Fine-tuned Embeddings for Enterprise Semantic Search","summary":" Enterprises grapple with the significant challenge of managing proprietary\nunstructured data, hindering efficient information retrieval. This has led to\nthe emergence of AI-driven information retrieval solutions, designed to adeptly\nextract relevant insights to address employee inquiries. These solutions often\nleverage pre-trained embedding models and generative models as foundational\ncomponents. While pre-trained embeddings may exhibit proximity or disparity\nbased on their original training objectives, they might not fully align with\nthe unique characteristics of enterprise-specific data, leading to suboptimal\nalignment with the retrieval goals of enterprise environments. In this paper,\nwe propose a comprehensive methodology for contextualizing pre-trained\nembedding models to enterprise environments, covering the entire process from\ndata preparation to model fine-tuning and evaluation. By adapting the\nembeddings to better suit the retrieval tasks prevalent in enterprises, we aim\nto enhance the performance of information retrieval solutions. We discuss the\nprocess of fine-tuning, its effect on retrieval accuracy, and the potential\nbenefits for enterprise information management. Our findings demonstrate the\nefficacy of fine-tuned embedding models in improving the precision and\nrelevance of search results in enterprise settings.\n","authors":["Kamalkumar Rathinasamy","Jayarama Nettar","Amit Kumar","Vishal Manchanda","Arun Vijayakumar","Ayush Kataria","Venkateshprasanna Manjunath","Chidambaram GS","Jaskirat Singh Sodhi","Shoeb Shaikh","Wasim Akhtar Khan","Prashant Singh","Tanishq Dattatray Ige","Vipin Tiwari","Rajab Ali Mondal","Harshini K","S Reka","Chetana Amancharla","Faiz ur Rahman","Harikrishnan P A","Indraneel Saha","Bhavya Tiwary","Navin Shankar Patel","Pradeep T S","Balaji A J"," Priyapravas","Mohammed Rafee Tarafdar"],"pdf_url":"https://arxiv.org/pdf/2406.00010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14277v2","updated":"2024-09-27T12:18:34Z","published":"2024-06-20T12:59:27Z","title":"QPaug: Question and Passage Augmentation for Open-Domain Question\n Answering of LLMs","summary":" Retrieval-augmented generation (RAG) has received much attention for\nOpen-domain question-answering (ODQA) tasks as a means to compensate for the\nparametric knowledge of large language models (LLMs). While previous approaches\nfocused on processing retrieved passages to remove irrelevant context, they\nstill rely heavily on the quality of retrieved passages which can degrade if\nthe question is ambiguous or complex. In this paper, we propose a simple yet\nefficient method called question and passage augmentation (QPaug) via LLMs for\nopen-domain QA. QPaug first decomposes the original questions into\nmultiple-step sub-questions. By augmenting the original question with detailed\nsub-questions and planning, we are able to make the query more specific on what\nneeds to be retrieved, improving the retrieval performance. In addition, to\ncompensate for the case where the retrieved passages contain distracting\ninformation or divided opinions, we augment the retrieved passages with\nself-generated passages by LLMs to guide the answer extraction. Experimental\nresults show that QPaug outperforms the previous state-of-the-art and achieves\nsignificant performance gain over existing RAG methods. The source code is\navailable at \\url{https://github.com/kmswin1/QPaug}.\n","authors":["Minsang Kim","Cheoneum Park","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2406.14277v2.pdf","comment":"The 2024 Conference on Empirical Methods in Natural Language\n Processing (EMNLP), Findings"},{"id":"http://arxiv.org/abs/2409.10482v3","updated":"2024-09-27T12:12:19Z","published":"2024-09-16T17:18:11Z","title":"Schrodinger's Memory: Large Language Models","summary":" Memory is the foundation of all human activities; without memory, it would be\nnearly impossible for people to perform any task in daily life. With the\ndevelopment of Large Language Models (LLMs), their language capabilities are\nbecoming increasingly comparable to those of humans. But do LLMs have memory?\nBased on current performance, LLMs do appear to exhibit memory. So, what is the\nunderlying mechanism of this memory? Previous research has lacked a deep\nexploration of LLMs' memory capabilities and the underlying theory. In this\npaper, we use Universal Approximation Theorem (UAT) to explain the memory\nmechanism in LLMs. We also conduct experiments to verify the memory\ncapabilities of various LLMs, proposing a new method to assess their abilities\nbased on these memory ability. We argue that LLM memory operates like\nSchr\\\"odinger's memory, meaning that it only becomes observable when a specific\nmemory is queried. We can only determine if the model retains a memory based on\nits output in response to the query; otherwise, it remains indeterminate.\nFinally, we expand on this concept by comparing the memory capabilities of the\nhuman brain and LLMs, highlighting the similarities and differences in their\noperational mechanisms.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2409.10482v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18680v1","updated":"2024-09-27T12:06:53Z","published":"2024-09-27T12:06:53Z","title":"Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large\n Language Models","summary":" Various audio-LLMs (ALLMs) have been explored recently for tackling different\naudio tasks simultaneously using a single, unified model. While existing\nevaluations of ALLMs primarily focus on single-audio tasks, real-world\napplications often involve processing multiple audio streams simultaneously. To\nbridge this gap, we propose the first multi-audio evaluation (MAE) benchmark\nthat consists of 20 datasets from 11 multi-audio tasks encompassing both speech\nand sound scenarios. Comprehensive experiments on MAE demonstrate that the\nexisting ALLMs, while being powerful in comprehending primary audio elements in\nindividual audio inputs, struggling to handle multi-audio scenarios. To this\nend, we propose a novel multi-audio-LLM (MALLM) to capture audio context among\nmultiple similar audios using discriminative learning on our proposed synthetic\ndata. The results demonstrate that the proposed MALLM outperforms all baselines\nand achieves high data efficiency using synthetic data without requiring human\nannotations. The proposed MALLM opens the door for ALLMs towards multi-audio\nprocessing era and brings us closer to replicating human auditory capabilities\nin machines.\n","authors":["Yiming Chen","Xianghu Yue","Xiaoxue Gao","Chen Zhang","Luis Fernando D'Haro","Robby T. Tan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2409.18680v1.pdf","comment":"EMNLP24 Findings"},{"id":"http://arxiv.org/abs/2409.18679v1","updated":"2024-09-27T12:05:12Z","published":"2024-09-27T12:05:12Z","title":"\"Why\" Has the Least Side Effect on Model Editing","summary":" Training large language models (LLMs) from scratch is an expensive endeavor,\nparticularly as world knowledge continually evolves. To maintain relevance and\naccuracy of LLMs, model editing has emerged as a pivotal research area. While\nthese methods hold promise, they can also produce unintended side effects.\nTheir underlying factors and causes remain largely unexplored. This paper\ndelves into a critical factor-question type-by categorizing model editing\nquestions. Our findings reveal that the extent of performance degradation\nvaries significantly across different question types, providing new insights\nfor experimental design in knowledge editing. Furthermore, we investigate\nwhether insights from smaller models can be extrapolated to larger models. Our\nresults indicate discrepancies in findings between models of different sizes,\nsuggesting that insights from smaller models may not necessarily apply to\nlarger models. Additionally, we examine the impact of batch size on side\neffects, discovering that increasing the batch size can mitigate performance\ndrops.\n","authors":["Tsung-Hsuan Pan","Chung-Chi Chen","Hen-Hsen Huang","Hsin-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18678v1","updated":"2024-09-27T12:05:05Z","published":"2024-09-27T12:05:05Z","title":"Rehearsing Answers to Probable Questions with Perspective-Taking","summary":" Question answering (QA) has been a long-standing focus in the NLP field,\npredominantly addressing reading comprehension and common sense QA. However,\nscenarios involving the preparation of answers to probable questions during\nprofessional oral presentations remain underexplored. In this paper, we pioneer\nthe examination of this crucial yet overlooked topic by utilizing real-world QA\nconversation transcripts between company managers and professional analysts. We\nexplore the proposed task using three causal knowledge graphs (KGs) and three\nlarge language models (LLMs). This work provides foundational insights into the\napplication of LLMs in professional QA scenarios, highlighting the importance\nof causal KGs and perspective-taking in generating effective responses.\n","authors":["Yung-Yu Shih","Ziwei Xu","Hiroya Takamura","Yun-Nung Chen","Chung-Chi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18677v1","updated":"2024-09-27T12:04:58Z","published":"2024-09-27T12:04:58Z","title":"Co-Trained Retriever-Generator Framework for Question Generation in\n Earnings Calls","summary":" In diverse professional environments, ranging from academic conferences to\ncorporate earnings calls, the ability to anticipate audience questions stands\nparamount. Traditional methods, which rely on manual assessment of an\naudience's background, interests, and subject knowledge, often fall short -\nparticularly when facing large or heterogeneous groups, leading to imprecision\nand inefficiency. While NLP has made strides in text-based question generation,\nits primary focus remains on academic settings, leaving the intricate\nchallenges of professional domains, especially earnings call conferences,\nunderserved. Addressing this gap, our paper pioneers the multi-question\ngeneration (MQG) task specifically designed for earnings call contexts. Our\nmethodology involves an exhaustive collection of earnings call transcripts and\na novel annotation technique to classify potential questions. Furthermore, we\nintroduce a retriever-enhanced strategy to extract relevant information. With a\ncore aim of generating a spectrum of potential questions that analysts might\npose, we derive these directly from earnings call content. Empirical\nevaluations underscore our approach's edge, revealing notable excellence in the\naccuracy, consistency, and perplexity of the questions generated.\n","authors":["Yining Juan","Chung-Chi Chen","Hen-Hsen Huang","Hsin-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07309v4","updated":"2024-09-27T12:02:44Z","published":"2024-02-11T21:16:26Z","title":"HyperBERT: Mixing Hypergraph-Aware Layers with Language Models for Node\n Classification on Text-Attributed Hypergraphs","summary":" Hypergraphs are characterized by complex topological structure, representing\nhigher-order interactions among multiple entities through hyperedges. Lately,\nhypergraph-based deep learning methods to learn informative data\nrepresentations for the problem of node classification on text-attributed\nhypergraphs have garnered increasing research attention. However, existing\nmethods struggle to simultaneously capture the full extent of hypergraph\nstructural information and the rich linguistic attributes inherent in the nodes\nattributes, which largely hampers their effectiveness and generalizability. To\novercome these challenges, we explore ways to further augment a pretrained BERT\nmodel with specialized hypergraph-aware layers for the task of node\nclassification. Such layers introduce higher-order structural inductive bias\ninto the language model, thus improving the model's capacity to harness both\nhigher-order context information from the hypergraph structure and semantic\ninformation present in text. In this paper, we propose a new architecture,\nHyperBERT, a mixed text-hypergraph model which simultaneously models hypergraph\nrelational structure while maintaining the high-quality text encoding\ncapabilities of a pre-trained BERT. Notably, HyperBERT presents results that\nachieve a new state-of-the-art on five challenging text-attributed hypergraph\nnode classification benchmarks.\n","authors":["Adrián Bazaga","Pietro Liò","Gos Micklem"],"pdf_url":"https://arxiv.org/pdf/2402.07309v4.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2406.07393v3","updated":"2024-09-27T11:46:37Z","published":"2024-06-11T15:58:59Z","title":"Large Language Models are Limited in Out-of-Context Knowledge Reasoning","summary":" Large Language Models (LLMs) possess extensive knowledge and strong\ncapabilities in performing in-context reasoning. However, previous work\nchallenges their out-of-context reasoning ability, i.e., the ability to infer\ninformation from their training data, instead of from the context or prompt.\nThis paper focuses on a significant aspect of out-of-context reasoning:\nOut-of-Context Knowledge Reasoning (OCKR), which is to combine multiple\nknowledge to infer new knowledge. We designed a synthetic dataset with seven\nrepresentative OCKR tasks to systematically assess the OCKR capabilities of\nLLMs. Using this dataset, we evaluated several LLMs and discovered that their\nproficiency in this aspect is limited, regardless of whether the knowledge is\ntrained in a separate or adjacent training settings. Moreover, training the\nmodel to reason with reasoning examples does not result in significant\nimprovement, while training the model to perform explicit knowledge retrieval\nhelps for retrieving attribute knowledge but not the relation knowledge,\nindicating that the model's limited OCKR capabilities are due to difficulties\nin knowledge retrieval. Furthermore, we treat cross-lingual knowledge transfer\nas a distinct form of OCKR, and evaluate this ability. Our results show that\nthe evaluated model also exhibits limited ability in transferring knowledge\nacross languages.\n","authors":["Peng Hu","Changjiang Gao","Ruiqi Gao","Jiajun Chen","Shujian Huang"],"pdf_url":"https://arxiv.org/pdf/2406.07393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15861v5","updated":"2024-09-27T11:28:50Z","published":"2024-02-24T17:08:45Z","title":"MATHWELL: Generating Educational Math Word Problems Using Teacher\n Annotations","summary":" Math word problems are critical K-8 educational tools, but writing them is\ntime consuming and requires extensive expertise. To be educational, problems\nmust be solvable, have accurate answers, and, most importantly, be\neducationally appropriate. We propose that language models have potential to\nsupport K-8 math education by automatically generating word problems. However,\nevaluating educational appropriateness is hard to quantify. We fill this gap by\nhaving teachers evaluate problems generated by LLMs, who find existing models\nand data often fail to be educationally appropriate. We then explore\nautomatically generating educational word problems, ultimately using our expert\nannotations to finetune a 70B language model. Our model, MATHWELL, is the first\nK-8 word problem generator targeted at educational appropriateness. Further\nexpert studies find MATHWELL generates problems far more solvable, accurate,\nand appropriate than public models. MATHWELL also matches GPT-4's problem\nquality while attaining more appropriate reading levels for K-8 students and\navoiding generating harmful questions.\n","authors":["Bryan R Christ","Jonathan Kropko","Thomas Hartvigsen"],"pdf_url":"https://arxiv.org/pdf/2402.15861v5.pdf","comment":"24 pages, 10 figures Accepted to EMNLP 2024 (Findings)"},{"id":"http://arxiv.org/abs/2409.15545v2","updated":"2024-09-27T11:28:04Z","published":"2024-09-23T20:59:15Z","title":"Rethinking Emotion Bias in Music via Frechet Audio Distance","summary":" The subjective nature of music emotion introduces inherent bias in both\nrecognition and generation, especially when relying on a single audio encoder,\nemotion classifier, or evaluation metric. In this work, we conduct a study on\nMusic Emotion Recognition (MER) and Emotional Music Generation (EMG), employing\ndiverse audio encoders alongside the Frechet Audio Distance (FAD), a\nreference-free evaluation metric. Our study begins with a benchmark evaluation\nof MER, highlighting the limitations associated with using a single audio\nencoder and the disparities observed across different measurements. We then\npropose assessing MER performance using FAD from multiple encoders to provide a\nmore objective measure of music emotion. Furthermore, we introduce an enhanced\nEMG approach designed to improve both the variation and prominence of generated\nmusic emotion, thus enhancing realism. Additionally, we investigate the realism\ndisparities between the emotions conveyed in real and synthetic music,\ncomparing our EMG model against two baseline models. Experimental results\nunderscore the emotion bias problem in both MER and EMG and demonstrate the\npotential of using FAD and diverse audio encoders to evaluate music emotion\nobjectively.\n","authors":["Yuanchao Li","Azalea Gui","Dimitra Emmanouilidou","Hannes Gamper"],"pdf_url":"https://arxiv.org/pdf/2409.15545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18647v1","updated":"2024-09-27T11:28:01Z","published":"2024-09-27T11:28:01Z","title":"HiCuLR: Hierarchical Curriculum Learning for Rhetorical Role Labeling of\n Legal Documents","summary":" Rhetorical Role Labeling (RRL) of legal documents is pivotal for various\ndownstream tasks such as summarization, semantic case search and argument\nmining. Existing approaches often overlook the varying difficulty levels\ninherent in legal document discourse styles and rhetorical roles. In this work,\nwe propose HiCuLR, a hierarchical curriculum learning framework for RRL. It\nnests two curricula: Rhetorical Role-level Curriculum (RC) on the outer layer\nand Document-level Curriculum (DC) on the inner layer. DC categorizes documents\nbased on their difficulty, utilizing metrics like deviation from a standard\ndiscourse structure and exposes the model to them in an easy-to-difficult\nfashion. RC progressively strengthens the model to discern\ncoarse-to-fine-grained distinctions between rhetorical roles. Our experiments\non four RRL datasets demonstrate the efficacy of HiCuLR, highlighting the\ncomplementary nature of DC and RC.\n","authors":["T. Y. S. S. Santosh","Apolline Isaia","Shiyu Hong","Matthias Grabmair"],"pdf_url":"https://arxiv.org/pdf/2409.18647v1.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.18645v1","updated":"2024-09-27T11:25:10Z","published":"2024-09-27T11:25:10Z","title":"The Craft of Selective Prediction: Towards Reliable Case Outcome\n Classification -- An Empirical Study on European Court of Human Rights Cases","summary":" In high-stakes decision-making tasks within legal NLP, such as Case Outcome\nClassification (COC), quantifying a model's predictive confidence is crucial.\nConfidence estimation enables humans to make more informed decisions,\nparticularly when the model's certainty is low, or where the consequences of a\nmistake are significant. However, most existing COC works prioritize high task\nperformance over model reliability. This paper conducts an empirical\ninvestigation into how various design choices including pre-training corpus,\nconfidence estimator and fine-tuning loss affect the reliability of COC models\nwithin the framework of selective prediction. Our experiments on the\nmulti-label COC task, focusing on European Court of Human Rights (ECtHR) cases,\nhighlight the importance of a diverse yet domain-specific pre-training corpus\nfor better calibration. Additionally, we demonstrate that larger models tend to\nexhibit overconfidence, Monte Carlo dropout methods produce reliable confidence\nestimates, and confident error regularization effectively mitigates\noverconfidence. To our knowledge, this is the first systematic exploration of\nselective prediction in legal NLP. Our findings underscore the need for further\nresearch on enhancing confidence measurement and improving the trustworthiness\nof models in the legal domain.\n","authors":["T. Y. S. S. Santosh","Irtiza Chowdhury","Shanshan Xu","Matthias Grabmair"],"pdf_url":"https://arxiv.org/pdf/2409.18645v1.pdf","comment":"Accepted to EMNLP Findings"},{"id":"http://arxiv.org/abs/2409.18644v1","updated":"2024-09-27T11:24:35Z","published":"2024-09-27T11:24:35Z","title":"Incorporating Precedents for Legal Judgement Prediction on European\n Court of Human Rights Cases","summary":" Inspired by the legal doctrine of stare decisis, which leverages precedents\n(prior cases) for informed decision-making, we explore methods to integrate\nthem into LJP models. To facilitate precedent retrieval, we train a retriever\nwith a fine-grained relevance signal based on the overlap ratio of alleged\narticles between cases. We investigate two strategies to integrate precedents:\ndirect incorporation at inference via label interpolation based on case\nproximity and during training via a precedent fusion module using a\nstacked-cross attention model. We employ joint training of the retriever and\nLJP models to address latent space divergence between them. Our experiments on\nLJP tasks from the ECHR jurisdiction reveal that integrating precedents during\ntraining coupled with joint training of the retriever and LJP model,\noutperforms models without precedents or with precedents incorporated only at\ninference, particularly benefiting sparser articles.\n","authors":["T. Y. S. S. Santosh","Mohamed Hesham Elganayni","Stanisław Sójka","Matthias Grabmair"],"pdf_url":"https://arxiv.org/pdf/2409.18644v1.pdf","comment":"Accepted to EMNLP Findings"},{"id":"http://arxiv.org/abs/2409.10357v2","updated":"2024-09-27T10:59:21Z","published":"2024-09-16T15:06:12Z","title":"2D or not 2D: How Does the Dimensionality of Gesture Representation\n Affect 3D Co-Speech Gesture Generation?","summary":" Co-speech gestures are fundamental for communication. The advent of recent\ndeep learning techniques has facilitated the creation of lifelike, synchronous\nco-speech gestures for Embodied Conversational Agents. \"In-the-wild\" datasets,\naggregating video content from platforms like YouTube via human pose detection\ntechnologies, provide a feasible solution by offering 2D skeletal sequences\naligned with speech. Concurrent developments in lifting models enable the\nconversion of these 2D sequences into 3D gesture databases. However, it is\nimportant to note that the 3D poses estimated from the 2D extracted poses are,\nin essence, approximations of the ground-truth, which remains in the 2D domain.\nThis distinction raises questions about the impact of gesture representation\ndimensionality on the quality of generated motions - a topic that, to our\nknowledge, remains largely unexplored. Our study examines the effect of using\neither 2D or 3D joint coordinates as training data on the performance of\nspeech-to-gesture deep generative models. We employ a lifting model for\nconverting generated 2D pose sequences into 3D and assess how gestures created\ndirectly in 3D stack up against those initially generated in 2D and then\nconverted to 3D. We perform an objective evaluation using widely used metrics\nin the gesture generation field as well as a user study to qualitatively\nevaluate the different approaches.\n","authors":["Téo Guichoux","Laure Soulier","Nicolas Obin","Catherine Pelachaud"],"pdf_url":"https://arxiv.org/pdf/2409.10357v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.15111"},{"id":"http://arxiv.org/abs/2409.18618v1","updated":"2024-09-27T10:35:45Z","published":"2024-09-27T10:35:45Z","title":"Model-based Preference Optimization in Abstractive Summarization without\n Human Feedback","summary":" In abstractive summarization, the challenge of producing concise and accurate\nsummaries arises from the vast amount of information contained in the source\ndocument. Consequently, although Large Language Models (LLMs) can generate\nfluent text, they often introduce inaccuracies by hallucinating content not\nfound in the original source. While supervised fine-tuning methods that\nmaximize likelihood contribute to this issue, they do not consistently enhance\nthe faithfulness of the summaries. Preference-based optimization methods, such\nas Direct Preference Optimization (DPO), can further refine the model to align\nwith human preferences. However, these methods still heavily depend on costly\nhuman feedback. In this work, we introduce a novel and straightforward approach\ncalled Model-based Preference Optimization (MPO) to fine-tune LLMs for improved\nsummarization abilities without any human feedback. By leveraging the model's\ninherent summarization capabilities, we create a preference dataset that is\nfully generated by the model using different decoding strategies. Our\nexperiments on standard summarization datasets and various metrics demonstrate\nthat our proposed MPO significantly enhances the quality of generated summaries\nwithout relying on human feedback.\n","authors":["Jaepill Choi","Kyubyung Chae","Jiwoo Song","Yohan Jo","Taesup Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18618v1.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18602v1","updated":"2024-09-27T10:07:33Z","published":"2024-09-27T10:07:33Z","title":"Do LLMs suffer from Multi-Party Hangover? A Diagnostic Approach to\n Addressee Recognition and Response Selection in Conversations","summary":" Assessing the performance of systems to classify Multi-Party Conversations\n(MPC) is challenging due to the interconnection between linguistic and\nstructural characteristics of conversations. Conventional evaluation methods\noften overlook variances in model behavior across different levels of\nstructural complexity on interaction graphs. In this work, we propose a\nmethodological pipeline to investigate model performance across specific\nstructural attributes of conversations. As a proof of concept we focus on\nResponse Selection and Addressee Recognition tasks, to diagnose model\nweaknesses. To this end, we extract representative diagnostic subdatasets with\na fixed number of users and a good structural variety from a large and open\ncorpus of online MPCs. We further frame our work in terms of data minimization,\navoiding the use of original usernames to preserve privacy, and propose\nalternatives to using original text messages. Results show that response\nselection relies more on the textual content of conversations, while addressee\nrecognition requires capturing their structural dimension. Using an LLM in a\nzero-shot setting, we further highlight how sensitivity to prompt variations is\ntask-dependent.\n","authors":["Nicolò Penzo","Maryam Sajedinia","Bruno Lepri","Sara Tonelli","Marco Guerini"],"pdf_url":"https://arxiv.org/pdf/2409.18602v1.pdf","comment":"Accepted to EMNLP 2024 main conference"},{"id":"http://arxiv.org/abs/2409.18596v1","updated":"2024-09-27T09:56:02Z","published":"2024-09-27T09:56:02Z","title":"ASAG2024: A Combined Benchmark for Short Answer Grading","summary":" Open-ended questions test a more thorough understanding than closed-ended\nquestions and are often a preferred assessment method. However, open-ended\nquestions are tedious to grade and subject to personal bias. Therefore, there\nhave been efforts to speed up the grading process through automation. Short\nAnswer Grading (SAG) systems aim to automatically score students' answers.\nDespite growth in SAG methods and capabilities, there exists no comprehensive\nshort-answer grading benchmark across different subjects, grading scales, and\ndistributions. Thus, it is hard to assess the capabilities of current automated\ngrading methods in terms of their generalizability. In this preliminary work,\nwe introduce the combined ASAG2024 benchmark to facilitate the comparison of\nautomated grading systems. Combining seven commonly used short-answer grading\ndatasets in a common structure and grading scale. For our benchmark, we\nevaluate a set of recent SAG methods, revealing that while LLM-based approaches\nreach new high scores, they still are far from reaching human performance. This\nopens up avenues for future research on human-machine SAG systems.\n","authors":["Gérôme Meyer","Philip Breuer","Jonathan Fürst"],"pdf_url":"https://arxiv.org/pdf/2409.18596v1.pdf","comment":"Accepted at SIGCSE-Virtual 2024"},{"id":"http://arxiv.org/abs/2409.18594v1","updated":"2024-09-27T09:53:48Z","published":"2024-09-27T09:53:48Z","title":"\"Oh LLM, I'm Asking Thee, Please Give Me a Decision Tree\": Zero-Shot\n Decision Tree Induction and Embedding with Large Language Models","summary":" Large language models (LLMs) provide powerful means to leverage prior\nknowledge for predictive modeling when data is limited. In this work, we\ndemonstrate how LLMs can use their compressed world knowledge to generate\nintrinsically interpretable machine learning models, i.e., decision trees,\nwithout any training data. We find that these zero-shot decision trees can\nsurpass data-driven trees on some small-sized tabular datasets and that\nembeddings derived from these trees perform on par with data-driven tree-based\nembeddings on average. Our knowledge-driven decision tree induction and\nembedding approaches therefore serve as strong new baselines for data-driven\nmachine learning methods in the low-data regime.\n","authors":["Ricardo Knauer","Mario Koddenbrock","Raphael Wallsberger","Nicholas M. Brisson","Georg N. Duda","Deborah Falla","David W. Evans","Erik Rodner"],"pdf_url":"https://arxiv.org/pdf/2409.18594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14842v2","updated":"2024-09-27T09:52:57Z","published":"2024-09-23T09:20:19Z","title":"HW-TSC's Submission to the CCMT 2024 Machine Translation Tasks","summary":" This paper presents the submission of Huawei Translation Services Center\n(HW-TSC) to machine translation tasks of the 20th China Conference on Machine\nTranslation (CCMT 2024). We participate in the bilingual machine translation\ntask and multi-domain machine translation task. For these two translation\ntasks, we use training strategies such as regularized dropout, bidirectional\ntraining, data diversification, forward translation, back translation,\nalternated training, curriculum learning, and transductive ensemble learning to\ntrain neural machine translation (NMT) models based on the deep Transformer-big\narchitecture. Furthermore, to explore whether large language model (LLM) can\nhelp improve the translation quality of NMT systems, we use supervised\nfine-tuning to train llama2-13b as an Automatic post-editing (APE) model to\nimprove the translation results of the NMT model on the multi-domain machine\ntranslation task. By using these plyometric strategies, our submission achieves\na competitive result in the final evaluation.\n","authors":["Zhanglin Wu","Yuanchang Luo","Daimeng Wei","Jiawei Zheng","Bin Wei","Zongyao Li","Hengchao Shang","Jiaxin Guo","Shaojun Li","Weidong Zhang","Ning Xie","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2409.14842v2.pdf","comment":"14 pages, 2 figures, 6 Tables, CCMT2024. arXiv admin note:\n substantial text overlap with arXiv:2409.14800"},{"id":"http://arxiv.org/abs/2409.18583v1","updated":"2024-09-27T09:41:29Z","published":"2024-09-27T09:41:29Z","title":"Hit the Sweet Spot! Span-Level Ensemble for Large Language Models","summary":" Ensembling various LLMs to unlock their complementary potential and leverage\ntheir individual strengths is highly valuable. Previous studies typically focus\non two main paradigms: sample-level and token-level ensembles. Sample-level\nensemble methods either select or blend fully generated outputs, which hinders\ndynamic correction and enhancement of outputs during the generation process. On\nthe other hand, token-level ensemble methods enable real-time correction\nthrough fine-grained ensemble at each generation step. However, the information\ncarried by an individual token is quite limited, leading to suboptimal\ndecisions at each step. To address these issues, we propose SweetSpan, a\nspan-level ensemble method that effectively balances the need for real-time\nadjustments and the information required for accurate ensemble decisions. Our\napproach involves two key steps: First, we have each candidate model\nindependently generate candidate spans based on the shared prefix. Second, we\ncalculate perplexity scores to facilitate mutual evaluation among the candidate\nmodels and achieve robust span selection by filtering out unfaithful scores. To\ncomprehensively evaluate ensemble methods, we propose a new challenging setting\n(ensemble models with significant performance gaps) in addition to the standard\nsetting (ensemble the best-performing models) to assess the performance of\nmodel ensembles in more realistic scenarios. Experimental results in both\nstandard and challenging settings across various language generation tasks\ndemonstrate the effectiveness, robustness, and versatility of our approach\ncompared with previous ensemble methods.\n","authors":["Yangyifan Xu","Jianghao Chen","Junhong Wu","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14364v2","updated":"2024-09-27T09:13:19Z","published":"2024-09-22T08:51:18Z","title":"More Effective LLM Compressed Tokens with Uniformly Spread Position\n Identifiers and Compression Loss","summary":" Compressing Transformer inputs into compressd tokens allows running LLMs with\nimproved speed and cost efficiency. Based on the compression method ICAE, we\ncarefully examine the position identifier choices for compressed tokens and\nalso propose a new compression loss. We demonstrate empirically that our\nproposed methods achieve significantly higher compression ratios (15x compared\nto 4x for ICAE), while being able to attain comparable reconstruction\nperformance.\n","authors":["Runsong Zhao","Pengcheng Huang","Xinyu Liu","Chunyang Xiao","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.14364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05417v2","updated":"2024-09-27T09:03:05Z","published":"2024-05-08T20:37:56Z","title":"Fishing for Magikarp: Automatically Detecting Under-trained Tokens in\n Large Language Models","summary":" The disconnect between tokenizer creation and model training in language\nmodels allows for specific inputs, such as the infamous SolidGoldMagikarp\ntoken, to induce unwanted model behaviour. Although such `glitch tokens',\ntokens present in the tokenizer vocabulary but that are nearly or entirely\nabsent during model training, have been observed across various models, a\nreliable method to identify and address them has been missing. We present a\ncomprehensive analysis of Large Language Model tokenizers, specifically\ntargeting this issue of detecting under-trained tokens. Through a combination\nof tokenizer analysis, model weight-based indicators, and prompting techniques,\nwe develop novel and effective methods for automatically detecting these\nproblematic tokens. Our findings demonstrate the prevalence of such tokens\nacross a diverse set of models and provide insights into improving the\nefficiency and safety of language models.\n","authors":["Sander Land","Max Bartolo"],"pdf_url":"https://arxiv.org/pdf/2405.05417v2.pdf","comment":"16 pages, 6 figures. Accepted at EMNLP 2024, main track. For\n associated code, see https://github.com/cohere-ai/magikarp/"},{"id":"http://arxiv.org/abs/2303.11192v2","updated":"2024-09-27T08:58:04Z","published":"2023-03-20T15:22:11Z","title":"Multimodal Shannon Game with Images","summary":" The Shannon game has long been used as a thought experiment in linguistics\nand NLP, asking participants to guess the next letter in a sentence based on\nits preceding context. We extend the game by introducing an optional extra\nmodality in the form of image information. To investigate the impact of\nmultimodal information in this game, we use human participants and a language\nmodel (LM, GPT-2).\n We show that the addition of image information improves both self-reported\nconfidence and accuracy for both humans and LM. Certain word classes, such as\nnouns and determiners, benefit more from the additional modality information.\nThe priming effect in both humans and the LM becomes more apparent as the\ncontext size (extra modality information + sentence context) increases. These\nfindings highlight the potential of multimodal information in improving\nlanguage understanding and modeling.\n","authors":["Vilém Zouhar","Sunit Bhattacharya","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2303.11192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19181v3","updated":"2024-09-27T08:57:34Z","published":"2024-03-28T07:22:16Z","title":"Make Large Language Model a Better Ranker","summary":" Large Language Models (LLMs) demonstrate robust capabilities across various\nfields, leading to a paradigm shift in LLM-enhanced Recommender System (RS).\nResearch to date focuses on point-wise and pair-wise recommendation paradigms,\nwhich are inefficient for LLM-based recommenders due to high computational\ncosts. However, existing list-wise approaches also fall short in ranking tasks\ndue to misalignment between ranking objectives and next-token prediction.\nMoreover, these LLM-based methods struggle to effectively address the order\nrelation among candidates, particularly given the scale of ratings. To address\nthese challenges, this paper introduces the large language model framework with\nAligned Listwise Ranking Objectives (ALRO). ALRO is designed to bridge the gap\nbetween the capabilities of LLMs and the nuanced requirements of ranking tasks.\nSpecifically, ALRO employs explicit feedback in a listwise manner by\nintroducing soft lambda loss, a customized adaptation of lambda loss designed\nfor optimizing order relations. This mechanism provides more accurate\noptimization goals, enhancing the ranking process. Additionally, ALRO\nincorporates a permutation-sensitive learning mechanism that addresses position\nbias, a prevalent issue in generative models, without imposing additional\ncomputational burdens during inference. Our evaluative studies reveal that ALRO\noutperforms both existing embedding-based recommendation methods and LLM-based\nrecommendation baselines.\n","authors":["Wen-Shuo Chao","Zhi Zheng","Hengshu Zhu","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19181v3.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18548v1","updated":"2024-09-27T08:34:42Z","published":"2024-09-27T08:34:42Z","title":"Research on Predicting Public Opinion Event Heat Levels Based on Large\n Language Models","summary":" In recent years, with the rapid development of large language models, serval\nmodels such as GPT-4o have demonstrated extraordinary capabilities, surpassing\nhuman performance in various language tasks. As a result, many researchers have\nbegun exploring their potential applications in the field of public opinion\nanalysis. This study proposes a novel large-language-models-based method for\npublic opinion event heat level prediction. First, we preprocessed and\nclassified 62,836 Chinese hot event data collected between July 2022 and\nDecember 2023. Then, based on each event's online dissemination heat index, we\nused the MiniBatchKMeans algorithm to automatically cluster the events and\ncategorize them into four heat levels (ranging from low heat to very high\nheat). Next, we randomly selected 250 events from each heat level, totalling\n1,000 events, to build the evaluation dataset. During the evaluation process,\nwe employed various large language models to assess their accuracy in\npredicting event heat levels in two scenarios: without reference cases and with\nsimilar case references. The results showed that GPT-4o and DeepseekV2\nperformed the best in the latter case, achieving prediction accuracies of 41.4%\nand 41.5%, respectively. Although the overall prediction accuracy remains\nrelatively low, it is worth noting that for low-heat (Level 1) events, the\nprediction accuracies of these two models reached 73.6% and 70.4%,\nrespectively. Additionally, the prediction accuracy showed a downward trend\nfrom Level 1 to Level 4, which correlates with the uneven distribution of data\nacross the heat levels in the actual dataset. This suggests that with the more\nrobust dataset, public opinion event heat level prediction based on large\nlanguage models will have significant research potential for the future.\n","authors":["Yi Ren","Tianyi Zhang","Weibin Li","DuoMu Zhou","Chenhao Qin","FangCheng Dong"],"pdf_url":"https://arxiv.org/pdf/2409.18548v1.pdf","comment":"conference"},{"id":"http://arxiv.org/abs/2210.06996v3","updated":"2024-09-27T08:33:31Z","published":"2022-10-13T13:04:16Z","title":"DICTDIS: Dictionary Constrained Disambiguation for Improved NMT","summary":" Domain-specific neural machine translation (NMT) systems (e.g., in\neducational applications) are socially significant with the potential to help\nmake information accessible to a diverse set of users in multilingual\nsocieties. It is desirable that such NMT systems be lexically constrained and\ndraw from domain-specific dictionaries. Dictionaries could present multiple\ncandidate translations for a source word/phrase due to the polysemous nature of\nwords. The onus is then on the NMT model to choose the contextually most\nappropriate candidate. Prior work has largely ignored this problem and focused\non the single candidate constraint setting wherein the target word or phrase is\nreplaced by a single constraint. In this work we present DictDis, a lexically\nconstrained NMT system that disambiguates between multiple candidate\ntranslations derived from dictionaries. We achieve this by augmenting training\ndata with multiple dictionary candidates to actively encourage disambiguation\nduring training by implicitly aligning multiple candidate constraints. We\ndemonstrate the utility of DictDis via extensive experiments on English-Hindi\nand English-German sentences in a variety of domains including regulatory,\nfinance, engineering. We also present comparisons on standard benchmark test\ndatasets. In comparison with existing approaches for lexically constrained and\nunconstrained NMT, we demonstrate superior performance with respect to\nconstraint copy and disambiguation related measures on all domains while also\nobtaining improved fluency of up to 2-3 BLEU points on some domains.\n","authors":["Ayush Maheshwari","Preethi Jyothi","Ganesh Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2210.06996v3.pdf","comment":"In Findings of EMNLP, 2024"},{"id":"http://arxiv.org/abs/2401.13246v4","updated":"2024-09-27T08:26:01Z","published":"2024-01-24T06:10:51Z","title":"SEER: Facilitating Structured Reasoning and Explanation via\n Reinforcement Learning","summary":" Elucidating the reasoning process with structured explanations from question\nto answer is crucial, as it significantly enhances the interpretability,\ntraceability, and trustworthiness of question-answering (QA) systems. However,\nstructured explanations demand models to perform intricately structured\nreasoning, which poses great challenges. Most existing methods focus on\nsingle-step reasoning through supervised learning, ignoring logical\ndependencies between steps. Moreover, existing reinforcement learning (RL)\nbased methods overlook the structured relationships, underutilizing the\npotential of RL in structured reasoning. In this paper, we propose SEER, a\nnovel method that maximizes a structure-based return to facilitate structured\nreasoning and explanation. Our proposed structure-based return precisely\ndescribes the hierarchical and branching structure inherent in structured\nreasoning, effectively capturing the intricate relationships between different\nreasoning steps. In addition, we introduce a fine-grained reward function to\nmeticulously delineate diverse reasoning steps. Extensive experiments show that\nSEER significantly outperforms state-of-the-art methods, achieving an absolute\nimprovement of 6.9% over RL-based methods on EntailmentBank, a 4.4% average\nimprovement on STREET benchmark, and exhibiting outstanding efficiency and\ncross-dataset generalization performance. Our code is available at\nhttps://github.com/Chen-GX/SEER.\n","authors":["Guoxin Chen","Kexin Tang","Chao Yang","Fuying Ye","Yu Qiao","Yiming Qian"],"pdf_url":"https://arxiv.org/pdf/2401.13246v4.pdf","comment":"Camera ready version for ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2403.09972v3","updated":"2024-09-27T08:22:21Z","published":"2024-03-15T02:38:26Z","title":"Think Twice Before Trusting: Self-Detection for Large Language Models\n through Comprehensive Answer Reflection","summary":" Self-detection for Large Language Models (LLMs) seeks to evaluate the\ntrustworthiness of the LLM's output by leveraging its own capabilities, thereby\nalleviating the issue of output hallucination. However, existing self-detection\napproaches only retrospectively evaluate answers generated by LLM, typically\nleading to the over-trust in incorrectly generated answers. To tackle this\nlimitation, we propose a novel self-detection paradigm that considers the\ncomprehensive answer space beyond LLM-generated answers. It thoroughly compares\nthe trustworthiness of multiple candidate answers to mitigate the over-trust in\nLLM-generated incorrect answers. Building upon this paradigm, we introduce a\ntwo-step framework, which firstly instructs LLM to reflect and provide\njustifications for each candidate answer, and then aggregates the\njustifications for comprehensive target answer evaluation. This framework can\nbe seamlessly integrated with existing approaches for superior self-detection.\nExtensive experiments on six datasets spanning three tasks demonstrate the\neffectiveness of the proposed framework.\n","authors":["Moxin Li","Wenjie Wang","Fuli Feng","Fengbin Zhu","Qifan Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2403.09972v3.pdf","comment":"EMNLP findings 2024"},{"id":"http://arxiv.org/abs/2409.18538v1","updated":"2024-09-27T08:17:53Z","published":"2024-09-27T08:17:53Z","title":"A Survey on Complex Tasks for Goal-Directed Interactive Agents","summary":" Goal-directed interactive agents, which autonomously complete tasks through\ninteractions with their environment, can assist humans in various domains of\ntheir daily lives. Recent advances in large language models (LLMs) led to a\nsurge of new, more and more challenging tasks to evaluate such agents. To\nproperly contextualize performance across these tasks, it is imperative to\nunderstand the different challenges they pose to agents. To this end, this\nsurvey compiles relevant tasks and environments for evaluating goal-directed\ninteractive agents, structuring them along dimensions relevant for\nunderstanding current obstacles. An up-to-date compilation of relevant\nresources can be found on our project website:\nhttps://coli-saar.github.io/interactive-agents.\n","authors":["Mareike Hartmann","Alexander Koller"],"pdf_url":"https://arxiv.org/pdf/2409.18538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03553v3","updated":"2024-09-27T08:16:28Z","published":"2024-05-06T15:20:30Z","title":"AlphaMath Almost Zero: Process Supervision without Process","summary":" Although recent advancements in large language models (LLMs) have\nsignificantly improved their performance on various tasks, they still face\nchallenges with complex and symbolic multi-step reasoning, particularly in\nmathematical reasoning. To bolster the mathematical reasoning capabilities of\nLLMs, most existing efforts concentrate on seeking assistance from either\ndomain experts or GPT-4 for high-quality process-supervised data, which is not\nonly expensive but also labor-intensive. In our study, we propose an innovative\nframework, AlphaMath, that bypasses the need for process annotations (from\nhumans or GPTs) by leveraging Monte Carlo Tree Search (MCTS). This framework\nfocuses on unleashing the potential of a well-pretrained LLM to autonomously\nenhance its mathematical reasoning. Specifically, we integrate a value model\nwith the LLM, automatically generating both process supervision and step-level\nevaluation signals in MCTS. Furthermore, we propose an efficient inference\nstrategy, step-level beam search, where the value model is crafted to assist\nthe policy model (i.e., LLM) in navigating more effective reasoning paths,\nrather than solely relying on prior probabilities. The experimental results on\nboth in-domain and out-of-domain datasets demonstrate that even without GPT-4\nor human-annotated process supervision, our AlphaMath framework achieves\ncomparable or superior results to previous state-of-the-art methods.\n","authors":["Guoxin Chen","Minpeng Liao","Chengxi Li","Kai Fan"],"pdf_url":"https://arxiv.org/pdf/2405.03553v3.pdf","comment":"Camera ready version for NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.10858v2","updated":"2024-09-27T08:03:07Z","published":"2024-06-16T09:06:17Z","title":"Step-level Value Preference Optimization for Mathematical Reasoning","summary":" Direct Preference Optimization (DPO) using an implicit reward model has\nproven to be an effective alternative to reinforcement learning from human\nfeedback (RLHF) for fine-tuning preference aligned large language models\n(LLMs). However, the overall preference annotations of responses do not fully\ncapture the fine-grained quality of model outputs in complex multi-step\nreasoning tasks, such as mathematical reasoning. To address this limitation, we\nintroduce a novel algorithm called Step-level Value Preference Optimization\n(SVPO). Our approach employs Monte Carlo Tree Search (MCTS) to automatically\nannotate step-level preferences for multi-step reasoning. Furthermore, from the\nperspective of learning-to-rank, we train an explicit value model to replicate\nthe behavior of the implicit reward model, complementing standard preference\noptimization. This value model enables the LLM to generate higher reward\nresponses with minimal cost during inference. Experimental results demonstrate\nthat our method achieves state-of-the-art performance on both in-domain and\nout-of-domain mathematical reasoning benchmarks. Our code is available at\n\\url{https://github.com/MARIO-Math-Reasoning/Super_MARIO}.\n","authors":["Guoxin Chen","Minpeng Liao","Chengxi Li","Kai Fan"],"pdf_url":"https://arxiv.org/pdf/2406.10858v2.pdf","comment":"Camera ready version for EMNLP2024-Findings"},{"id":"http://arxiv.org/abs/2409.18512v1","updated":"2024-09-27T07:46:52Z","published":"2024-09-27T07:46:52Z","title":"EmoPro: A Prompt Selection Strategy for Emotional Expression in LM-based\n Speech Synthesis","summary":" Recent advancements in speech synthesis models, trained on extensive\ndatasets, have demonstrated remarkable zero-shot capabilities. These models can\ncontrol content, timbre, and emotion in generated speech based on prompt\ninputs. Despite these advancements, the choice of prompts significantly impacts\nthe output quality, yet most existing selection schemes do not adequately\naddress the control of emotional intensity. To address this question, this\npaper proposes a two-stage prompt selection strategy EmoPro, which is\nspecifically designed for emotionally controllable speech synthesis. This\nstrategy focuses on selecting highly expressive and high-quality prompts by\nevaluating them from four perspectives: emotional expression strength, speech\nquality, text-emotion consistency, and model generation performance.\nExperimental results show that prompts selected using the proposed method\nresult in more emotionally expressive and engaging synthesized speech compared\nto those obtained through baseline. Audio samples and codes will be available\nat https://whyrrrrun.github.io/EmoPro/.\n","authors":["Haoyu Wang","Chunyu Qiang","Tianrui Wang","Cheng Gong","Qiuyu Liu","Yu Jiang","Xiaobao Wang","Chenyang Wang","Chen Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18511v1","updated":"2024-09-27T07:46:06Z","published":"2024-09-27T07:46:06Z","title":"Do We Need Domain-Specific Embedding Models? An Empirical Investigation","summary":" Embedding models play a crucial role in representing and retrieving\ninformation across various NLP applications. Recent advancements in Large\nLanguage Models (LLMs) have further enhanced the performance of embedding\nmodels, which are trained on massive amounts of text covering almost every\ndomain. These models are often benchmarked on general-purpose datasets like\nMassive Text Embedding Benchmark (MTEB), where they demonstrate superior\nperformance. However, a critical question arises: Is the development of\ndomain-specific embedding models necessary when general-purpose models are\ntrained on vast corpora that already include specialized domain texts? In this\npaper, we empirically investigate this question, choosing the finance domain as\nan example. We introduce the Finance Massive Text Embedding Benchmark\n(FinMTEB), a counterpart to MTEB that consists of financial domain-specific\ntext datasets. We evaluate the performance of seven state-of-the-art embedding\nmodels on FinMTEB and observe a significant performance drop compared to their\nperformance on MTEB. To account for the possibility that this drop is driven by\nFinMTEB's higher complexity, we propose four measures to quantify dataset\ncomplexity and control for this factor in our analysis. Our analysis provides\ncompelling evidence that state-of-the-art embedding models struggle to capture\ndomain-specific linguistic and semantic patterns, even when trained on large\ngeneral-purpose corpora. This study sheds light on the necessity of developing\ndomain-specific embedding models in the LLM era, offering valuable insights for\nresearchers and practitioners.\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18511v1.pdf","comment":"https://github.com/yixuantt/FinMTEB"},{"id":"http://arxiv.org/abs/2403.04222v2","updated":"2024-09-27T07:08:10Z","published":"2024-03-07T04:50:38Z","title":"Self-Evaluation of Large Language Model based on Glass-box Features","summary":" The proliferation of open-source Large Language Models (LLMs) underscores the\npressing need for evaluation methods. Existing works primarily rely on external\nevaluators, focusing on training and prompting strategies. However, a crucial\naspect, model-aware glass-box features, is overlooked. In this study, we\nexplore the utility of glass-box features under the scenario of\nself-evaluation, namely applying an LLM to evaluate its own output. We\ninvestigate various glass-box feature groups and discovered that the softmax\ndistribution serves as a reliable quality indicator for self-evaluation.\nExperimental results on public benchmarks validate the feasibility of\nself-evaluation of LLMs using glass-box features.\n","authors":["Hui Huang","Yingqi Qu","Jing Liu","Muyun Yang","Bing Xu","Tiejun Zhao","Wenpeng Lu"],"pdf_url":"https://arxiv.org/pdf/2403.04222v2.pdf","comment":"accepted as Findings of EMNLP2024"},{"id":"http://arxiv.org/abs/2409.18486v1","updated":"2024-09-27T06:57:00Z","published":"2024-09-27T06:57:00Z","title":"Evaluation of OpenAI o1: Opportunities and Challenges of AGI","summary":" This comprehensive study evaluates the performance of OpenAI's o1-preview\nlarge language model across a diverse array of complex reasoning tasks,\nspanning multiple domains, including computer science, mathematics, natural\nsciences, medicine, linguistics, and social sciences. Through rigorous testing,\no1-preview demonstrated remarkable capabilities, often achieving human-level or\nsuperior performance in areas ranging from coding challenges to scientific\nreasoning and from language processing to creative problem-solving. Key\nfindings include:\n -83.3% success rate in solving complex competitive programming problems,\nsurpassing many human experts.\n -Superior ability in generating coherent and accurate radiology reports,\noutperforming other evaluated models.\n -100% accuracy in high school-level mathematical reasoning tasks, providing\ndetailed step-by-step solutions.\n -Advanced natural language inference capabilities across general and\nspecialized domains like medicine.\n -Impressive performance in chip design tasks, outperforming specialized\nmodels in areas such as EDA script generation and bug analysis.\n -Remarkable proficiency in anthropology and geology, demonstrating deep\nunderstanding and reasoning in these specialized fields.\n -Strong capabilities in quantitative investing. O1 has comprehensive\nfinancial knowledge and statistical modeling skills.\n -Effective performance in social media analysis, including sentiment analysis\nand emotion recognition.\n The model excelled particularly in tasks requiring intricate reasoning and\nknowledge integration across various fields. While some limitations were\nobserved, including occasional errors on simpler problems and challenges with\ncertain highly specialized concepts, the overall results indicate significant\nprogress towards artificial general intelligence.\n","authors":["Tianyang Zhong","Zhengliang Liu","Yi Pan","Yutong Zhang","Yifan Zhou","Shizhe Liang","Zihao Wu","Yanjun Lyu","Peng Shu","Xiaowei Yu","Chao Cao","Hanqi Jiang","Hanxu Chen","Yiwei Li","Junhao Chen","Huawen Hu","Yihen Liu","Huaqin Zhao","Shaochen Xu","Haixing Dai","Lin Zhao","Ruidong Zhang","Wei Zhao","Zhenyuan Yang","Jingyuan Chen","Peilong Wang","Wei Ruan","Hui Wang","Huan Zhao","Jing Zhang","Yiming Ren","Shihuan Qin","Tong Chen","Jiaxi Li","Arif Hassan Zidan","Afrar Jahin","Minheng Chen","Sichen Xia","Jason Holmes","Yan Zhuang","Jiaqi Wang","Bochen Xu","Weiran Xia","Jichao Yu","Kaibo Tang","Yaxuan Yang","Bolun Sun","Tao Yang","Guoyu Lu","Xianqiao Wang","Lilong Chai","He Li","Jin Lu","Lichao Sun","Xin Zhang","Bao Ge","Xintao Hu","Lian Zhang","Hua Zhou","Lu Zhang","Shu Zhang","Ninghao Liu","Bei Jiang","Linglong Kong","Zhen Xiang","Yudan Ren","Jun Liu","Xi Jiang","Yu Bao","Wei Zhang","Xiang Li","Gang Li","Wei Liu","Dinggang Shen","Andrea Sikora","Xiaoming Zhai","Dajiang Zhu","Tianming Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12842v3","updated":"2024-09-27T06:25:33Z","published":"2024-02-20T09:10:08Z","title":"PromptKD: Distilling Student-Friendly Knowledge for Generative Language\n Models via Prompt Tuning","summary":" Recent advancements in large language models (LLMs) have raised concerns\nabout inference costs, increasing the need for research into model compression.\nWhile knowledge distillation (KD) is a prominent method for this, research on\nKD for generative language models like LLMs is relatively sparse, and the\napproach of distilling student-friendly knowledge, which has shown promising\nperformance in KD for classification models, remains unexplored in generative\nlanguage models. To explore this approach, we propose PromptKD, a simple yet\neffective method that utilizes prompt tuning - for the first time in KD - to\nenable generative language models to transfer student-friendly knowledge.\nUnlike previous works in classification that require fine-tuning the entire\nteacher model for extracting student-friendly knowledge, PromptKD achieves\nsimilar effects by adding a small number of prompt tokens and tuning only the\nprompt with student guidance. Extensive experiments on instruction-following\ndatasets show that PromptKD achieves state-of-the-art performance while adding\nonly 0.0007% of the teacher's parameters as prompts. Further analysis suggests\nthat distilling student-friendly knowledge alleviates exposure bias effectively\nthroughout the entire training process, leading to performance enhancements.\n","authors":["Gyeongman Kim","Doohyuk Jang","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2402.12842v3.pdf","comment":"EMNLP 2024 Findings. Our project page: https://promptkd.github.io"},{"id":"http://arxiv.org/abs/2409.18472v1","updated":"2024-09-27T06:18:55Z","published":"2024-09-27T06:18:55Z","title":"URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological\n and Multilingual Knowledge Base","summary":" URIEL is a knowledge base offering geographical, phylogenetic, and\ntypological vector representations for 7970 languages. It includes distance\nmeasures between these vectors for 4005 languages, which are accessible via the\nlang2vec tool. Despite being frequently cited, URIEL is limited in terms of\nlinguistic inclusion and overall usability. To tackle these challenges, we\nintroduce URIEL+, an enhanced version of URIEL and lang2vec addressing these\nlimitations. In addition to expanding typological feature coverage for 2898\nlanguages, URIEL+ improves user experience with robust, customizable distance\ncalculations to better suit the needs of the users. These upgrades also offer\ncompetitive performance on downstream tasks and provide distances that better\nalign with linguistic distance studies.\n","authors":["Aditya Khan","Mason Shipton","David Anugraha","Kaiyao Duan","Phuong H. Hoang","Eric Khiu","A. Seza Doğruöz","En-Shiun Annie Lee"],"pdf_url":"https://arxiv.org/pdf/2409.18472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09298v2","updated":"2024-09-27T05:55:01Z","published":"2023-09-17T15:19:29Z","title":"OWL: A Large Language Model for IT Operations","summary":" With the rapid development of IT operations, it has become increasingly\ncrucial to efficiently manage and analyze large volumes of data for practical\napplications. The techniques of Natural Language Processing (NLP) have shown\nremarkable capabilities for various tasks, including named entity recognition,\nmachine translation and dialogue systems. Recently, Large Language Models\n(LLMs) have achieved significant improvements across various NLP downstream\ntasks. However, there is a lack of specialized LLMs for IT operations. In this\npaper, we introduce the OWL, a large language model trained on our collected\nOWL-Instruct dataset with a wide range of IT-related information, where the\nmixture-of-adapter strategy is proposed to improve the parameter-efficient\ntuning across different domains or tasks. Furthermore, we evaluate the\nperformance of our OWL on the OWL-Bench established by us and open IT-related\nbenchmarks. OWL demonstrates superior performance results on IT tasks, which\noutperforms existing models by significant margins. Moreover, we hope that the\nfindings of our work will provide more insights to revolutionize the techniques\nof IT operations with specialized LLMs.\n","authors":["Hongcheng Guo","Jian Yang","Jiaheng Liu","Liqun Yang","Linzheng Chai","Jiaqi Bai","Junran Peng","Xiaorong Hu","Chao Chen","Dongfeng Zhang","Xu Shi","Tieqiao Zheng","Liangfan Zheng","Bo Zhang","Ke Xu","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2309.09298v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2402.17311v2","updated":"2024-09-27T05:38:03Z","published":"2024-02-27T08:33:31Z","title":"SKT5SciSumm -- Revisiting Extractive-Generative Approach for\n Multi-Document Scientific Summarization","summary":" Summarization for scientific text has shown significant benefits both for the\nresearch community and human society. Given the fact that the nature of\nscientific text is distinctive and the input of the multi-document\nsummarization task is substantially long, the task requires sufficient\nembedding generation and text truncation without losing important information.\nTo tackle these issues, in this paper, we propose SKT5SciSumm - a hybrid\nframework for multi-document scientific summarization (MDSS). We leverage the\nSentence-Transformer version of Scientific Paper Embeddings using\nCitation-Informed Transformers (SPECTER) to encode and represent textual\nsentences, allowing for efficient extractive summarization using k-means\nclustering. We employ the T5 family of models to generate abstractive summaries\nusing extracted sentences. SKT5SciSumm achieves state-of-the-art performance on\nthe Multi-XScience dataset. Through extensive experiments and evaluation, we\nshowcase the benefits of our model by using less complicated models to achieve\nremarkable results, thereby highlighting its potential in advancing the field\nof multi-document summarization for scientific text.\n","authors":["Huy Quoc To","Ming Liu","Guangyan Huang","Hung-Nghiep Tran","Andr'e Greiner-Petter","Felix Beierle","Akiko Aizawa"],"pdf_url":"https://arxiv.org/pdf/2402.17311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18454v1","updated":"2024-09-27T05:29:31Z","published":"2024-09-27T05:29:31Z","title":"Leveraging Long-Context Large Language Models for Multi-Document\n Understanding and Summarization in Enterprise Applications","summary":" The rapid increase in unstructured data across various fields has made\nmulti-document comprehension and summarization a critical task. Traditional\napproaches often fail to capture relevant context, maintain logical\nconsistency, and extract essential information from lengthy documents. This\npaper explores the use of Long-context Large Language Models (LLMs) for\nmulti-document summarization, demonstrating their exceptional capacity to grasp\nextensive connections, provide cohesive summaries, and adapt to various\nindustry domains and integration with enterprise applications/systems. The\npaper discusses the workflow of multi-document summarization for effectively\ndeploying long-context LLMs, supported by case studies in legal applications,\nenterprise functions such as HR, finance, and sourcing, as well as in the\nmedical and news domains. These case studies show notable enhancements in both\nefficiency and accuracy. Technical obstacles, such as dataset diversity, model\nscalability, and ethical considerations like bias mitigation and factual\naccuracy, are carefully analyzed. Prospective research avenues are suggested to\naugment the functionalities and applications of long-context LLMs, establishing\nthem as pivotal tools for transforming information processing across diverse\nsectors and enterprise applications.\n","authors":["Aditi Godbole","Jabin Geevarghese George","Smita Shandilya"],"pdf_url":"https://arxiv.org/pdf/2409.18454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03645v3","updated":"2024-09-27T05:27:45Z","published":"2024-07-04T05:35:47Z","title":"Continual Learning Optimizations for Auto-regressive Decoder of\n Multilingual ASR systems","summary":" Continual Learning (CL) involves fine-tuning pre-trained models with new data\nwhile maintaining the performance on the pre-trained data. This is particularly\nrelevant for expanding multilingual ASR (MASR) capabilities. However, existing\nCL methods, mainly designed for computer vision and reinforcement learning\ntasks, often yield sub-optimal results when directly applied to MASR. We\nhypothesise that this is because CL of the auto-regressive decoder in the MASR\nmodel is difficult. To verify this, we propose four optimizations on the\ndecoder. They include decoder-layer gradient surgery, freezing unused token\nembeddings, suppressing output of newly added tokens, and learning rate\nre-scaling. Our experiments on adapting Whisper to 10 unseen languages from the\nCommon Voice dataset demonstrate that these optimizations reduce the Average\nWord Error Rate (AWER) of pretrained languages from 14.2% to 12.4% compared\nwith Experience Replay, without compromising the AWER of new languages.\n","authors":["Chin Yuen Kwok","Jia Qi Yip","Eng Siong Chng"],"pdf_url":"https://arxiv.org/pdf/2407.03645v3.pdf","comment":"Proceedings of Interspeech"},{"id":"http://arxiv.org/abs/2406.13997v2","updated":"2024-09-27T05:20:37Z","published":"2024-06-20T04:52:19Z","title":"\"Global is Good, Local is Bad?\": Understanding Brand Bias in LLMs","summary":" Many recent studies have investigated social biases in LLMs but brand bias\nhas received little attention. This research examines the biases exhibited by\nLLMs towards different brands, a significant concern given the widespread use\nof LLMs in affected use cases such as product recommendation and market\nanalysis. Biased models may perpetuate societal inequalities, unfairly favoring\nestablished global brands while marginalizing local ones. Using a curated\ndataset across four brand categories, we probe the behavior of LLMs in this\nspace. We find a consistent pattern of bias in this space -- both in terms of\ndisproportionately associating global brands with positive attributes and\ndisproportionately recommending luxury gifts for individuals in high-income\ncountries. We also find LLMs are subject to country-of-origin effects which may\nboost local brand preference in LLM outputs in specific contexts.\n","authors":["Mahammed Kamruzzaman","Hieu Minh Nguyen","Gene Louis Kim"],"pdf_url":"https://arxiv.org/pdf/2406.13997v2.pdf","comment":"Accepted at EMNLP-2024 (main)"},{"id":"http://arxiv.org/abs/2409.18446v1","updated":"2024-09-27T05:06:43Z","published":"2024-09-27T05:06:43Z","title":"Exploring Language Model Generalization in Low-Resource Extractive QA","summary":" In this paper, we investigate Extractive Question Answering (EQA) with Large\nLanguage Models (LLMs) under domain drift, i.e., can LLMs generalize well to\nclosed-domains that require specific knowledge such as medicine and law in a\nzero-shot fashion without additional in-domain training? To this end, we devise\na series of experiments to empirically explain the performance gap. Our\nfindings suggest that: a) LLMs struggle with dataset demands of closed-domains\nsuch as retrieving long answer-spans; b) Certain LLMs, despite showing strong\noverall performance, display weaknesses in meeting basic requirements as\ndiscriminating between domain-specific senses of words which we link to\npre-processing decisions; c) Scaling model parameters is not always effective\nfor cross-domain generalization; and d) Closed-domain datasets are\nquantitatively much different than open-domain EQA datasets and current LLMs\nstruggle to deal with them. Our findings point out important directions for\nimproving existing LLMs.\n","authors":["Saptarshi Sengupta","Wenpeng Yin","Preslav Nakov","Shreya Ghosh","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.18446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18433v1","updated":"2024-09-27T03:49:56Z","published":"2024-09-27T03:49:56Z","title":"Easy2Hard-Bench: Standardized Difficulty Labels for Profiling LLM\n Performance and Generalization","summary":" While generalization over tasks from easy to hard is crucial to profile\nlanguage models (LLMs), the datasets with fine-grained difficulty annotations\nfor each problem across a broad range of complexity are still blank. Aiming to\naddress this limitation, we present Easy2Hard-Bench, a consistently formatted\ncollection of 6 benchmark datasets spanning various domains, such as\nmathematics and programming problems, chess puzzles, and reasoning questions.\nEach problem within these datasets is annotated with numerical difficulty\nscores. To systematically estimate problem difficulties, we collect abundant\nperformance data on attempts to each problem by humans in the real world or\nLLMs on the prominent leaderboard. Leveraging the rich performance data, we\napply well-established difficulty ranking systems, such as Item Response Theory\n(IRT) and Glicko-2 models, to uniformly assign numerical difficulty scores to\nproblems. Moreover, datasets in Easy2Hard-Bench distinguish themselves from\nprevious collections by a higher proportion of challenging problems. Through\nextensive experiments with six state-of-the-art LLMs, we provide a\ncomprehensive analysis of their performance and generalization capabilities\nacross varying levels of difficulty, with the aim of inspiring future research\nin LLM generalization. The datasets are available at\nhttps://huggingface.co/datasets/furonghuang-lab/Easy2Hard-Bench.\n","authors":["Mucong Ding","Chenghao Deng","Jocelyn Choo","Zichu Wu","Aakriti Agrawal","Avi Schwarzschild","Tianyi Zhou","Tom Goldstein","John Langford","Anima Anandkumar","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2409.18433v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2401.07977v2","updated":"2024-09-27T03:33:28Z","published":"2024-01-15T21:43:46Z","title":"Towards Efficient Methods in Medical Question Answering using Knowledge\n Graph Embeddings","summary":" In Natural Language Processing (NLP), Machine Reading Comprehension (MRC) is\nthe task of answering a question based on a given context. To handle questions\nin the medical domain, modern language models such as BioBERT, SciBERT and even\nChatGPT are trained on vast amounts of in-domain medical corpora. However,\nin-domain pre-training is expensive in terms of time and resources. In this\npaper, we propose a resource-efficient approach for injecting domain knowledge\ninto a model without relying on such domain-specific pre-training.\n Knowledge graphs are powerful resources for accessing medical information.\nBuilding on existing work, we introduce a method using Multi-Layer Perceptrons\n(MLPs) for aligning and integrating embeddings extracted from medical knowledge\ngraphs with the embedding spaces of pre-trained language models (LMs). The\naligned embeddings are fused with open-domain LMs BERT and RoBERTa that are\nfine-tuned for two MRC tasks, span detection (COVID-QA) and multiple-choice\nquestions (PubMedQA). We compare our method to prior techniques that rely on a\nvocabulary overlap for embedding alignment and show how our method circumvents\nthis requirement to deliver better performance. On both datasets, our method\nallows BERT/RoBERTa to either perform on par (occasionally exceeding) with\nstronger domain-specific models or show improvements in general over prior\ntechniques. With the proposed approach, we signal an alternative method to\nin-domain pre-training to achieve domain proficiency.\n","authors":["Saptarshi Sengupta","Connor Heaton","Suhan Cui","Soumalya Sarkar","Prasenjit Mitra"],"pdf_url":"https://arxiv.org/pdf/2401.07977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18428v1","updated":"2024-09-27T03:31:32Z","published":"2024-09-27T03:31:32Z","title":"Improving Multilingual ASR in the Wild Using Simple N-best Re-ranking","summary":" Multilingual Automatic Speech Recognition (ASR) models are typically\nevaluated in a setting where the ground-truth language of the speech utterance\nis known, however, this is often not the case for most practical settings.\nAutomatic Spoken Language Identification (SLID) models are not perfect and\nmisclassifications have a substantial impact on the final ASR accuracy. In this\npaper, we present a simple and effective N-best re-ranking approach to improve\nmultilingual ASR accuracy for several prominent acoustic models by employing\nexternal features such as language models and text-based language\nidentification models. Our results on FLEURS using the MMS and Whisper models\nshow spoken language identification accuracy improvements of 8.7% and 6.1%,\nrespectively and word error rates which are 3.3% and 2.0% lower on these\nbenchmarks.\n","authors":["Brian Yan","Vineel Pratap","Shinji Watanabe","Michael Auli"],"pdf_url":"https://arxiv.org/pdf/2409.18428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11016v2","updated":"2024-09-27T03:23:53Z","published":"2024-06-27T01:52:05Z","title":"LongLaMP: A Benchmark for Personalized Long-form Text Generation","summary":" Long-text generation is seemingly ubiquitous in real-world applications of\nlarge language models such as generating an email or writing a review. Despite\nthe fundamental importance and prevalence of long-text generation in many\npractical applications, existing work on personalized generation has focused on\nthe generation of very short text. To overcome these limitations, we study the\nproblem of personalized long-text generation, that is, generating long-text\nthat is personalized for a specific user while being practically useful for the\nvast majority of real-world applications that naturally require the generation\nof longer text. In this work, we demonstrate the importance of user-specific\npersonalization for long-text generation tasks and develop the Long-text\nLanguage Model Personalization (LongLaMP) Benchmark. LongLaMP provides a\ncomprehensive and diverse evaluation framework for personalized long-text\ngeneration. Extensive experiments on LongLaMP for zero-shot and fine-tuned\nlanguage tasks demonstrate the effectiveness of the proposed benchmark and its\nutility for developing and evaluating techniques for personalized long-text\ngeneration across a wide variety of long-text generation tasks. The results\nhighlight the importance of personalization across a wide variety of long-text\ngeneration tasks. Finally, we release the benchmark for others to use for this\nimportant problem.\n","authors":["Ishita Kumar","Snigdha Viswanathan","Sushrita Yerra","Alireza Salemi","Ryan A. Rossi","Franck Dernoncourt","Hanieh Deilamsalehy","Xiang Chen","Ruiyi Zhang","Shubham Agarwal","Nedim Lipka","Chein Van Nguyen","Thien Huu Nguyen","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2407.11016v2.pdf","comment":"9 pages, 4 figures, 20 tables(including appendix) submitted to EMNLP"},{"id":"http://arxiv.org/abs/2409.18417v1","updated":"2024-09-27T03:15:07Z","published":"2024-09-27T03:15:07Z","title":"VickreyFeedback: Cost-efficient Data Construction for Reinforcement\n Learning from Human Feedback","summary":" This paper addresses the cost-efficiency aspect of Reinforcement Learning\nfrom Human Feedback (RLHF). RLHF leverages datasets of human preferences over\noutputs of large language models (LLM) to instill human expectations into LLMs.\nWhile preference annotation comes with a monetized cost, the economic utility\nof a preference dataset has not been considered by far. What exacerbates this\nsituation is that given complex intransitive or cyclic relationships in\npreference datasets, existing algorithms for fine-tuning LLMs are still far\nfrom capturing comprehensive preferences. This raises severe cost-efficiency\nconcerns in production environments, where preference data accumulate over\ntime. In this paper, we see the fine-tuning of LLMs as a monetized economy and\nintroduce an auction mechanism to improve the efficiency of the preference data\ncollection in dollar terms. We show that introducing an auction mechanism can\nplay an essential role in enhancing the cost-efficiency of RLHF while\nmaintaining satisfactory model performance. Experimental results demonstrate\nthat our proposed auction-based protocol is cost-efficient for fine-tuning LLMs\nby concentrating on high-quality feedback.\n","authors":["Guoxi Zhang","Jiuding Duan"],"pdf_url":"https://arxiv.org/pdf/2409.18417v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.18412v1","updated":"2024-09-27T03:00:29Z","published":"2024-09-27T03:00:29Z","title":"SciDFM: A Large Language Model with Mixture-of-Experts for Science","summary":" Recently, there has been a significant upsurge of interest in leveraging\nlarge language models (LLMs) to assist scientific discovery. However, most LLMs\nonly focus on general science, while they lack domain-specific knowledge, such\nas chemical molecules and amino acid sequences. To bridge these gaps, we\nintroduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and\nis able to conduct college-level scientific reasoning and understand molecules\nand amino acid sequences. We collect a large-scale training corpus containing\nnumerous scientific papers and books from different disciplines as well as data\nfrom domain-specific databases. We further fine-tune the pre-trained model on\nlots of instruction data to improve performances on downstream benchmarks. From\nexperiment results, we show that SciDFM achieves strong performance on general\nscientific benchmarks such as SciEval and SciQ, and it reaches a SOTA\nperformance on domain-specific benchmarks among models of similar size. We\nfurther analyze the expert layers and show that the results of expert selection\nvary with data from different disciplines. To benefit the broader research\ncommunity, we open-source SciDFM at\nhttps://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0.\n","authors":["Liangtai Sun","Danyu Luo","Da Ma","Zihan Zhao","Baocai Chen","Zhennan Shen","Su Zhu","Lu Chen","Xin Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18412v1.pdf","comment":"12 pages, 1 figure, 9 tables. Technical Report, Under Review"},{"id":"http://arxiv.org/abs/2301.00234v5","updated":"2024-09-27T02:55:06Z","published":"2022-12-31T15:57:09Z","title":"A Survey on In-context Learning","summary":" With the increasing capabilities of large language models (LLMs), in-context\nlearning (ICL) has emerged as a new paradigm for natural language processing\n(NLP), where LLMs make predictions based on contexts augmented with a few\nexamples. It has been a significant trend to explore ICL to evaluate and\nextrapolate the ability of LLMs. In this paper, we aim to survey and summarize\nthe progress and challenges of ICL. We first present a formal definition of ICL\nand clarify its correlation to related studies. Then, we organize and discuss\nadvanced techniques, including training strategies, prompt designing\nstrategies, and related analysis. Additionally, we explore various ICL\napplication scenarios, such as data engineering and knowledge updating.\nFinally, we address the challenges of ICL and suggest potential directions for\nfurther research. We hope that our work can encourage more research on\nuncovering how ICL works and improving ICL.\n","authors":["Qingxiu Dong","Lei Li","Damai Dai","Ce Zheng","Jingyuan Ma","Rui Li","Heming Xia","Jingjing Xu","Zhiyong Wu","Tianyu Liu","Baobao Chang","Xu Sun","Lei Li","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2301.00234v5.pdf","comment":"Update"},{"id":"http://arxiv.org/abs/2409.18365v1","updated":"2024-09-27T00:49:27Z","published":"2024-09-27T00:49:27Z","title":"Defect Prediction with Content-based Features","summary":" Traditional defect prediction approaches often use metrics that measure the\ncomplexity of the design or implementing code of a software system, such as the\nnumber of lines of code in a source file. In this paper, we explore a different\napproach based on content of source code. Our key assumption is that source\ncode of a software system contains information about its technical aspects and\nthose aspects might have different levels of defect-proneness. Thus,\ncontent-based features such as words, topics, data types, and package names\nextracted from a source code file could be used to predict its defects. We have\nperformed an extensive empirical evaluation and found that: i) such\ncontent-based features have higher predictive power than code complexity\nmetrics and ii) the use of feature selection, reduction, and combination\nfurther improves the prediction performance.\n","authors":["Hung Viet Pham","Tung Thanh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.18365v1.pdf","comment":null}],"General Literature":[{"id":"http://arxiv.org/abs/2301.09771v6","updated":"2024-09-27T06:57:04Z","published":"2023-01-24T00:57:37Z","title":"Automation and AI Technology in Surface Mining With a Brief Introduction\n to Open-Pit Operations in the Pilbara","summary":" This survey article provides a synopsis on some of the engineering problems,\ntechnological innovations, robotic development and automation efforts\nencountered in the mining industry -- particularly in the Pilbara iron-ore\nregion of Western Australia. The goal is to paint the technology landscape and\nhighlight issues relevant to an engineering audience to raise awareness of AI\nand automation trends in mining. It assumes the reader has no prior knowledge\nof mining and builds context gradually through focused discussion and short\nsummaries of common open-pit mining operations. The principal activities that\ntake place may be categorized in terms of resource development, mine-, rail-\nand port operations. From mineral exploration to ore shipment, there are\nroughly nine steps in between. These include: geological assessment, mine\nplanning and development, production drilling and assaying, blasting and\nexcavation, transportation of ore and waste, crush and screen, stockpile and\nload-out, rail network distribution, and ore-car dumping. The objective is to\ndescribe these processes and provide insights on some of the\nchallenges/opportunities from the perspective of a decade-long\nindustry-university R&D partnership.\n","authors":["Raymond Leung","Andrew J Hill","Arman Melkumyan"],"pdf_url":"https://arxiv.org/pdf/2301.09771v6.pdf","comment":"Accepted manuscript. Paper provides insights on state-of-the-art\n technologies and future trends. Keywords: Mining automation, robotics,\n intelligent systems, machine learning, remote sensing, geostatistics,\n planning, scheduling, optimization, modelling, geology, complex systems.\n Document: 21 pages, 6 figures, 2 tables. 2024 Update: Added ICRA conference\n poster + slides as ancilliary files"}]},"2024-09-25T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2409.17429v1","updated":"2024-09-25T23:32:59Z","published":"2024-09-25T23:32:59Z","title":"Real-World Data Inspired Interactive Connected Traffic Scenario\n Generation","summary":" Simulation is a crucial step in ensuring accurate, efficient, and realistic\nConnected and Autonomous Vehicles (CAVs) testing and validation. As the\nadoption of CAV accelerates, the integration of real-world data into simulation\nenvironments becomes increasingly critical. Among various technologies utilized\nby CAVs, Vehicle-to-Everything (V2X) communication plays a crucial role in\nensuring a seamless transmission of information between CAVs, infrastructure,\nand other road users. However, most existing studies have focused on developing\nand testing communication protocols, resource allocation strategies, and data\ndissemination techniques in V2X. There is a gap where real-world V2X data is\nintegrated into simulations to generate diverse and high-fidelity traffic\nscenarios. To fulfill this research gap, we leverage real-world Signal Phase\nand Timing (SPaT) data from Roadside Units (RSUs) to enhance the fidelity of\nCAV simulations. Moreover, we developed an algorithm that enables Autonomous\nVehicles (AVs) to respond dynamically to real-time traffic signal data,\nsimulating realistic V2X communication scenarios. Such high-fidelity simulation\nenvironments can generate multimodal data, including trajectory, semantic\ncamera, depth camera, and bird's eye view data for various traffic scenarios.\nThe generated scenarios and data provide invaluable insights into AVs'\ninteractions with traffic infrastructure and other road users. This work aims\nto bridge the gap between theoretical research and practical deployment of\nCAVs, facilitating the development of smarter and safer transportation systems.\n","authors":["Junwei You","Pei Li","Yang Cheng","Keshu Wu","Rui Gan","Steven T. Parker","Bin Ran"],"pdf_url":"https://arxiv.org/pdf/2409.17429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17395v1","updated":"2024-09-25T22:13:50Z","published":"2024-09-25T22:13:50Z","title":"An Anatomy-Aware Shared Control Approach for Assisted Teleoperation of\n Lung Ultrasound Examinations","summary":" The introduction of artificial intelligence and robotics in telehealth is\nenabling personalised treatment and supporting teleoperated procedures such as\nlung ultrasound, which has gained attention during the COVID-19 pandemic.\nAlthough fully autonomous systems face challenges due to anatomical\nvariability, teleoperated systems appear to be more practical in current\nhealthcare settings. This paper presents an anatomy-aware control framework for\nteleoperated lung ultrasound. Using biomechanically accurate 3D models such as\nSMPL and SKEL, the system provides a real-time visual feedback and applies\nvirtual constraints to assist in precise probe placement tasks. Evaluations on\nfive subjects show the accuracy of the biomechanical models and the efficiency\nof the system in improving probe placement and reducing procedure time compared\nto traditional teleoperation. The results demonstrate that the proposed\nframework enhances the physician's capabilities in executing remote lung\nultrasound examinations, towards more objective and repeatable acquisitions.\n","authors":["Davide Nardi","Edoardo Lamon","Luca Beber","Daniele Fontanelli","Matteo Saveriano","Luigi Palopoli"],"pdf_url":"https://arxiv.org/pdf/2409.17395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17389v1","updated":"2024-09-25T22:06:09Z","published":"2024-09-25T22:06:09Z","title":"Safe Leaf Manipulation for Accurate Shape and Pose Estimation of\n Occluded Fruits","summary":" Fruit monitoring plays an important role in crop management, and rising\nglobal fruit consumption combined with labor shortages necessitates automated\nmonitoring with robots. However, occlusions from plant foliage often hinder\naccurate shape and pose estimation. Therefore, we propose an active fruit shape\nand pose estimation method that physically manipulates occluding leaves to\nreveal hidden fruits. This paper introduces a framework that plans robot\nactions to maximize visibility and minimize leaf damage. We developed a novel\nscene-consistent shape completion technique to improve fruit estimation under\nheavy occlusion and utilize a perception-driven deformation graph model to\npredict leaf deformation during planning. Experiments on artificial and real\nsweet pepper plants demonstrate that our method enables robots to safely move\nleaves aside, exposing fruits for accurate shape and pose estimation,\noutperforming baseline methods. Project page:\nhttps://shaoxiongyao.github.io/lmap-ssc/.\n","authors":["Shaoxiong Yao","Sicong Pan","Maren Bennewitz","Kris Hauser"],"pdf_url":"https://arxiv.org/pdf/2409.17389v1.pdf","comment":"Shaoxiong Yao and Sicong Pan have equal contributions. Submitted to\n ICRA 2025"},{"id":"http://arxiv.org/abs/2409.17379v1","updated":"2024-09-25T21:46:44Z","published":"2024-09-25T21:46:44Z","title":"Decentralized Nonlinear Model Predictive Control for Safe Collision\n Avoidance in Quadrotor Teams with Limited Detection Range","summary":" Multi-quadrotor systems face significant challenges in decentralized control,\nparticularly with safety and coordination under sensing and communication\nlimitations. State-of-the-art methods leverage Control Barrier Functions (CBFs)\nto provide safety guarantees but often neglect actuation constraints and\nlimited detection range. To address these gaps, we propose a novel\ndecentralized Nonlinear Model Predictive Control (NMPC) that integrates\nExponential CBFs (ECBFs) to enhance safety and optimality in multi-quadrotor\nsystems. We provide both conservative and practical minimum bounds of the range\nthat preserve the safety guarantees of the ECBFs. We validate our approach\nthrough extensive simulations with up to 10 quadrotors and 20 obstacles, as\nwell as real-world experiments with 3 quadrotors. Results demonstrate the\neffectiveness of the proposed framework in realistic settings, highlighting its\npotential for reliable quadrotor teams operations.\n","authors":["Manohari Goarin","Guanrui Li","Alessandro Saviolo","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2409.17379v1.pdf","comment":"7 pages, 5 figures, Submitted to the IEEE International Conference on\n Robotics and Automation (ICRA) 2025"},{"id":"http://arxiv.org/abs/2304.04919v2","updated":"2024-09-25T21:17:00Z","published":"2023-04-11T01:15:25Z","title":"Design, Integration, and Field Evaluation of a Robotic Blossom Thinning\n System for Tree Fruit Crops","summary":" The US apple industry relies heavily on semi-skilled manual labor force for\nessential field operations such as training, pruning, blossom and green fruit\nthinning, and harvesting. Blossom thinning is one of the crucial crop load\nmanagement practices to achieve desired crop load, fruit quality, and return\nbloom. While several techniques such as chemical, and mechanical thinning are\navailable for large-scale blossom thinning such approaches often yield\nunpredictable thinning results and may cause damage the canopy, spurs, and leaf\ntissue. Hence, growers still depend on laborious, labor intensive and expensive\nmanual hand blossom thinning for desired thinning outcomes. This research\npresents a robotic solution for blossom thinning in apple orchards using a\ncomputer vision system with artificial intelligence, a six degrees of freedom\nrobotic manipulator, and an electrically actuated miniature end-effector for\nrobotic blossom thinning. The integrated robotic system was evaluated in a\ncommercial apple orchard which showed promising results for targeted and\nselective blossom thinning. Two thinning approaches, center and boundary\nthinning, were investigated to evaluate the system ability to remove varying\nproportion of flowers from apple flower clusters. During boundary thinning the\nend effector was actuated around the cluster boundary while center thinning\ninvolved end-effector actuation only at the cluster centroid for a fixed\nduration of 2 seconds. The boundary thinning approach thinned 67.2% of flowers\nfrom the targeted clusters with a cycle time of 9.0 seconds per cluster,\nwhereas center thinning approach thinned 59.4% of flowers with a cycle time of\n7.2 seconds per cluster. When commercially adopted, the proposed system could\nhelp address problems faced by apple growers with current hand, chemical, and\nmechanical blossom thinning approaches.\n","authors":["Uddhav Bhattarai","Qin Zhang","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2304.04919v2.pdf","comment":"Accepted for publication in the Journal of Field Robotics"},{"id":"http://arxiv.org/abs/2409.17359v1","updated":"2024-09-25T21:08:25Z","published":"2024-09-25T21:08:25Z","title":"Data-driven Probabilistic Trajectory Learning with High Temporal\n Resolution in Terminal Airspace","summary":" Predicting flight trajectories is a research area that holds significant\nmerit. In this paper, we propose a data-driven learning framework, that\nleverages the predictive and feature extraction capabilities of the mixture\nmodels and seq2seq-based neural networks while addressing prevalent challenges\ncaused by error propagation and dimensionality reduction. After training with\nthis framework, the learned model can improve long-step prediction accuracy\nsignificantly given the past trajectories and the context information. The\naccuracy and effectiveness of the approach are evaluated by comparing the\npredicted trajectories with the ground truth. The results indicate that the\nproposed method has outperformed the state-of-the-art predicting methods on a\nterminal airspace flight trajectory dataset. The trajectories generated by the\nproposed method have a higher temporal resolution(1 timestep per second vs 0.1\ntimestep per second) and are closer to the ground truth.\n","authors":["Jun Xiang","Jun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17359v1.pdf","comment":"Submitted to AIAA-JAIS"},{"id":"http://arxiv.org/abs/2409.17345v1","updated":"2024-09-25T20:45:19Z","published":"2024-09-25T20:45:19Z","title":"SeaSplat: Representing Underwater Scenes with 3D Gaussian Splatting and\n a Physically Grounded Image Formation Model","summary":" We introduce SeaSplat, a method to enable real-time rendering of underwater\nscenes leveraging recent advances in 3D radiance fields. Underwater scenes are\nchallenging visual environments, as rendering through a medium such as water\nintroduces both range and color dependent effects on image capture. We\nconstrain 3D Gaussian Splatting (3DGS), a recent advance in radiance fields\nenabling rapid training and real-time rendering of full 3D scenes, with a\nphysically grounded underwater image formation model. Applying SeaSplat to the\nreal-world scenes from SeaThru-NeRF dataset, a scene collected by an underwater\nvehicle in the US Virgin Islands, and simulation-degraded real-world scenes,\nnot only do we see increased quantitative performance on rendering novel\nviewpoints from the scene with the medium present, but are also able to recover\nthe underlying true color of the scene and restore renders to be without the\npresence of the intervening medium. We show that the underwater image formation\nhelps learn scene structure, with better depth maps, as well as show that our\nimprovements maintain the significant computational improvements afforded by\nleveraging a 3D Gaussian representation.\n","authors":["Daniel Yang","John J. Leonard","Yogesh Girdhar"],"pdf_url":"https://arxiv.org/pdf/2409.17345v1.pdf","comment":"Project page here: https://seasplat.github.io"},{"id":"http://arxiv.org/abs/2409.17340v1","updated":"2024-09-25T20:28:57Z","published":"2024-09-25T20:28:57Z","title":"Koopman-driven grip force prediction through EMG sensing","summary":" Loss of hand function due to conditions like stroke or multiple sclerosis\nsignificantly impacts daily activities. Robotic rehabilitation provides tools\nto restore hand function, while novel methods based on surface electromyography\n(sEMG) enable the adaptation of the device's force output according to the\nuser's condition, thereby improving rehabilitation outcomes. This study aims to\nachieve accurate force estimations during medium wrap grasps using a single\nsEMG sensor pair, thereby addressing the challenge of escalating sensor\nrequirements for precise predictions. We conducted sEMG measurements on 13\nsubjects at two forearm positions, validating results with a hand dynamometer.\nWe established flexible signal-processing steps, yielding high peak\ncross-correlations between the processed sEMG signal (representing meaningful\nmuscle activity) and grip force. Influential parameters were subsequently\nidentified through sensitivity analysis. Leveraging a novel data-driven Koopman\noperator theory-based approach and problem-specific data lifting techniques, we\ndevised a methodology for the estimation and short-term prediction of grip\nforce from processed sEMG signals. A weighted mean absolute percentage error\n(wMAPE) of approx. 5.5% was achieved for the estimated grip force, whereas\npredictions with a 0.5-second prediction horizon resulted in a wMAPE of approx.\n17.9%. The methodology proved robust regarding precise electrode positioning,\nas the effect of sensing position on error metrics was non-significant. The\nalgorithm executes exceptionally fast, processing, estimating, and predicting a\n0.5-second sEMG signal batch in just approx. 30 ms, facilitating real-time\nimplementation.\n","authors":["Tomislav Bazina","Ervin Kamenar","Maria Fonoberova","Igor Mezić"],"pdf_url":"https://arxiv.org/pdf/2409.17340v1.pdf","comment":"11 pages, 8 figures, journal"},{"id":"http://arxiv.org/abs/2409.16030v2","updated":"2024-09-25T18:51:25Z","published":"2024-09-24T12:29:44Z","title":"MHRC: Closed-loop Decentralized Multi-Heterogeneous Robot Collaboration\n with Large Language Models","summary":" The integration of large language models (LLMs) with robotics has\nsignificantly advanced robots' abilities in perception, cognition, and task\nplanning. The use of natural language interfaces offers a unified approach for\nexpressing the capability differences of heterogeneous robots, facilitating\ncommunication between them, and enabling seamless task allocation and\ncollaboration. Currently, the utilization of LLMs to achieve decentralized\nmulti-heterogeneous robot collaborative tasks remains an under-explored area of\nresearch. In this paper, we introduce a novel framework that utilizes LLMs to\nachieve decentralized collaboration among multiple heterogeneous robots. Our\nframework supports three robot categories, mobile robots, manipulation robots,\nand mobile manipulation robots, working together to complete tasks such as\nexploration, transportation, and organization. We developed a rich set of\ntextual feedback mechanisms and chain-of-thought (CoT) prompts to enhance task\nplanning efficiency and overall system performance. The mobile manipulation\nrobot can adjust its base position flexibly, ensuring optimal conditions for\ngrasping tasks. The manipulation robot can comprehend task requirements, seek\nassistance when necessary, and handle objects appropriately. Meanwhile, the\nmobile robot can explore the environment extensively, map object locations, and\ncommunicate this information to the mobile manipulation robot, thus improving\ntask execution efficiency. We evaluated the framework using PyBullet, creating\nscenarios with three different room layouts and three distinct operational\ntasks. We tested various LLM models and conducted ablation studies to assess\nthe contributions of different modules. The experimental results confirm the\neffectiveness and necessity of our proposed framework.\n","authors":["Wenhao Yu","Jie Peng","Yueliang Ying","Sai Li","Jianmin Ji","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17277v1","updated":"2024-09-25T18:43:58Z","published":"2024-09-25T18:43:58Z","title":"Building Real-time Awareness of Out-of-distribution in Trajectory\n Prediction for Autonomous Vehicles","summary":" Trajectory prediction describes the motions of surrounding moving obstacles\nfor an autonomous vehicle; it plays a crucial role in enabling timely\ndecision-making, such as collision avoidance and trajectory replanning.\nAccurate trajectory planning is the key to reliable vehicle deployments in\nopen-world environment, where unstructured obstacles bring in uncertainties\nthat are impossible to fully capture by training data. For traditional machine\nlearning tasks, such uncertainties are often addressed reasonably well via\nmethods such as continual learning. On the one hand, naively applying those\nmethods to trajectory prediction can result in continuous data collection and\nfrequent model updates, which can be resource-intensive. On the other hand, the\npredicted trajectories can be far away from the true trajectories, leading to\nunsafe decision-making. In this paper, we aim to establish real-time awareness\nof out-of-distribution in trajectory prediction for autonomous vehicles. We\nfocus on the challenging and practically relevant setting where the\nout-of-distribution is deceptive, that is, the one not easily detectable by\nhuman intuition. Drawing on the well-established techniques of sequential\nanalysis, we build real-time awareness of out-of-distribution by monitoring\nprediction errors using the quickest change point detection (QCD). Our\nsolutions are lightweight and can handle the occurrence of out-of-distribution\nat any time during trajectory prediction inference. Experimental results on\nmultiple real-world datasets using a benchmark trajectory prediction model\ndemonstrate the effectiveness of our methods.\n","authors":[" Tongfei"," Guo","Taposh Banerjee","Rui Liu","Lili Su"],"pdf_url":"https://arxiv.org/pdf/2409.17277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17262v1","updated":"2024-09-25T18:20:05Z","published":"2024-09-25T18:20:05Z","title":"CROSS-GAiT: Cross-Attention-Based Multimodal Representation Fusion for\n Parametric Gait Adaptation in Complex Terrains","summary":" We present CROSS-GAiT, a novel algorithm for quadruped robots that uses Cross\nAttention to fuse terrain representations derived from visual and time-series\ninputs, including linear accelerations, angular velocities, and joint efforts.\nThese fused representations are used to adjust the robot's step height and hip\nsplay, enabling adaptive gaits that respond dynamically to varying terrain\nconditions. We generate these terrain representations by processing visual\ninputs through a masked Vision Transformer (ViT) encoder and time-series data\nthrough a dilated causal convolutional encoder. The cross-attention mechanism\nthen selects and integrates the most relevant features from each modality,\ncombining terrain characteristics with robot dynamics for better-informed gait\nadjustments. CROSS-GAiT uses the combined representation to dynamically adjust\ngait parameters in response to varying and unpredictable terrains. We train\nCROSS-GAiT on data from diverse terrains, including asphalt, concrete, brick\npavements, grass, dense vegetation, pebbles, gravel, and sand. Our algorithm\ngeneralizes well and adapts to unseen environmental conditions, enhancing\nreal-time navigation performance. CROSS-GAiT was implemented on a Ghost\nRobotics Vision 60 robot and extensively tested in complex terrains with high\nvegetation density, uneven/unstable surfaces, sand banks, deformable\nsubstrates, etc. We observe at least a 7.04% reduction in IMU energy density\nand a 27.3% reduction in total joint effort, which directly correlates with\nincreased stability and reduced energy usage when compared to state-of-the-art\nmethods. Furthermore, CROSS-GAiT demonstrates at least a 64.5% increase in\nsuccess rate and a 4.91% reduction in time to reach the goal in four complex\nscenarios. Additionally, the learned representations perform 4.48% better than\nthe state-of-the-art on a terrain classification task.\n","authors":["Gershom Seneviratne","Kasun Weerakoon","Mohamed Elnoor","Vignesh Rajgopal","Harshavarthan Varatharajan","Mohamed Khalid M Jaffar","Jason Pusey","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2409.17262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17131v1","updated":"2024-09-25T17:49:02Z","published":"2024-09-25T17:49:02Z","title":"Enhancing robot reliability for health-care facilities by means of\n Human-Aware Navigation Planning","summary":" With the aim of enabling robots to cooperate with humans, carry out\nhuman-like tasks, or navigate among humans, we need to ensure that they are\nequipped with the ability to comprehend human behaviors and use the extracted\nknowledge for intelligent decision-making. This ability is particularly\nimportant in the safety-critical and human-centred environment of health-care\ninstitutions. In the field of robotic navigation, the most cutting-edge\napproaches to enhancing robot reliability in the application domain of\nhealthcare facilities and in general pertain to augmenting navigation systems\nwith human-aware properties. To implement this in our work, the Co-operative\nHuman-Aware Navigation planner has been integrated into the ROS-based\ndifferential-drive robot MARRtina and exhaustively challenged within various\nsimulated contexts and scenarios (mainly modelling the situations relevant in\nthe medical domain) to draw attention to the integrated system's benefits and\nidentify its drawbacks or instances of poor performance while exploring the\nscope of system capabilities and creating a full characterization of its\napplicability. The simulation results are then presented to medical experts,\nand the enhanced robot acceptability within the domain is validated with them\nas the robot is further planned for deployment.\n","authors":["Olga E. Sorokoletova","Lucca Iocchi"],"pdf_url":"https://arxiv.org/pdf/2409.17131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17126v1","updated":"2024-09-25T17:42:20Z","published":"2024-09-25T17:42:20Z","title":"Blox-Net: Generative Design-for-Robot-Assembly Using VLM Supervision,\n Physics Simulation, and a Robot with Reset","summary":" Generative AI systems have shown impressive capabilities in creating text,\ncode, and images. Inspired by the rich history of research in industrial\n''Design for Assembly'', we introduce a novel problem: Generative\nDesign-for-Robot-Assembly (GDfRA). The task is to generate an assembly based on\na natural language prompt (e.g., ''giraffe'') and an image of available\nphysical components, such as 3D-printed blocks. The output is an assembly, a\nspatial arrangement of these components, and instructions for a robot to build\nthis assembly. The output must 1) resemble the requested object and 2) be\nreliably assembled by a 6 DoF robot arm with a suction gripper. We then present\nBlox-Net, a GDfRA system that combines generative vision language models with\nwell-established methods in computer vision, simulation, perturbation analysis,\nmotion planning, and physical robot experimentation to solve a class of GDfRA\nproblems with minimal human supervision. Blox-Net achieved a Top-1 accuracy of\n63.5% in the ''recognizability'' of its designed assemblies (eg, resembling\ngiraffe as judged by a VLM). These designs, after automated perturbation\nredesign, were reliably assembled by a robot, achieving near-perfect success\nacross 10 consecutive assembly iterations with human intervention only during\nreset prior to assembly. Surprisingly, this entire design process from textual\nword (''giraffe'') to reliable physical assembly is performed with zero human\nintervention.\n","authors":["Andrew Goldberg","Kavish Kondap","Tianshuang Qiu","Zehan Ma","Letian Fu","Justin Kerr","Huang Huang","Kaiyuan Chen","Kuan Fang","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2409.17126v1.pdf","comment":"8 pages, 7 Figures"},{"id":"http://arxiv.org/abs/2409.17124v1","updated":"2024-09-25T17:38:20Z","published":"2024-09-25T17:38:20Z","title":"PokeFlex: Towards a Real-World Dataset of Deformable Objects for Robotic\n Manipulation","summary":" Advancing robotic manipulation of deformable objects can enable automation of\nrepetitive tasks across multiple industries, from food processing to textiles\nand healthcare. Yet robots struggle with the high dimensionality of deformable\nobjects and their complex dynamics. While data-driven methods have shown\npotential for solving manipulation tasks, their application in the domain of\ndeformable objects has been constrained by the lack of data. To address this,\nwe propose PokeFlex, a pilot dataset featuring real-world 3D mesh data of\nactively deformed objects, together with the corresponding forces and torques\napplied by a robotic arm, using a simple poking strategy. Deformations are\ncaptured with a professional volumetric capture system that allows for complete\n360-degree reconstruction. The PokeFlex dataset consists of five deformable\nobjects with varying stiffness and shapes. Additionally, we leverage the\nPokeFlex dataset to train a vision model for online 3D mesh reconstruction from\na single image and a template mesh. We refer readers to the supplementary\nmaterial and to our website ( https://pokeflex-dataset.github.io/ ) for demos\nand examples of our dataset.\n","authors":["Jan Obrist","Miguel Zamora","Hehui Zheng","Juan Zarate","Robert K. Katzschmann","Stelian Coros"],"pdf_url":"https://arxiv.org/pdf/2409.17124v1.pdf","comment":"Extended Abstract, 40th Anniversary of the IEEE International\n Conference on Robotics and Automation. (ICRA@40 Rotterdam 2024)"},{"id":"http://arxiv.org/abs/2409.17116v1","updated":"2024-09-25T17:28:36Z","published":"2024-09-25T17:28:36Z","title":"Hierarchical Tri-manual Planning for Vision-assisted Fruit Harvesting\n with Quadrupedal Robots","summary":" This paper addresses the challenge of developing a multi-arm quadrupedal\nrobot capable of efficiently harvesting fruit in complex, natural environments.\nTo overcome the inherent limitations of traditional bimanual manipulation, we\nintroduce the first three-arm quadrupedal robot LocoHarv-3 and propose a novel\nhierarchical tri-manual planning approach, enabling automated fruit harvesting\nwith collision-free trajectories. Our comprehensive semi-autonomous framework\nintegrates teleoperation, supported by LiDAR-based odometry and mapping, with\nlearning-based visual perception for accurate fruit detection and pose\nestimation. Validation is conducted through a series of controlled indoor\nexperiments using motion capture and extensive field tests in natural settings.\nResults demonstrate a 90\\% success rate in in-lab settings with a single\nattempt, and field trials further verify the system's robustness and efficiency\nin more challenging real-world environments.\n","authors":["Zhichao Liu","Jingzong Zhou","Konstantinos Karydis"],"pdf_url":"https://arxiv.org/pdf/2409.17116v1.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.17114v1","updated":"2024-09-25T17:27:52Z","published":"2024-09-25T17:27:52Z","title":"Towards human-like kinematics in industrial robotic arms: a case study\n on a UR3 robot","summary":" Safety in industrial robotic environments is a hot research topic in the area\nof human-robot interaction (HRI). Up to now, a robotic arm on an assembly line\ninteracts with other machines away from human workers. Nowadays, robotic arm\nmanufactures are aimed to their robots could increasingly perform tasks\ncollaborating with humans. One of the ways to improve this collaboration is by\nmaking the movement of robots more humanlike. This way, it would be easier for\na human to foresee the movement of the robot and approach it without fear of\ncontact. The main difference between the movement of a human and of a robotic\narm is that the former has a bell-shaped speed profile while the latter has a\nuniform speed one. To generate this speed profile, the kinematic theory of\nrapid human movements and its Sigma-Lognormal model has been used. This model\nis widely used to explain most of the basic phenomena related to the control of\nhuman movements. Both human-like and robotic-like movements are transferred to\nthe UR3 robot. In this paper we detail the how the UR3 robot was programmed to\nproduce both kinds of movement. The dissimilarities result between the input\nmotion and output motion to the robot confirm the possibility to develop\nhuman-like velocities in the UR3 robot.\n","authors":["Adam Wolniakowski","Kanstantsin Miatliuk","Jose J. Quintana","Miguel A. Ferrer","Moises Diaz"],"pdf_url":"https://arxiv.org/pdf/2409.17114v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17111v1","updated":"2024-09-25T17:25:53Z","published":"2024-09-25T17:25:53Z","title":"Self-Sensing for Proprioception and Contact Detection in Soft Robots\n Using Shape Memory Alloy Artificial Muscles","summary":" Estimating a soft robot's pose and applied forces, also called\nproprioception, is crucial for safe interaction of the robot with its\nenvironment. However, most solutions for soft robot proprioception use\ndedicated sensors, particularly for external forces, which introduce design\ntrade-offs, rigidity, and risk of failure. This work presents an approach for\npose estimation and contact detection for soft robots actuated by shape memory\nalloy (SMA) artificial muscles, using no dedicated force sensors. Our framework\nuses the unique material properties of SMAs to self-sense their internal\nstress, via offboard measurements of their electrical resistance and in-situ\ntemperature readings, in an existing fully-soft limb design. We demonstrate\nthat a simple polynomial regression model on these measurements is sufficient\nto predict the robot's pose, under no-contact conditions. Then, we show that if\nan additional measurement of the true pose is available (e.g. from an\nalready-in-place bending sensor), it is possible to predict a binary\ncontact/no-contact using multiple combinations of self-sensing signals. Our\nhardware tests verify our hypothesis via a contact detection test with a human\noperator. This proof-of-concept validates that self-sensing signals in soft\nSMA-actuated soft robots can be used for proprioception and contact detection,\nand suggests a direction for integrating proprioception into soft robots\nwithout design compromises. Future work could employ machine learning for\nenhanced accuracy.\n","authors":["Ran Jing","Meredith L. Anderson","Juan C. Pacheco Garcia","Andrew P. Sabelhaus"],"pdf_url":"https://arxiv.org/pdf/2409.17111v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17079v1","updated":"2024-09-25T16:44:53Z","published":"2024-09-25T16:44:53Z","title":"Collision-free time-optimal path parameterization for multi-robot teams","summary":" Coordinating the motion of multiple robots in cluttered environments remains\na computationally challenging task. We study the problem of minimizing the\nexecution time of a set of geometric paths by a team of robots with\nstate-dependent actuation constraints. We propose a Time-Optimal Path\nParameterization (TOPP) algorithm for multiple car-like agents, where the\nmodulation of the timing of every robot along its assigned path is employed to\nensure collision avoidance and dynamic feasibility. This is achieved through\nthe use of a priority queue to determine the order of trajectory execution for\neach robot while taking into account all possible collisions with higher\npriority robots in a spatiotemporal graph. We show a 10-20% reduction in\nmakespan against existing state-of-the-art methods and validate our approach\nthrough simulations and hardware experiments.\n","authors":["Katherine Mao","Igor Spasojevic","Malakhi Hopkins","M. Ani Hsieh","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.17079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17208v1","updated":"2024-09-25T16:15:06Z","published":"2024-09-25T16:15:06Z","title":"2024 BRAVO Challenge Track 1 1st Place Report: Evaluating Robustness of\n Vision Foundation Models for Semantic Segmentation","summary":" In this report, we present our solution for Track 1 of the 2024 BRAVO\nChallenge, where a model is trained on Cityscapes and its robustness is\nevaluated on several out-of-distribution datasets. Our solution leverages the\npowerful representations learned by vision foundation models, by attaching a\nsimple segmentation decoder to DINOv2 and fine-tuning the entire model. This\napproach outperforms more complex existing approaches, and achieves 1st place\nin the challenge. Our code is publicly available at\nhttps://github.com/tue-mps/benchmark-vfm-ss.\n","authors":["Tommie Kerssies","Daan de Geus","Gijs Dubbelman"],"pdf_url":"https://arxiv.org/pdf/2409.17208v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2409.15107"},{"id":"http://arxiv.org/abs/2409.17004v1","updated":"2024-09-25T15:07:47Z","published":"2024-09-25T15:07:47Z","title":"Semantically-Driven Disambiguation for Human-Robot Interaction","summary":" Ambiguities are common in human-robot interaction, especially when a robot\nfollows user instructions in a large collocated space. For instance, when the\nuser asks the robot to find an object in a home environment, the object might\nbe in several places depending on its varying semantic properties (e.g., a bowl\ncan be in the kitchen cabinet or on the dining room table, depending on whether\nit is clean/dirty, full/empty and the other objects around it). Previous works\non object semantics have predicted such relationships using one shot-inferences\nwhich are likely to fail for ambiguous or partially understood instructions.\nThis paper focuses on this gap and suggests a semantically-driven\ndisambiguation approach by utilizing follow-up clarifications to handle such\nuncertainties. To achieve this, we first obtain semantic knowledge embeddings,\nand then these embeddings are used to generate clarifying questions by\nfollowing an iterative process. The evaluation of our method shows that our\napproach is model agnostic, i.e., applicable to different semantic embedding\nmodels, and follow-up clarifications improve the performance regardless of the\nembedding model. Additionally, our ablation studies show the significance of\ninformative clarifications and iterative predictions to enhance system\naccuracies.\n","authors":["Fethiye Irmak Dogan","Weiyu Liu","Iolanda Leite","Sonia Chernova"],"pdf_url":"https://arxiv.org/pdf/2409.17004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16999v1","updated":"2024-09-25T15:04:21Z","published":"2024-09-25T15:04:21Z","title":"WasteGAN: Data Augmentation for Robotic Waste Sorting through Generative\n Adversarial Networks","summary":" Robotic waste sorting poses significant challenges in both perception and\nmanipulation, given the extreme variability of objects that should be\nrecognized on a cluttered conveyor belt. While deep learning has proven\neffective in solving complex tasks, the necessity for extensive data collection\nand labeling limits its applicability in real-world scenarios like waste\nsorting. To tackle this issue, we introduce a data augmentation method based on\na novel GAN architecture called wasteGAN. The proposed method allows to\nincrease the performance of semantic segmentation models, starting from a very\nlimited bunch of labeled examples, such as few as 100. The key innovations of\nwasteGAN include a novel loss function, a novel activation function, and a\nlarger generator block. Overall, such innovations helps the network to learn\nfrom limited number of examples and synthesize data that better mirrors\nreal-world distributions. We then leverage the higher-quality segmentation\nmasks predicted from models trained on the wasteGAN synthetic data to compute\nsemantic-aware grasp poses, enabling a robotic arm to effectively recognizing\ncontaminants and separating waste in a real-world scenario. Through\ncomprehensive evaluation encompassing dataset-based assessments and real-world\nexperiments, our methodology demonstrated promising potential for robotic waste\nsorting, yielding performance gains of up to 5.8\\% in picking contaminants. The\nproject page is available at https://github.com/bach05/wasteGAN.git\n","authors":["Alberto Bacchin","Leonardo Barcellona","Matteo Terreran","Stefano Ghidoni","Emanuele Menegatti","Takuya Kiyokawa"],"pdf_url":"https://arxiv.org/pdf/2409.16999v1.pdf","comment":"Accepted at 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2309.12784v4","updated":"2024-09-25T14:47:58Z","published":"2023-09-22T10:51:49Z","title":"Learning to Walk and Fly with Adversarial Motion Priors","summary":" Robot multimodal locomotion encompasses the ability to transition between\nwalking and flying, representing a significant challenge in robotics. This work\npresents an approach that enables automatic smooth transitions between legged\nand aerial locomotion. Leveraging the concept of Adversarial Motion Priors, our\nmethod allows the robot to imitate motion datasets and accomplish the desired\ntask without the need for complex reward functions. The robot learns walking\npatterns from human-like gaits and aerial locomotion patterns from motions\nobtained using trajectory optimization. Through this process, the robot adapts\nthe locomotion scheme based on environmental feedback using reinforcement\nlearning, with the spontaneous emergence of mode-switching behavior. The\nresults highlight the potential for achieving multimodal locomotion in aerial\nhumanoid robotics through automatic control of walking and flying modes, paving\nthe way for applications in diverse domains such as search and rescue,\nsurveillance, and exploration missions. This research contributes to advancing\nthe capabilities of aerial humanoid robots in terms of versatile locomotion in\nvarious environments.\n","authors":["Giuseppe L'Erario","Drew Hanover","Angel Romero","Yunlong Song","Gabriele Nava","Paolo Maria Viceconte","Daniele Pucci","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2309.12784v4.pdf","comment":"This paper has been accepted for publication at the IEEE/RSJ\n International Conference on Intelligent Robots and Systems (IROS), Abu Dhabi,\n 2024"},{"id":"http://arxiv.org/abs/2409.16976v1","updated":"2024-09-25T14:38:56Z","published":"2024-09-25T14:38:56Z","title":"Hydraulic Volumetric Soft Everting Vine Robot Steering Mechanism for\n Underwater Exploration","summary":" Despite a significant proportion of the Earth being covered in water,\nexploration of what lies below has been limited due to the challenges and\ndifficulties inherent in the process. Current state of the art robots such as\nRemotely Operated Vehicles (ROVs) and Autonomous Underwater Vehicles (AUVs) are\nbulky, rigid and unable to conform to their environment. Soft robotics offers\nsolutions to this issue. Fluid-actuated eversion or growing robots, in\nparticular, are a good example. While current eversion robots have found many\napplications on land, their inherent properties make them particularly well\nsuited to underwater environments. An important factor when considering\nunderwater eversion robots is the establishment of a suitable steering\nmechanism that can enable the robot to change direction as required. This\nproject proposes a design for an eversion robot that is capable of steering\nwhile underwater, through the use of bending pouches, a design commonly seen in\nthe literature on land-based eversion robots. These bending pouches contract to\nenable directional change. Similar to their land-based counterparts, the\nunderwater eversion robot uses the same fluid in the medium it operates in to\nachieve extension and bending but also to additionally aid in neutral buoyancy.\nThe actuation method of bending pouches meant that robots needed to fully\nextend before steering was possible. Three robots, with the same design and\ndimensions were constructed from polyethylene tubes and tested. Our research\nshows that although the soft eversion robot design in this paper was not\ncapable of consistently generating the same amounts of bending for the\ninflation volume, it still achieved suitable bending at a range of inflation\nvolumes and was observed to bend to a maximum angle of 68 degrees at 2000 ml,\nwhich is in line with the bending angles reported for land-based eversion\nrobots in the literature.\n","authors":["Danyaal Kaleel","Benoit Clement","Kaspar Althoefer"],"pdf_url":"https://arxiv.org/pdf/2409.16976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16972v1","updated":"2024-09-25T14:32:59Z","published":"2024-09-25T14:32:59Z","title":"Efficient Submap-based Autonomous MAV Exploration using Visual-Inertial\n SLAM Configurable for LiDARs or Depth Cameras","summary":" Autonomous exploration of unknown space is an essential component for the\ndeployment of mobile robots in the real world. Safe navigation is crucial for\nall robotics applications and requires accurate and consistent maps of the\nrobot's surroundings. To achieve full autonomy and allow deployment in a wide\nvariety of environments, the robot must rely on on-board state estimation which\nis prone to drift over time. We propose a Micro Aerial Vehicle (MAV)\nexploration framework based on local submaps to allow retaining global\nconsistency by applying loop-closure corrections to the relative submap poses.\nTo enable large-scale exploration we efficiently compute global,\nenvironment-wide frontiers from the local submap frontiers and use a\nsampling-based next-best-view exploration planner. Our method seamlessly\nsupports using either a LiDAR sensor or a depth camera, making it suitable for\ndifferent kinds of MAV platforms. We perform comparative evaluations in\nsimulation against a state-of-the-art submap-based exploration framework to\nshowcase the efficiency and reconstruction quality of our approach. Finally, we\ndemonstrate the applicability of our method to real-world MAVs, one equipped\nwith a LiDAR and the other with a depth camera. Video available at\nhttps://youtu.be/Uf5fwmYcuq4 .\n","authors":["Sotiris Papatheodorou","Simon Boche","Sebastián Barbas Laina","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2409.16972v1.pdf","comment":"7 pages, 8 figures, for the accompanying video see\n https://youtu.be/Uf5fwmYcuq4"},{"id":"http://arxiv.org/abs/2409.16967v1","updated":"2024-09-25T14:27:37Z","published":"2024-09-25T14:27:37Z","title":"Multi-Robot Informative Path Planning for Efficient Target Mapping using\n Deep Reinforcement Learning","summary":" Autonomous robots are being employed in several mapping and data collection\ntasks due to their efficiency and low labor costs. In these tasks, the robots\nare required to map targets-of-interest in an unknown environment while\nconstrained to a given resource budget such as path length or mission time.\nThis is a challenging problem as each robot has to not only detect and avoid\ncollisions from static obstacles in the environment but also has to model other\nrobots' trajectories to avoid inter-robot collisions. We propose a novel deep\nreinforcement learning approach for multi-robot informative path planning to\nmap targets-of-interest in an unknown 3D environment. A key aspect of our\napproach is an augmented graph that models other robots' trajectories to enable\nplanning for communication and inter-robot collision avoidance. We train our\ndecentralized reinforcement learning policy via the centralized training and\ndecentralized execution paradigm. Once trained, our policy is also scalable to\nvarying number of robots and does not require re-training. Our approach\noutperforms other state-of-the-art multi-robot target mapping approaches by\n33.75% in terms of the number of discovered targets-of-interest. We open-source\nour code and model at: https://github.com/AccGen99/marl_ipp\n","authors":["Apoorva Vashisth","Dipam Patel","Damon Conover","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2409.16967v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.04894"},{"id":"http://arxiv.org/abs/2408.16559v2","updated":"2024-09-25T14:23:50Z","published":"2024-08-29T14:25:11Z","title":"DroneWiS: Automated Simulation Testing of small Unmanned Aerial Systems\n in Realistic Windy Conditions","summary":" The continuous evolution of small Unmanned Aerial Systems (sUAS) demands\nadvanced testing methodologies to ensure their safe and reliable operations in\nthe real-world. To push the boundaries of sUAS simulation testing in realistic\nenvironments, we previously developed the DroneReqValidator (DRV) platform,\nallowing developers to automatically conduct simulation testing in digital twin\nof earth. In this paper, we present DRV 2.0, which introduces a novel component\ncalled DroneWiS (Drone Wind Simulation). DroneWiS allows sUAS developers to\nautomatically simulate realistic windy conditions and test the resilience of\nsUAS against wind. Unlike current state-of-the-art simulation tools such as\nGazebo and AirSim that only simulate basic wind conditions, DroneWiS leverages\nComputational Fluid Dynamics (CFD) to compute the unique wind flows caused by\nthe interaction of wind with the objects in the environment such as buildings\nand uneven terrains. This simulation capability provides deeper insights to\ndevelopers about the navigation capability of sUAS in challenging and realistic\nwindy conditions. DroneWiS equips sUAS developers with a powerful tool to test,\ndebug, and improve the reliability and safety of sUAS in real-world. A working\ndemonstration is available at https://youtu.be/khBHEBST8Wc\n","authors":["Bohan Zhang","Ankit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2408.16559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18600v2","updated":"2024-09-25T14:20:39Z","published":"2024-03-27T14:22:40Z","title":"RAP: Retrieval-Augmented Planner for Adaptive Procedure Planning in\n Instructional Videos","summary":" Procedure Planning in instructional videos entails generating a sequence of\naction steps based on visual observations of the initial and target states.\nDespite the rapid progress in this task, there remain several critical\nchallenges to be solved: (1) Adaptive procedures: Prior works hold an\nunrealistic assumption that the number of action steps is known and fixed,\nleading to non-generalizable models in real-world scenarios where the sequence\nlength varies. (2) Temporal relation: Understanding the step temporal relation\nknowledge is essential in producing reasonable and executable plans. (3)\nAnnotation cost: Annotating instructional videos with step-level labels (i.e.,\ntimestamp) or sequence-level labels (i.e., action category) is demanding and\nlabor-intensive, limiting its generalizability to large-scale datasets. In this\nwork, we propose a new and practical setting, called adaptive procedure\nplanning in instructional videos, where the procedure length is not fixed or\npre-determined. To address these challenges, we introduce Retrieval-Augmented\nPlanner (RAP) model. Specifically, for adaptive procedures, RAP adaptively\ndetermines the conclusion of actions using an auto-regressive model\narchitecture. For temporal relation, RAP establishes an external memory module\nto explicitly retrieve the most relevant state-action pairs from the training\nvideos and revises the generated procedures. To tackle high annotation cost,\nRAP utilizes a weakly-supervised learning manner to expand the training dataset\nto other task-relevant, unannotated videos by generating pseudo labels for\naction steps. Experiments on CrossTask and COIN benchmarks show the superiority\nof RAP over traditional fixed-length models, establishing it as a strong\nbaseline solution for adaptive procedure planning.\n","authors":["Ali Zare","Yulei Niu","Hammad Ayyubi","Shih-fu Chang"],"pdf_url":"https://arxiv.org/pdf/2403.18600v2.pdf","comment":"Accepted in ECCV 2024"},{"id":"http://arxiv.org/abs/2409.16957v1","updated":"2024-09-25T14:13:00Z","published":"2024-09-25T14:13:00Z","title":"DualLQR: Efficient Grasping of Oscillating Apples using Task\n Parameterized Learning from Demonstration","summary":" Learning from Demonstration offers great potential for robots to learn to\nperform agricultural tasks, specifically selective harvesting. One of the\nchallenges is that the target fruit can be oscillating while approaching.\nGrasping oscillating targets has two requirements: 1) close tracking of the\ntarget during the final approach for damage-free grasping, and 2) the complete\npath should be as short as possible for improved efficiency. We propose a new\nmethod called DualLQR. In this method, we use a finite horizon Linear Quadratic\nRegulator (LQR) on a moving target, without the need of refitting the LQR. To\nmake this possible, we use a dual LQR setup, with an LQR running in two\nseperate reference frames. Through extensive simulation testing, it was found\nthat the state-of-art method barely meets the required final accuracy without\noscillations and drops below the required accuracy with an oscillating target.\nDualLQR was found to be able to meet the required final accuracy even with high\noscillations, with an accuracy increase of 60% for high orientation\noscillations. Further testing on a real-world apple grasping task showed that\nDualLQR was able to successfully grasp oscillating apples, with a success rate\nof 99%.\n","authors":["Robert van de Ven","Ard Nieuwenhuizen","Eldert J. van Henten","Gert Kootstra"],"pdf_url":"https://arxiv.org/pdf/2409.16957v1.pdf","comment":"Submitted to ICRA2025"},{"id":"http://arxiv.org/abs/2409.16950v1","updated":"2024-09-25T14:03:58Z","published":"2024-09-25T14:03:58Z","title":"Dynamic Obstacle Avoidance through Uncertainty-Based Adaptive Planning\n with Diffusion","summary":" By framing reinforcement learning as a sequence modeling problem, recent work\nhas enabled the use of generative models, such as diffusion models, for\nplanning. While these models are effective in predicting long-horizon state\ntrajectories in deterministic environments, they face challenges in dynamic\nsettings with moving obstacles. Effective collision avoidance demands\ncontinuous monitoring and adaptive decision-making. While replanning at every\ntimestep could ensure safety, it introduces substantial computational overhead\ndue to the repetitive prediction of overlapping state sequences -- a process\nthat is particularly costly with diffusion models, known for their intensive\niterative sampling procedure. We propose an adaptive generative planning\napproach that dynamically adjusts replanning frequency based on the uncertainty\nof action predictions. Our method minimizes the need for frequent,\ncomputationally expensive, and redundant replanning while maintaining robust\ncollision avoidance performance. In experiments, we obtain a 13.5% increase in\nthe mean trajectory length and a 12.7% increase in mean reward over\nlong-horizon planning, indicating a reduction in collision rates and an\nimproved ability to navigate the environment safely.\n","authors":["Vineet Punyamoorty","Pascal Jutras-Dubé","Ruqi Zhang","Vaneet Aggarwal","Damon Conover","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2409.16950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16944v1","updated":"2024-09-25T13:56:08Z","published":"2024-09-25T13:56:08Z","title":"Go-SLAM: Grounded Object Segmentation and Localization with Gaussian\n Splatting SLAM","summary":" We introduce Go-SLAM, a novel framework that utilizes 3D Gaussian Splatting\nSLAM to reconstruct dynamic environments while embedding object-level\ninformation within the scene representations. This framework employs advanced\nobject segmentation techniques, assigning a unique identifier to each Gaussian\nsplat that corresponds to the object it represents. Consequently, our system\nfacilitates open-vocabulary querying, allowing users to locate objects using\nnatural language descriptions. Furthermore, the framework features an optimal\npath generation module that calculates efficient navigation paths for robots\ntoward queried objects, considering obstacles and environmental uncertainties.\nComprehensive evaluations in various scene settings demonstrate the\neffectiveness of our approach in delivering high-fidelity scene\nreconstructions, precise object segmentation, flexible object querying, and\nefficient robot path planning. This work represents an additional step forward\nin bridging the gap between 3D scene reconstruction, semantic object\nunderstanding, and real-time environment interactions.\n","authors":["Phu Pham","Dipam Patel","Damon Conover","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2409.16944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16942v1","updated":"2024-09-25T13:54:28Z","published":"2024-09-25T13:54:28Z","title":"Performance assessment of ADAS in a representative subset of critical\n traffic situations","summary":" As a variety of automated collision prevention systems gain presence within\npersonal vehicles, rating and differentiating the automated safety performance\nof car models has become increasingly important for consumers, manufacturers,\nand insurers. In 2023, Swiss Re and partners initiated an eight-month long\nvehicle testing campaign conducted on a recognized UNECE type approval\nauthority and Euro NCAP accredited proving ground in Germany. The campaign\nexposed twelve mass-produced vehicle models and one prototype vehicle fitted\nwith collision prevention systems to a selection of safety-critical traffic\nscenarios representative of United States and European Union accident\nlandscape. In this paper, we compare and evaluate the relative safety\nperformance of these thirteen collision prevention systems (hardware and\nsoftware stack) as demonstrated by this testing campaign. We first introduce a\nnew scoring system which represents a test system's predicted impact on overall\nreal-world collision frequency and reduction of collision impact energy,\nweighted based on the real-world relevance of the test scenario. Next, we\nintroduce a novel metric that quantifies the realism of the protocol and\nconfirm that our test protocol is a plausible representation of real-world\ndriving. Finally, we find that the prototype system in its pre-release state\noutperforms the mass-produced (post-consumer-release) vehicles in the majority\nof the tested scenarios on the test track.\n","authors":["Luigi Di Lillo","Andrea Triscari","Xilin Zhou","Robert Dyro","Ruolin Li","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2409.16942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15866v2","updated":"2024-09-25T13:47:44Z","published":"2024-09-24T08:40:04Z","title":"Multi-UAV Pursuit-Evasion with Online Planning in Unknown Environments\n by Deep Reinforcement Learning","summary":" Multi-UAV pursuit-evasion, where pursuers aim to capture evaders, poses a key\nchallenge for UAV swarm intelligence. Multi-agent reinforcement learning (MARL)\nhas demonstrated potential in modeling cooperative behaviors, but most RL-based\napproaches remain constrained to simplified simulations with limited dynamics\nor fixed scenarios. Previous attempts to deploy RL policy to real-world\npursuit-evasion are largely restricted to two-dimensional scenarios, such as\nground vehicles or UAVs at fixed altitudes. In this paper, we address multi-UAV\npursuit-evasion by considering UAV dynamics and physical constraints. We\nintroduce an evader prediction-enhanced network to tackle partial observability\nin cooperative strategy learning. Additionally, we propose an adaptive\nenvironment generator within MARL training, enabling higher exploration\nefficiency and better policy generalization across diverse scenarios.\nSimulations show our method significantly outperforms all baselines in\nchallenging scenarios, generalizing to unseen scenarios with a 100% capture\nrate. Finally, we derive a feasible policy via a two-stage reward refinement\nand deploy the policy on real quadrotors in a zero-shot manner. To our\nknowledge, this is the first work to derive and deploy an RL-based policy using\ncollective thrust and body rates control commands for multi-UAV pursuit-evasion\nin unknown environments. The open-source code and videos are available at\nhttps://sites.google.com/view/pursuit-evasion-rl.\n","authors":["Jiayu Chen","Chao Yu","Guosheng Li","Wenhao Tang","Xinyi Yang","Botian Xu","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15866v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16915v1","updated":"2024-09-25T13:20:33Z","published":"2024-09-25T13:20:33Z","title":"Let's Make a Splan: Risk-Aware Trajectory Optimization in a Normalized\n Gaussian Splat","summary":" Neural Radiance Fields and Gaussian Splatting have transformed the field of\ncomputer vision by enabling photo-realistic representation of complex scenes.\nDespite this success, they have seen only limited use in real-world robotics\ntasks such as trajectory optimization. Two key factors have contributed to this\nlimited success. First, it is challenging to reason about collisions in\nradiance models. Second, it is difficult to perform inference of radiance\nmodels fast enough for real-time trajectory synthesis. This paper addresses\nthese challenges by proposing SPLANNING, a risk-aware trajectory optimizer that\noperates in a Gaussian Splatting model. This paper first derives a method for\nrigorously upper-bounding the probability of collision between a robot and a\nradiance field. Second, this paper introduces a normalized reformulation of\nGaussian Splatting that enables the efficient computation of the collision\nbound in a Gaussian Splat. Third, a method is presented to optimize\ntrajectories while avoiding collisions with a scene represented by a Gaussian\nSplat. Experiments demonstrate that SPLANNING outperforms state-of-the-art\nmethods in generating collision-free trajectories in highly cluttered\nenvironments. The proposed system is also tested on a real-world robot\nmanipulator. A project page is available at\nhttps://roahmlab.github.io/splanning.\n","authors":["Jonathan Michaux","Seth Isaacson","Challen Enninful Adu","Adam Li","Rahul Kashyap Swayampakula","Parker Ewen","Sean Rice","Katherine A. Skinner","Ram Vasudevan"],"pdf_url":"https://arxiv.org/pdf/2409.16915v1.pdf","comment":"First two authors contributed equally. Project Page:\n https://roahmlab.github.io/splanning"},{"id":"http://arxiv.org/abs/2305.00126v3","updated":"2024-09-25T13:13:32Z","published":"2023-04-28T23:43:10Z","title":"Event-Free Moving Object Segmentation from Moving Ego Vehicle","summary":" Moving object segmentation (MOS) in dynamic scenes is an important,\nchallenging, but under-explored research topic for autonomous driving,\nespecially for sequences obtained from moving ego vehicles. Most segmentation\nmethods leverage motion cues obtained from optical flow maps. However, since\nthese methods are often based on optical flows that are pre-computed from\nsuccessive RGB frames, this neglects the temporal consideration of events\noccurring within the inter-frame, consequently constraining its ability to\ndiscern objects exhibiting relative staticity but genuinely in motion. To\naddress these limitations, we propose to exploit event cameras for better video\nunderstanding, which provide rich motion cues without relying on optical flow.\nTo foster research in this area, we first introduce a novel large-scale dataset\ncalled DSEC-MOS for moving object segmentation from moving ego vehicles, which\nis the first of its kind. For benchmarking, we select various mainstream\nmethods and rigorously evaluate them on our dataset. Subsequently, we devise\nEmoFormer, a novel network able to exploit the event data. For this purpose, we\nfuse the event temporal prior with spatial semantic maps to distinguish\ngenuinely moving objects from the static background, adding another level of\ndense supervision around our object of interest. Our proposed network relies\nonly on event data for training but does not require event input during\ninference, making it directly comparable to frame-only methods in terms of\nefficiency and more widely usable in many application cases. The exhaustive\ncomparison highlights a significant performance improvement of our method over\nall other methods. The source code and dataset are publicly available at:\nhttps://github.com/ZZY-Zhou/DSEC-MOS.\n","authors":["Zhuyun Zhou","Zongwei Wu","Danda Pani Paudel","Rémi Boutteau","Fan Yang","Luc Van Gool","Radu Timofte","Dominique Ginhac"],"pdf_url":"https://arxiv.org/pdf/2305.00126v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16900v1","updated":"2024-09-25T13:09:23Z","published":"2024-09-25T13:09:23Z","title":"A Roadmap for Embodied and Social Grounding in LLMs","summary":" The fusion of Large Language Models (LLMs) and robotic systems has led to a\ntransformative paradigm in the robotic field, offering unparalleled\ncapabilities not only in the communication domain but also in skills like\nmultimodal input handling, high-level reasoning, and plan generation. The\ngrounding of LLMs knowledge into the empirical world has been considered a\ncrucial pathway to exploit the efficiency of LLMs in robotics. Nevertheless,\nconnecting LLMs' representations to the external world with multimodal\napproaches or with robots' bodies is not enough to let them understand the\nmeaning of the language they are manipulating. Taking inspiration from humans,\nthis work draws attention to three necessary elements for an agent to grasp and\nexperience the world. The roadmap for LLMs grounding is envisaged in an active\nbodily system as the reference point for experiencing the environment, a\ntemporally structured experience for a coherent, self-related interaction with\nthe external world, and social skills to acquire a common-grounded shared\nexperience.\n","authors":["Sara Incao","Carlo Mazzola","Giulia Belgiovine","Alessandra Sciutti"],"pdf_url":"https://arxiv.org/pdf/2409.16900v1.pdf","comment":"Accepted Version of a conference paper presented at Robophilosophy\n Conference 2024"},{"id":"http://arxiv.org/abs/2409.16899v1","updated":"2024-09-25T13:08:43Z","published":"2024-09-25T13:08:43Z","title":"Robotic Backchanneling in Online Conversation Facilitation: A\n Cross-Generational Study","summary":" Japan faces many challenges related to its aging society, including\nincreasing rates of cognitive decline in the population and a shortage of\ncaregivers. Efforts have begun to explore solutions using artificial\nintelligence (AI), especially socially embodied intelligent agents and robots\nthat can communicate with people. Yet, there has been little research on the\ncompatibility of these agents with older adults in various everyday situations.\nTo this end, we conducted a user study to evaluate a robot that functions as a\nfacilitator for a group conversation protocol designed to prevent cognitive\ndecline. We modified the robot to use backchannelling, a natural human way of\nspeaking, to increase receptiveness of the robot and enjoyment of the group\nconversation experience. We conducted a cross-generational study with young\nadults and older adults. Qualitative analyses indicated that younger adults\nperceived the backchannelling version of the robot as kinder, more trustworthy,\nand more acceptable than the non-backchannelling robot. Finally, we found that\nthe robot's backchannelling elicited nonverbal backchanneling in older\nparticipants.\n","authors":["Sota Kobuki","Katie Seaborn","Seiki Tokunaga","Kosuke Fukumori","Shun Hidaka","Kazuhiro Tamura","Koji Inoue","Tatsuya Kawahara","Mihoko Otake-Mastuura"],"pdf_url":"https://arxiv.org/pdf/2409.16899v1.pdf","comment":"Published at Proceedings of the 2023 32nd IEEE International\n Conference on Robot and Human Interactive Communication (RO-MAN 2023)"},{"id":"http://arxiv.org/abs/2409.16882v1","updated":"2024-09-25T12:50:01Z","published":"2024-09-25T12:50:01Z","title":"Revisiting Space Mission Planning: A Reinforcement Learning-Guided\n Approach for Multi-Debris Rendezvous","summary":" This research introduces a novel application of a masked Proximal Policy\nOptimization (PPO) algorithm from the field of deep reinforcement learning\n(RL), for determining the most efficient sequence of space debris visitation,\nutilizing the Lambert solver as per Izzo's adaptation for individual\nrendezvous. The aim is to optimize the sequence in which all the given debris\nshould be visited to get the least total time for rendezvous for the entire\nmission. A neural network (NN) policy is developed, trained on simulated space\nmissions with varying debris fields. After training, the neural network\ncalculates approximately optimal paths using Izzo's adaptation of Lambert\nmaneuvers. Performance is evaluated against standard heuristics in mission\nplanning. The reinforcement learning approach demonstrates a significant\nimprovement in planning efficiency by optimizing the sequence for debris\nrendezvous, reducing the total mission time by an average of approximately\n{10.96\\%} and {13.66\\%} compared to the Genetic and Greedy algorithms,\nrespectively. The model on average identifies the most time-efficient sequence\nfor debris visitation across various simulated scenarios with the fastest\ncomputational speed. This approach signifies a step forward in enhancing\nmission planning strategies for space debris clearance.\n","authors":["Agni Bandyopadhyay","Guenther Waxenegger-Wilfing"],"pdf_url":"https://arxiv.org/pdf/2409.16882v1.pdf","comment":"Accepted for publication at the 2024 International Conference on\n Space Robotics (iSpaRo)"},{"id":"http://arxiv.org/abs/2409.16879v1","updated":"2024-09-25T12:44:13Z","published":"2024-09-25T12:44:13Z","title":"GRACE: Generating Socially Appropriate Robot Actions Leveraging LLMs and\n Human Explanations","summary":" When operating in human environments, robots need to handle complex tasks\nwhile both adhering to social norms and accommodating individual preferences.\nFor instance, based on common sense knowledge, a household robot can predict\nthat it should avoid vacuuming during a social gathering, but it may still be\nuncertain whether it should vacuum before or after having guests. In such\ncases, integrating common-sense knowledge with human preferences, often\nconveyed through human explanations, is fundamental yet a challenge for\nexisting systems. In this paper, we introduce GRACE, a novel approach\naddressing this while generating socially appropriate robot actions. GRACE\nleverages common sense knowledge from Large Language Models (LLMs), and it\nintegrates this knowledge with human explanations through a generative network\narchitecture. The bidirectional structure of GRACE enables robots to refine and\nenhance LLM predictions by utilizing human explanations and makes robots\ncapable of generating such explanations for human-specified actions. Our\nexperimental evaluations show that integrating human explanations boosts\nGRACE's performance, where it outperforms several baselines and provides\nsensible explanations.\n","authors":["Fethiye Irmak Dogan","Umut Ozyurt","Gizem Cinar","Hatice Gunes"],"pdf_url":"https://arxiv.org/pdf/2409.16879v1.pdf","comment":"Under review for 2025 IEEE International Conference on Robotics &\n Automation (ICRA), Supplementary video: https://youtu.be/3gP3euwNBjQ"},{"id":"http://arxiv.org/abs/2409.02636v2","updated":"2024-09-25T12:25:21Z","published":"2024-09-04T11:59:53Z","title":"Mamba as a motion encoder for robotic imitation learning","summary":" Recent advancements in imitation learning, particularly with the integration\nof LLM techniques, are set to significantly improve robots' dexterity and\nadaptability. This paper proposes using Mamba, a state-of-the-art architecture\nwith potential applications in LLMs, for robotic imitation learning,\nhighlighting its ability to function as an encoder that effectively captures\ncontextual information. By reducing the dimensionality of the state space,\nMamba operates similarly to an autoencoder. It effectively compresses the\nsequential information into state variables while preserving the essential\ntemporal dynamics necessary for accurate motion prediction. Experimental\nresults in tasks such as cup placing and case loading demonstrate that despite\nexhibiting higher estimation errors, Mamba achieves superior success rates\ncompared to Transformers in practical task execution. This performance is\nattributed to Mamba's structure, which encompasses the state space model.\nAdditionally, the study investigates Mamba's capacity to serve as a real-time\nmotion generator with a limited amount of training data.\n","authors":["Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2409.02636v2.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.16862v1","updated":"2024-09-25T12:20:45Z","published":"2024-09-25T12:20:45Z","title":"Behavior evolution-inspired approach to walking gait reinforcement\n training for quadruped robots","summary":" Reinforcement learning method is extremely competitive in gait generation\ntechniques for quadrupedal robot, which is mainly due to the fact that\nstochastic exploration in reinforcement training is beneficial to achieve an\nautonomous gait. Nevertheless, although incremental reinforcement learning is\nemployed to improve training success and movement smoothness by relying on the\ncontinuity inherent during limb movements, challenges remain in adapting gait\npolicy to diverse terrain and external disturbance. Inspired by the association\nbetween reinforcement learning and the evolution of animal motion behavior, a\nself-improvement mechanism for reference gait is introduced in this paper to\nenable incremental learning of action and self-improvement of reference action\ntogether to imitate the evolution of animal motion behavior. Further, a new\nframework for reinforcement training of quadruped gait is proposed. In this\nframework, genetic algorithm is specifically adopted to perform global\nprobabilistic search for the initial value of the arbitrary foot trajectory to\nupdate the reference trajectory with better fitness. Subsequently, the improved\nreference gait is used for incremental reinforcement learning of gait. The\nabove process is repeatedly and alternatively executed to finally train the\ngait policy. The analysis considering terrain, model dimensions, and locomotion\ncondition is presented in detail based on simulation, and the results show that\nthe framework is significantly more adaptive to terrain compared to regular\nincremental reinforcement learning.\n","authors":["Yu Wang","Wenchuan Jia","Yi Sun","Dong He"],"pdf_url":"https://arxiv.org/pdf/2409.16862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16851v1","updated":"2024-09-25T11:57:02Z","published":"2024-09-25T11:57:02Z","title":"Communication Backbone Reconfiguration with Connectivity Maintenance","summary":" The exchange of information is key in applications that involve multiple\nagents, such as search and rescue, military operations, and disaster response.\nIn this work, we propose a simple and effective trajectory planning framework\nthat tackles the design, deployment, and reconfiguration of a communication\nbackbone by reframing the problem of networked multi-agent motion planning as a\nmanipulator motion planning problem. Our approach works for backbones of\nvariable configurations both in terms of the number of robots utilized and the\ndistance limit between each robot. While research has been conducted on\nconnection-restricted navigation for multi-robot systems in the last years, the\nfield of manipulators is arguably more developed both in theory and practice.\nHence, our methodology facilitates practical applications built on top of\nwidely available motion planning algorithms and frameworks for manipulators.\n","authors":["Leonardo Santos","Caio C. G. Ribeiro","Douglas G. Macharet"],"pdf_url":"https://arxiv.org/pdf/2409.16851v1.pdf","comment":"Submitted to IEEE Latin America Transactions"},{"id":"http://arxiv.org/abs/2409.16847v1","updated":"2024-09-25T11:54:24Z","published":"2024-09-25T11:54:24Z","title":"CREVE: An Acceleration-based Constraint Approach for Robust Radar\n Ego-Velocity Estimation","summary":" Ego-velocity estimation from point cloud measurements of a millimeter-wave\nfrequency-modulated continuous wave (mmWave FMCW) radar has become a crucial\ncomponent of radar-inertial odometry (RIO) systems. Conventional approaches\noften perform poorly when the number of point cloud outliers exceeds that of\ninliers. In this paper, we propose CREVE, an acceleration-based inequality\nconstraints filter that leverages additional measurements from an inertial\nmeasurement unit (IMU) to achieve robust ego-velocity estimations. To further\nenhance accuracy and robustness against sensor errors, we introduce a practical\naccelerometer bias estimation method and a parameter adaptation rule. The\neffectiveness of the proposed method is evaluated using five open-source drone\ndatasets. Experimental results demonstrate that our algorithm significantly\noutperforms three existing state-of-the-art methods, achieving reductions in\nabsolute trajectory error of approximately 53%, 84%, and 35% compared to them.\n","authors":["Hoang Viet Do","Bo Sung Ko","Jin Woo Song"],"pdf_url":"https://arxiv.org/pdf/2409.16847v1.pdf","comment":"7 pages, conference"},{"id":"http://arxiv.org/abs/2308.04977v2","updated":"2024-09-25T11:37:07Z","published":"2023-08-09T14:22:10Z","title":"An explicit construction of Kaleidocycles by elliptic theta functions","summary":" We consider the configuration space of points on the two-dimensional sphere\nthat satisfy a specific system of quadratic equations. We construct periodic\norbits in this configuration space using elliptic theta functions and show that\nthey satisfy semi-discrete analogues of mKdV and sine-Gordon equations. The\nconfiguration space we investigate corresponds to the state space of a linkage\nmechanism known as the Kaleidocycle, and the constructed orbits describe the\ncharacteristic motion of the Kaleidocycle. Our approach is founded on the\nrelationship between the deformation of spatial curves and integrable systems,\noffering an intriguing example where an integrable system generates an orbit in\nthe space of real solutions to polynomial equations defined by geometric\nconstraints.\n","authors":["Shizuo Kaji","Kenji Kajiwara","Shota Shigetomi"],"pdf_url":"https://arxiv.org/pdf/2308.04977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16834v1","updated":"2024-09-25T11:33:51Z","published":"2024-09-25T11:33:51Z","title":"Conditional Generative Denoiser for Nighttime UAV Tracking","summary":" State-of-the-art (SOTA) visual object tracking methods have significantly\nenhanced the autonomy of unmanned aerial vehicles (UAVs). However, in low-light\nconditions, the presence of irregular real noise from the environments severely\ndegrades the performance of these SOTA methods. Moreover, existing SOTA\ndenoising techniques often fail to meet the real-time processing requirements\nwhen deployed as plug-and-play denoisers for UAV tracking. To address this\nchallenge, this work proposes a novel conditional generative denoiser\n(CGDenoiser), which breaks free from the limitations of traditional\ndeterministic paradigms and generates the noise conditioning on the input,\nsubsequently removing it. To better align the input dimensions and accelerate\ninference, a novel nested residual Transformer conditionalizer is developed.\nFurthermore, an innovative multi-kernel conditional refiner is designed to\npertinently refine the denoised output. Extensive experiments show that\nCGDenoiser promotes the tracking precision of the SOTA tracker by 18.18\\% on\nDarkTrack2021 whereas working 5.8 times faster than the second well-performed\ndenoiser. Real-world tests with complex challenges also prove the effectiveness\nand practicality of CGDenoiser. Code, video demo and supplementary proof for\nCGDenoier are now available at:\n\\url{https://github.com/vision4robotics/CGDenoiser}.\n","authors":["Yucheng Wang","Changhong Fu","Kunhan Lu","Liangliang Yao","Haobo Zuo"],"pdf_url":"https://arxiv.org/pdf/2409.16834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16830v1","updated":"2024-09-25T11:30:59Z","published":"2024-09-25T11:30:59Z","title":"OffRIPP: Offline RL-based Informative Path Planning","summary":" Informative path planning (IPP) is a crucial task in robotics, where agents\nmust design paths to gather valuable information about a target environment\nwhile adhering to resource constraints. Reinforcement learning (RL) has been\nshown to be effective for IPP, however, it requires environment interactions,\nwhich are risky and expensive in practice. To address this problem, we propose\nan offline RL-based IPP framework that optimizes information gain without\nrequiring real-time interaction during training, offering safety and\ncost-efficiency by avoiding interaction, as well as superior performance and\nfast computation during execution -- key advantages of RL. Our framework\nleverages batch-constrained reinforcement learning to mitigate extrapolation\nerrors, enabling the agent to learn from pre-collected datasets generated by\narbitrary algorithms. We validate the framework through extensive simulations\nand real-world experiments. The numerical results show that our framework\noutperforms the baselines, demonstrating the effectiveness of the proposed\napproach.\n","authors":["Srikar Babu Gadipudi","Srujan Deolasee","Siva Kailas","Wenhao Luo","Katia Sycara","Woojun Kim"],"pdf_url":"https://arxiv.org/pdf/2409.16830v1.pdf","comment":"7 pages, 6 figures, submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2409.16828v1","updated":"2024-09-25T11:29:26Z","published":"2024-09-25T11:29:26Z","title":"On the role of Artificial Intelligence methods in modern\n force-controlled manufacturing robotic tasks","summary":" This position paper explores the integration of Artificial Intelligence (AI)\ninto force-controlled robotic tasks within the scope of advanced manufacturing,\na cornerstone of Industry 4.0. AI's role in enhancing robotic manipulators -\nkey drivers in the Fourth Industrial Revolution - is rapidly leading to\nsignificant innovations in smart manufacturing. The objective of this article\nis to frame these innovations in practical force-controlled applications - e.g.\ndeburring, polishing, and assembly tasks like peg-in-hole (PiH) - highlighting\ntheir necessity for maintaining high-quality production standards. By reporting\non recent AI-based methodologies, this article contrasts them and identifies\ncurrent challenges to be addressed in future research. The analysis concludes\nwith a perspective on future research directions, emphasizing the need for\ncommon performance metrics to validate AI techniques, integration of various\nenhancements for performance optimization, and the importance of validating\nthem in relevant scenarios. These future directions aim to provide consistency\nwith already adopted approaches, so as to be compatible with manufacturing\nstandards, increasing the relevance of AI-driven methods in both academic and\nindustrial contexts.\n","authors":["Vincenzo Petrone","Enrico Ferrentino","Pasquale Chiacchio"],"pdf_url":"https://arxiv.org/pdf/2409.16828v1.pdf","comment":"To be published in Proceedings of the 20th International Conference\n on Informatics in Control, Automation and Robotics (ICINCO)"},{"id":"http://arxiv.org/abs/2409.16810v1","updated":"2024-09-25T11:02:30Z","published":"2024-09-25T11:02:30Z","title":"Inline Photometrically Calibrated Hybrid Visual SLAM","summary":" This paper presents an integrated approach to Visual SLAM, merging online\nsequential photometric calibration within a Hybrid direct-indirect visual SLAM\n(H-SLAM). Photometric calibration helps normalize pixel intensity values under\ndifferent lighting conditions, and thereby improves the direct component of our\nH-SLAM. A tangential benefit also results to the indirect component of H-SLAM\ngiven that the detected features are more stable across variable lighting\nconditions. Our proposed photometrically calibrated H-SLAM is tested on several\ndatasets, including the TUM monoVO as well as on a dataset we created.\nCalibrated H-SLAM outperforms other state of the art direct, indirect, and\nhybrid Visual SLAM systems in all the experiments. Furthermore, in online SLAM\ntested at our site, it also significantly outperformed the other SLAM Systems.\n","authors":["Nicolas Abboud","Malak Sayour","Imad H. Elhajj","John Zelek","Daniel Asmar"],"pdf_url":"https://arxiv.org/pdf/2409.16810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16802v1","updated":"2024-09-25T10:37:16Z","published":"2024-09-25T10:37:16Z","title":"Do We Need iPhone Moment or Xiaomi Moment for Robots? Design of\n Affordable Home Robots for Health Monitoring","summary":" In this paper, we study cost-effective home robot solutions which are\ndesigned for home health monitoring. The recent advancements in Artificial\nIntelligence (AI) have significantly advanced the capabilities of the robots,\nenabling them to better and efficiently understand and interact with their\nsurroundings. The most common robots currently used in homes are toy robots and\ncleaning robots. While these are relatively affordable, their functionalities\nare very limited. On the other hand, humanoid and quadruped robots offer more\nsophisticated features and capabilities, albeit at a much higher cost. Another\ncategory is educational robots, which provide educators with the flexibility to\nattach various sensors and integrate different design methods with the\nintegrated operating systems. However, the challenge still exists in bridging\nthe gap between affordability and functionality. Our research aims to address\nthis by exploring the potential of developing advanced yet affordable and\naccessible robots for home robots, aiming for health monitoring, by using edge\ncomputing techniques and taking advantage of existing computing resources for\nhome robots, such as mobile phones.\n","authors":["Bo Wei","Yaya Bian","Mingcen Gao"],"pdf_url":"https://arxiv.org/pdf/2409.16802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16800v1","updated":"2024-09-25T10:34:37Z","published":"2024-09-25T10:34:37Z","title":"Programming of Skill-based Robots","summary":" Manufacturing is facing ever changing market demands, with faster innovation\ncycles resulting to growing agility and flexibility requirements. Industry 4.0\nhas been transforming the manufacturing world towards digital automation and\nthe importance of software has increased drastically. Easy and fast task\nprogramming and execution in robot - sensor systems become a prerequisite for\nagile and flexible automation and in this paper, we propose such a system. Our\nsolution relies on a robot skill library, which provides the user with high\nlevel and parametrized operations, i.e., robot skills, for task programming and\nexecution. Programming actions results to a control recipe in a neutral product\ncontext and is based on use of product CAD models or alternatively\ncollaborative use of pointers and tracking sensor with real parts. Practical\ntests are also reported to show the feasibility of our approach.\n","authors":["Taneli Lohi","Samuli Soutukorva","Tapio Heikkilä"],"pdf_url":"https://arxiv.org/pdf/2409.16800v1.pdf","comment":"IEEE ICIEA 2024"},{"id":"http://arxiv.org/abs/2409.16784v1","updated":"2024-09-25T09:47:31Z","published":"2024-09-25T09:47:31Z","title":"World Model-based Perception for Visual Legged Locomotion","summary":" Legged locomotion over various terrains is challenging and requires precise\nperception of the robot and its surroundings from both proprioception and\nvision. However, learning directly from high-dimensional visual input is often\ndata-inefficient and intricate. To address this issue, traditional methods\nattempt to learn a teacher policy with access to privileged information first\nand then learn a student policy to imitate the teacher's behavior with visual\ninput. Despite some progress, this imitation framework prevents the student\npolicy from achieving optimal performance due to the information gap between\ninputs. Furthermore, the learning process is unnatural since animals\nintuitively learn to traverse different terrains based on their understanding\nof the world without privileged knowledge. Inspired by this natural ability, we\npropose a simple yet effective method, World Model-based Perception (WMP),\nwhich builds a world model of the environment and learns a policy based on the\nworld model. We illustrate that though completely trained in simulation, the\nworld model can make accurate predictions of real-world trajectories, thus\nproviding informative signals for the policy controller. Extensive simulated\nand real-world experiments demonstrate that WMP outperforms state-of-the-art\nbaselines in traversability and robustness. Videos and Code are available at:\nhttps://wmp-loco.github.io/.\n","authors":["Hang Lai","Jiahang Cao","Jiafeng Xu","Hongtao Wu","Yunfeng Lin","Tao Kong","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16784v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.16154v2","updated":"2024-09-25T09:00:27Z","published":"2024-09-24T14:58:27Z","title":"Efficient Motion Prediction: A Lightweight & Accurate Trajectory\n Prediction Model With Fast Training and Inference Speed","summary":" For efficient and safe autonomous driving, it is essential that autonomous\nvehicles can predict the motion of other traffic agents. While highly accurate,\ncurrent motion prediction models often impose significant challenges in terms\nof training resource requirements and deployment on embedded hardware. We\npropose a new efficient motion prediction model, which achieves highly\ncompetitive benchmark results while training only a few hours on a single GPU.\nDue to our lightweight architectural choices and the focus on reducing the\nrequired training resources, our model can easily be applied to custom\ndatasets. Furthermore, its low inference latency makes it particularly suitable\nfor deployment in autonomous applications with limited computing resources.\n","authors":["Alexander Prutsch","Horst Bischof","Horst Possegger"],"pdf_url":"https://arxiv.org/pdf/2409.16154v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2409.16720v1","updated":"2024-09-25T08:09:52Z","published":"2024-09-25T08:09:52Z","title":"Dashing for the Golden Snitch: Multi-Drone Time-Optimal Motion Planning\n with Multi-Agent Reinforcement Learning","summary":" Recent innovations in autonomous drones have facilitated time-optimal flight\nin single-drone configurations and enhanced maneuverability in multi-drone\nsystems through the application of optimal control and learning-based methods.\nHowever, few studies have achieved time-optimal motion planning for multi-drone\nsystems, particularly during highly agile maneuvers or in dynamic scenarios.\nThis paper presents a decentralized policy network for time-optimal multi-drone\nflight using multi-agent reinforcement learning. To strike a balance between\nflight efficiency and collision avoidance, we introduce a soft collision\npenalty inspired by optimization-based methods. By customizing PPO in a\ncentralized training, decentralized execution (CTDE) fashion, we unlock higher\nefficiency and stability in training, while ensuring lightweight\nimplementation. Extensive simulations show that, despite slight performance\ntrade-offs compared to single-drone systems, our multi-drone approach maintains\nnear-time-optimal performance with low collision rates. Real-world experiments\nvalidate our method, with two quadrotors using the same network as simulation\nachieving a maximum speed of 13.65 m/s and a maximum body rate of 13.4 rad/s in\na 5.5 m * 5.5 m * 2.0 m space across various tracks, relying entirely on\nonboard computation.\n","authors":["Xian Wang","Jin Zhou","Yuanli Feng","Jiahao Mei","Jiming Chen","Shuo Li"],"pdf_url":"https://arxiv.org/pdf/2409.16720v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.16718v1","updated":"2024-09-25T08:07:18Z","published":"2024-09-25T08:07:18Z","title":"Vision-Language Model Fine-Tuning via Simple Parameter-Efficient\n Modification","summary":" Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed\nthe success of prompt tuning and adapter tuning, while the classic model\nfine-tuning on inherent parameters seems to be overlooked. It is believed that\nfine-tuning the parameters of VLMs with few-shot samples corrupts the\npre-trained knowledge since fine-tuning the CLIP model even degrades\nperformance. In this paper, we revisit this viewpoint, and propose a new\nperspective: fine-tuning the specific parameters instead of all will uncover\nthe power of classic model fine-tuning on VLMs. Through our meticulous study,\nwe propose ClipFit, a simple yet effective method to fine-tune CLIP without\nintroducing any overhead of extra parameters. We demonstrate that by only\nfine-tuning the specific bias terms and normalization layers, ClipFit can\nimprove the performance of zero-shot CLIP by 7.27\\% average harmonic mean\naccuracy. Lastly, to understand how fine-tuning in CLIPFit affects the\npre-trained models, we conducted extensive experimental analyses w.r.t. changes\nin internal parameters and representations. We found that low-level text bias\nlayers and the first layer normalization layer change much more than other\nlayers. The code is available at \\url{https://github.com/minglllli/CLIPFit}.\n","authors":["Ming Li","Jike Zhong","Chenxin Li","Liuzhuozheng Li","Nie Lin","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2409.16718v1.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2409.16680v1","updated":"2024-09-25T07:11:00Z","published":"2024-09-25T07:11:00Z","title":"Online 6DoF Pose Estimation in Forests using Cross-View Factor Graph\n Optimisation and Deep Learned Re-localisation","summary":" This paper presents a novel approach for robust global localisation and 6DoF\npose estimation of ground robots in forest environments by leveraging\ncross-view factor graph optimisation and deep-learned re-localisation. The\nproposed method addresses the challenges of aligning aerial and ground data for\npose estimation, which is crucial for accurate point-to-point navigation in\nGPS-denied environments. By integrating information from both perspectives into\na factor graph framework, our approach effectively estimates the robot's global\nposition and orientation. We validate the performance of our method through\nextensive experiments in diverse forest scenarios, demonstrating its\nsuperiority over existing baselines in terms of accuracy and robustness in\nthese challenging environments. Experimental results show that our proposed\nlocalisation system can achieve drift-free localisation with bounded\npositioning errors, ensuring reliable and safe robot navigation under canopies.\n","authors":["Lucas Carvalho de Lima","Ethan Griffiths","Maryam Haghighat","Simon Denman","Clinton Fookes","Paulo Borges","Michael Brünig","Milad Ramezani"],"pdf_url":"https://arxiv.org/pdf/2409.16680v1.pdf","comment":"7 pages, 4 figures, Submitted to ICRA2025"},{"id":"http://arxiv.org/abs/2407.07618v2","updated":"2024-09-25T07:09:11Z","published":"2024-07-10T12:56:44Z","title":"Cosserat Rods for Modeling Tendon-Driven Robotic Catheter Systems","summary":" Tendon-driven robotic catheters are capable of precise execution of minimally\ninvasive cardiac procedures including ablations and imaging. These procedures\nrequire accurate mathematical models of not only the catheter and tendons but\nalso their interactions with surrounding tissue and vasculature in order to\ncontrol the robot path and interaction. This paper presents a mechanical model\nof a tendon-driven robotic catheter system based on Cosserat rods and\nintegrated with a stable, implicit Euler scheme. We implement the Cosserat rod\nas a model for a simple catheter centerline and validate its physical accuracy\nagainst a large deformation analytical model and experimental data. The\ncatheter model is then supplemented by adding a second Cosserat rod to model a\nsingle tendon, using penalty forces to define the constraints of the\ntendon-catheter system. All the model parameters are defined by the catheter\nproperties established by the design. The combined model is validated against\nexperimental data to confirm its physical accuracy. This model represents a new\ncontribution to the field of robotic catheter modeling in which both the\ntendons and catheter are modeled by mechanical Cosserat rods and\nfully-validated against experimental data in the case of the single rod system.\n","authors":["Pierre-Frédéric Villard","Thomas M. Waite","Robert D. Howe"],"pdf_url":"https://arxiv.org/pdf/2407.07618v2.pdf","comment":"24 pages, 23 figures"},{"id":"http://arxiv.org/abs/2308.03257v4","updated":"2024-09-25T07:09:05Z","published":"2023-08-07T02:28:31Z","title":"TempFuser: Learning Agile, Tactical, and Acrobatic Flight Maneuvers\n Using a Long Short-Term Temporal Fusion Transformer","summary":" Dogfighting is a challenging scenario in aerial applications that requires a\ncomprehensive understanding of both strategic maneuvers and the aerodynamics of\nagile aircraft. The aerial agent needs to not only understand tactically\nevolving maneuvers of fighter jets from a long-term perspective but also react\nto rapidly changing aerodynamics of aircraft from a short-term viewpoint. In\nthis paper, we introduce TempFuser, a novel long short-term temporal fusion\ntransformer architecture that can learn agile, tactical, and acrobatic flight\nmaneuvers in complex dogfight problems. Our approach integrates two distinct\ntemporal transition embeddings into a transformer-based network to\ncomprehensively capture both the long-term tactics and short-term agility of\naerial agents. By incorporating these perspectives, our policy network\ngenerates end-to-end flight commands that secure dominant positions over the\nlong term and effectively outmaneuver agile opponents. After training in a\nhigh-fidelity flight simulator, our model successfully learns to execute\nstrategic maneuvers, outperforming baseline policy models against various types\nof opponent aircraft. Notably, our model exhibits human-like acrobatic\nmaneuvers even when facing adversaries with superior specifications, all\nwithout relying on prior knowledge. Moreover, it demonstrates robust pursuit\nperformance in challenging supersonic and low-altitude situations. Demo videos\nare available at https://sites.google.com/view/tempfuser.\n","authors":["Hyunki Seong","David Hyunchul Shim"],"pdf_url":"https://arxiv.org/pdf/2308.03257v4.pdf","comment":"8 pages, 7 figures. Accepted for publication in IEEE Robotics and\n Automation Letters (RA-L). Copyright 2024 IEEE. Personal use is permitted.\n For other uses, permission from IEEE is required"},{"id":"http://arxiv.org/abs/2409.16665v1","updated":"2024-09-25T06:50:31Z","published":"2024-09-25T06:50:31Z","title":"Multirotor Nonlinear Model Predictive Control based on Visual Servoing\n of Evolving Features","summary":" This article presents a Visual Servoing Nonlinear Model Predictive Control\n(NMPC) scheme for autonomously tracking a moving target using multirotor\nUnmanned Aerial Vehicles (UAVs). The scheme is developed for surveillance and\ntracking of contour-based areas with evolving features. NMPC is used to manage\ninput and state constraints, while additional barrier functions are\nincorporated in order to ensure system safety and optimal performance. The\nproposed control scheme is designed based on the extraction and implementation\nof the full dynamic model of the features describing the target and the state\nvariables. Real-time simulations and experiments using a quadrotor UAV equipped\nwith a camera demonstrate the effectiveness of the proposed strategy.\n","authors":["Sotirios N. Aspragkathos","Panagiotis Rousseas","George C. Karras","Kostas J. Kyriakopoulos"],"pdf_url":"https://arxiv.org/pdf/2409.16665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15146v2","updated":"2024-09-25T05:59:08Z","published":"2024-09-23T15:53:41Z","title":"COHERENT: Collaboration of Heterogeneous Multi-Robot System with Large\n Language Models","summary":" Leveraging the powerful reasoning capabilities of large language models\n(LLMs), recent LLM-based robot task planning methods yield promising results.\nHowever, they mainly focus on single or multiple homogeneous robots on simple\ntasks. Practically, complex long-horizon tasks always require collaborations\namong multiple heterogeneous robots especially with more complex action spaces,\nwhich makes these tasks more challenging. To this end, we propose COHERENT, a\nnovel LLM-based task planning framework for collaboration of heterogeneous\nmulti-robot systems including quadrotors, robotic dogs, and robotic arms.\nSpecifically, a Proposal-Execution-Feedback-Adjustment (PEFA) mechanism is\ndesigned to decompose and assign actions for individual robots, where a\ncentralized task assigner makes a task planning proposal to decompose the\ncomplex task into subtasks, and then assigns subtasks to robot executors. Each\nrobot executor selects a feasible action to implement the assigned subtask and\nreports self-reflection feedback to the task assigner for plan adjustment. The\nPEFA loops until the task is completed. Moreover, we create a challenging\nheterogeneous multi-robot task planning benchmark encompassing 100 complex\nlong-horizon tasks. The experimental results show that our work surpasses the\nprevious methods by a large margin in terms of success rate and execution\nefficiency. The experimental videos, code, and benchmark are released at\nhttps://github.com/MrKeee/COHERENT.\n","authors":["Kehui Liu","Zixin Tang","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2409.15146v2.pdf","comment":"7 pages, 5 figures. Submitted to IEEE International Conference on\n Robotics and Automation (ICRA), 2025"},{"id":"http://arxiv.org/abs/2409.16611v1","updated":"2024-09-25T04:24:46Z","published":"2024-09-25T04:24:46Z","title":"Achieving Stable High-Speed Locomotion for Humanoid Robots with Deep\n Reinforcement Learning","summary":" Humanoid robots offer significant versatility for performing a wide range of\ntasks, yet their basic ability to walk and run, especially at high velocities,\nremains a challenge. This letter presents a novel method that combines deep\nreinforcement learning with kinodynamic priors to achieve stable locomotion\ncontrol (KSLC). KSLC promotes coordinated arm movements to counteract\ndestabilizing forces, enhancing overall stability. Compared to the baseline\nmethod, KSLC provides more accurate tracking of commanded velocities and better\ngeneralization in velocity control. In simulation tests, the KSLC-enabled\nhumanoid robot successfully tracked a target velocity of 3.5 m/s with reduced\nfluctuations. Sim-to-sim validation in a high-fidelity environment further\nconfirmed its robust performance, highlighting its potential for real-world\napplications.\n","authors":["Xinming Zhang","Xianghui Wang","Lerong Zhang","Guodong Guo","Xiaoyu Shen","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16611v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2405.06964v2","updated":"2024-09-25T04:21:06Z","published":"2024-05-11T09:18:37Z","title":"ManiFoundation Model for General-Purpose Robotic Manipulation of Contact\n Synthesis with Arbitrary Objects and Robots","summary":" To substantially enhance robot intelligence, there is a pressing need to\ndevelop a large model that enables general-purpose robots to proficiently\nundertake a broad spectrum of manipulation tasks, akin to the versatile\ntask-planning ability exhibited by LLMs. The vast diversity in objects, robots,\nand manipulation tasks presents huge challenges. Our work introduces a\ncomprehensive framework to develop a foundation model for general robotic\nmanipulation that formalizes a manipulation task as contact synthesis.\nSpecifically, our model takes as input object and robot manipulator point\nclouds, object physical attributes, target motions, and manipulation region\nmasks. It outputs contact points on the object and associated contact forces or\npost-contact motions for robots to achieve the desired manipulation task. We\nperform extensive experiments both in the simulation and real-world settings,\nmanipulating articulated rigid objects, rigid objects, and deformable objects\nthat vary in dimensionality, ranging from one-dimensional objects like ropes to\ntwo-dimensional objects like cloth and extending to three-dimensional objects\nsuch as plasticine. Our model achieves average success rates of around 90\\%.\nSupplementary materials and videos are available on our project website at\nhttps://manifoundationmodel.github.io/.\n","authors":["Zhixuan Xu","Chongkai Gao","Zixuan Liu","Gang Yang","Chenrui Tie","Haozhuo Zheng","Haoyu Zhou","Weikun Peng","Debang Wang","Tianrun Hu","Tianyi Chen","Zhouliang Yu","Lin Shao"],"pdf_url":"https://arxiv.org/pdf/2405.06964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17278v2","updated":"2024-09-25T03:59:55Z","published":"2024-05-27T15:40:24Z","title":"EF-Calib: Spatiotemporal Calibration of Event- and Frame-Based Cameras\n Using Continuous-Time Trajectories","summary":" Event camera, a bio-inspired asynchronous triggered camera, offers promising\nprospects for fusion with frame-based cameras owing to its low latency and high\ndynamic range. However, calibrating stereo vision systems that incorporate both\nevent and frame-based cameras remains a significant challenge. In this letter,\nwe present EF-Calib, a spatiotemporal calibration framework for event- and\nframe-based cameras using continuous-time trajectories. A novel calibration\npattern applicable to both camera types and the corresponding event recognition\nalgorithm is proposed. Leveraging the asynchronous nature of events, a\nderivable piece-wise B-spline to represent camera pose continuously is\nintroduced, enabling calibration for intrinsic parameters, extrinsic\nparameters, and time offset, with analytical Jacobians provided. Various\nexperiments are carried out to evaluate the calibration performance of\nEF-Calib, including calibration experiments for intrinsic parameters, extrinsic\nparameters, and time offset. Experimental results show that EF-Calib achieves\nthe most accurate intrinsic parameters compared to current SOTA, the close\naccuracy of the extrinsic parameters compared to the frame-based results, and\naccurate time offset estimation. EF-Calib provides a convenient and accurate\ntoolbox for calibrating the system that fuses events and frames. The code of\nthis paper will also be open-sourced at: https://github.com/wsakobe/EF-Calib.\n","authors":["Shaoan Wang","Zhanhua Xin","Yaoqing Hu","Dongyue Li","Mingzhu Zhu","Junzhi Yu"],"pdf_url":"https://arxiv.org/pdf/2405.17278v2.pdf","comment":"Accepted by IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2409.16595v1","updated":"2024-09-25T03:40:01Z","published":"2024-09-25T03:40:01Z","title":"Robo-Platform: A Robotic System for Recording Sensors and Controlling\n Robots","summary":" Mobile smartphones compactly provide sensors such as cameras, IMUs, GNSS\nmeasurement units, and wireless and wired communication channels required for\nrobotics projects. They are affordable, portable, and programmable, which makes\nthem ideal for testing, data acquisition, controlling mobile robots, and many\nother robotic applications. A robotic system is proposed in this paper,\nconsisting of an Android phone, a microcontroller board attached to the phone\nvia USB, and a remote wireless controller station. In the data acquisition\nmode, the Android device can record a dataset of a diverse configuration of\nmultiple cameras, IMUs, GNSS units, and external USB ADC channels in the rawest\nformat used for, but not limited to, pose estimation and scene reconstruction\napplications. In robot control mode, the Android phone, a microcontroller\nboard, and other peripherals constitute the mobile or stationary robotic\nsystem. This system is controlled using a remote server connected over Wi-Fi or\nBluetooth. Experiments show that although the SLAM and AR applications can\nutilize the acquired data, the proposed system can pave the way for more\nadvanced algorithms for processing these noisy and sporadic measurements.\nMoreover, the characteristics of the communication media are studied, and two\nexample robotic projects, which involve controlling a toy car and a quadcopter,\nare included.\n","authors":["Masoud Dayani Najafabadi"],"pdf_url":"https://arxiv.org/pdf/2409.16595v1.pdf","comment":"Project repository: https://github.com/m-dayani/robo-platform Youtube\n Video: https://youtu.be/BTQ4yLB1bak Dataset:\n https://drive.google.com/drive/folders/1OZqdA1xa-SyJ64qL_TibqhtwhR1fWWrx?usp=sharing"},{"id":"http://arxiv.org/abs/2409.08695v3","updated":"2024-09-25T03:34:45Z","published":"2024-09-13T10:27:27Z","title":"Precision Aquaculture: An Integrated Computer Vision and IoT Approach\n for Optimized Tilapia Feeding","summary":" Traditional fish farming practices often lead to inefficient feeding,\nresulting in environmental issues and reduced productivity. We developed an\ninnovative system combining computer vision and IoT technologies for precise\nTilapia feeding. Our solution uses real-time IoT sensors to monitor water\nquality parameters and computer vision algorithms to analyze fish size and\ncount, determining optimal feed amounts. A mobile app enables remote monitoring\nand control. We utilized YOLOv8 for keypoint detection to measure Tilapia\nweight from length, achieving \\textbf{94\\%} precision on 3,500 annotated\nimages. Pixel-based measurements were converted to centimeters using depth\nestimation for accurate feeding calculations. Our method, with data collection\nmirroring inference conditions, significantly improved results. Preliminary\nestimates suggest this approach could increase production up to 58 times\ncompared to traditional farms. Our models, code, and dataset are\nopen-source~\\footnote{The code, dataset, and models are available upon\nreasonable request.\n","authors":["Rania Hossam","Ahmed Heakl","Walid Gomaa"],"pdf_url":"https://arxiv.org/pdf/2409.08695v3.pdf","comment":"8 pages, 6 figures, 3 tables, 21th International Conference on\n Informatics in Control, Automation, and Robotics"},{"id":"http://arxiv.org/abs/2409.16578v1","updated":"2024-09-25T03:15:17Z","published":"2024-09-25T03:15:17Z","title":"FLaRe: Achieving Masterful and Adaptive Robot Policies with Large-Scale\n Reinforcement Learning Fine-Tuning","summary":" In recent years, the Robotics field has initiated several efforts toward\nbuilding generalist robot policies through large-scale multi-task Behavior\nCloning. However, direct deployments of these policies have led to\nunsatisfactory performance, where the policy struggles with unseen states and\ntasks. How can we break through the performance plateau of these models and\nelevate their capabilities to new heights? In this paper, we propose FLaRe, a\nlarge-scale Reinforcement Learning fine-tuning framework that integrates robust\npre-trained representations, large-scale training, and gradient stabilization\ntechniques. Our method aligns pre-trained policies towards task completion,\nachieving state-of-the-art (SoTA) performance both on previously demonstrated\nand on entirely novel tasks and embodiments. Specifically, on a set of\nlong-horizon mobile manipulation tasks, FLaRe achieves an average success rate\nof 79.5% in unseen environments, with absolute improvements of +23.6% in\nsimulation and +30.7% on real robots over prior SoTA methods. By utilizing only\nsparse rewards, our approach can enable generalizing to new capabilities beyond\nthe pretraining data with minimal human effort. Moreover, we demonstrate rapid\nadaptation to new embodiments and behaviors with less than a day of\nfine-tuning. Videos can be found on the project website at\nhttps://robot-flare.github.io/\n","authors":["Jiaheng Hu","Rose Hendrix","Ali Farhadi","Aniruddha Kembhavi","Roberto Martin-Martin","Peter Stone","Kuo-Hao Zeng","Kiana Ehsan"],"pdf_url":"https://arxiv.org/pdf/2409.16578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16577v1","updated":"2024-09-25T03:15:09Z","published":"2024-09-25T03:15:09Z","title":"Reactive Multi-Robot Navigation in Outdoor Environments Through\n Uncertainty-Aware Active Learning of Human Preference Landscape","summary":" Compared with single robots, Multi-Robot Systems (MRS) can perform missions\nmore efficiently due to the presence of multiple members with diverse\ncapabilities. However, deploying an MRS in wide real-world environments is\nstill challenging due to uncertain and various obstacles (e.g., building\nclusters and trees). With a limited understanding of environmental uncertainty\non performance, an MRS cannot flexibly adjust its behaviors (e.g., teaming,\nload sharing, trajectory planning) to ensure both environment adaptation and\ntask accomplishments. In this work, a novel joint preference landscape learning\nand behavior adjusting framework (PLBA) is designed. PLBA efficiently\nintegrates real-time human guidance to MRS coordination and utilizes Sparse\nVariational Gaussian Processes with Varying Output Noise to quickly assess\nhuman preferences by leveraging spatial correlations between environment\ncharacteristics. An optimization-based behavior-adjusting method then safely\nadapts MRS behaviors to environments. To validate PLBA's effectiveness in MRS\nbehavior adaption, a flood disaster search and rescue task was designed. 20\nhuman users provided 1764 feedback based on human preferences obtained from MRS\nbehaviors related to \"task quality\", \"task progress\", \"robot safety\". The\nprediction accuracy and adaptation speed results show the effectiveness of PLBA\nin preference learning and MRS behavior adaption.\n","authors":["Chao Huang","Wenshuo Zang","Carlo Pinciroli","Zhi Jane Li","Taposh Banerjee","Lili Su","Rui Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16573v1","updated":"2024-09-25T03:03:34Z","published":"2024-09-25T03:03:34Z","title":"Task-driven SLAM Benchmarking","summary":" For assistive robots, one critical use case of SLAM is to support\nlocalization as they navigate through an environment completing tasks. Current\nSLAM benchmarks do not consider task-based deployments where repeatability\n(precision) is more critical than accuracy. To address this gap, we propose a\ntask-driven benchmarking framework for evaluating SLAM methods. The framework\naccounts for SLAM's mapping capabilities, employs precision as a key metric,\nand has low resource requirements to implement. Testing of state-of-the-art\nSLAM methods in both simulated and real-world scenarios provides insights into\nthe performance properties of modern SLAM solutions. In particular, it shows\nthat passive stereo SLAM operates at a level of precision comparable to\nLiDAR-based SLAM in typical indoor environments. The benchmarking approach\noffers a more relevant and accurate assessment of SLAM performance in\ntask-driven applications.\n","authors":["Yanwei Du","Shiyu Feng","Carlton G. Cort","Patricio A. Vela"],"pdf_url":"https://arxiv.org/pdf/2409.16573v1.pdf","comment":"7 pages, 7 figures, 1 table. Submitted to ICRA2025"},{"id":"http://arxiv.org/abs/2409.16566v1","updated":"2024-09-25T02:36:22Z","published":"2024-09-25T02:36:22Z","title":"PANOS: Payload-Aware Navigation in Offroad Scenarios","summary":" Nature has evolved humans to walk on different terrains by developing a\ndetailed understanding of their physical characteristics. Similarly, legged\nrobots need to develop their capability to walk on complex terrains with a\nvariety of task-dependent payloads to achieve their goals. However,\nconventional terrain adaptation methods are susceptible to failure with varying\npayloads. In this work, we introduce PANOS, a weakly supervised approach that\nintegrates proprioception and exteroception from onboard sensing to achieve a\nstable gait while walking by a legged robot over various terrains. Our work\nalso provides evidence of its adaptability over varying payloads. We evaluate\nour method on multiple terrains and payloads using a legged robot. PANOS\nimproves the stability up to 44% without any payload and 53% with 15 lbs\npayload. We also notice a reduction in the vibration cost of 20% with the\npayload for various terrain types when compared to state-of-the-art methods.\n","authors":["Kartikeya Singh","Yash Turkar","Christo Aluckal","Charuvarahan Adhivarahan","Karthik Dantu"],"pdf_url":"https://arxiv.org/pdf/2409.16566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14365v2","updated":"2024-09-25T00:46:59Z","published":"2024-09-22T08:53:50Z","title":"D3RoMa: Disparity Diffusion-based Depth Sensing for Material-Agnostic\n Robotic Manipulation","summary":" Depth sensing is an important problem for 3D vision-based robotics. Yet, a\nreal-world active stereo or ToF depth camera often produces noisy and\nincomplete depth which bottlenecks robot performances. In this work, we propose\nD3RoMa, a learning-based depth estimation framework on stereo image pairs that\npredicts clean and accurate depth in diverse indoor scenes, even in the most\nchallenging scenarios with translucent or specular surfaces where classical\ndepth sensing completely fails. Key to our method is that we unify depth\nestimation and restoration into an image-to-image translation problem by\npredicting the disparity map with a denoising diffusion probabilistic model. At\ninference time, we further incorporated a left-right consistency constraint as\nclassifier guidance to the diffusion process. Our framework combines recently\nadvanced learning-based approaches and geometric constraints from traditional\nstereo vision. For model training, we create a large scene-level synthetic\ndataset with diverse transparent and specular objects to compensate for\nexisting tabletop datasets. The trained model can be directly applied to\nreal-world in-the-wild scenes and achieve state-of-the-art performance in\nmultiple public depth estimation benchmarks. Further experiments in real\nenvironments show that accurate depth prediction significantly improves robotic\nmanipulation in various scenarios.\n","authors":["Songlin Wei","Haoran Geng","Jiayi Chen","Congyue Deng","Wenbo Cui","Chengyang Zhao","Xiaomeng Fang","Leonidas Guibas","He Wang"],"pdf_url":"https://arxiv.org/pdf/2409.14365v2.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2409.17352v1","updated":"2024-09-25T20:58:07Z","published":"2024-09-25T20:58:07Z","title":"On the Interplay of Clustering and Evolution in the Emergence of\n Epidemic Outbreaks","summary":" In an increasingly interconnected world, a key scientific challenge is to\nexamine mechanisms that lead to the widespread propagation of contagions, such\nas misinformation and pathogens, and identify risk factors that can trigger\nlarge-scale outbreaks. Underlying both the spread of disease and misinformation\nepidemics is the evolution of the contagion as it propagates, leading to the\nemergence of different strains, e.g., through genetic mutations in pathogens\nand alterations in the information content. Recent studies have revealed that\nmodels that do not account for heterogeneity in transmission risks associated\nwith different strains of the circulating contagion can lead to inaccurate\npredictions. However, existing results on multi-strain spreading assume that\nthe network has a vanishingly small clustering coefficient, whereas clustering\nis widely known to be a fundamental property of real-world social networks. In\nthis work, we investigate spreading processes that entail evolutionary\nadaptations on random graphs with tunable clustering and arbitrary degree\ndistributions. We derive a mathematical framework to quantify the epidemic\ncharacteristics of a contagion that evolves as it spreads, with the structure\nof the underlying network as given via arbitrary {\\em joint} degree\ndistributions of single-edges and triangles. To the best of our knowledge, our\nwork is the first to jointly analyze the impact of clustering and evolution on\nthe emergence of epidemic outbreaks. We supplement our theoretical finding with\nnumerical simulations and case studies, shedding light on the impact of\nclustering on contagion spread.\n","authors":["Mansi Sood","Hejin Gu","Rashad Eletreby","Swarun Kumar","Chai Wah Wu","Osman Yagan"],"pdf_url":"https://arxiv.org/pdf/2409.17352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13743v4","updated":"2024-09-25T20:24:15Z","published":"2023-09-24T20:27:59Z","title":"Robust Adaptive MPC Using Uncertainty Compensation","summary":" This paper presents an uncertainty compensation-based robust adaptive model\npredictive control (MPC) framework for linear systems with both matched and\nunmatched nonlinear uncertainties subject to both state and input constraints.\nIn particular, the proposed control framework leverages an L1 adaptive\ncontroller (L1AC) to compensate for the matched uncertainties and to provide\nguaranteed uniform bounds on the error between the states and control inputs of\nthe actual system and those of a nominal i.e., uncertainty-free, system. The\nperformance bounds provided by the L1AC are then used to tighten the state and\ncontrol constraints of the actual system, and a model predictive controller is\ndesigned for the nominal system with the tightened constraints. The proposed\ncontrol framework, which we denote as uncertainty compensation-based MPC\n(UC-MPC), guarantees constraint satisfaction and achieves improved performance\ncompared with existing methods. Simulation results on a flight control example\ndemonstrate the benefits of the proposed framework.\n","authors":["Ran Tao","Pan Zhao","Ilya Kolmanovsky","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2309.13743v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2208.02985"},{"id":"http://arxiv.org/abs/2404.17541v2","updated":"2024-09-25T19:54:15Z","published":"2024-04-26T17:13:02Z","title":"Applications of Lifted Nonlinear Cuts to Convex Relaxations of the AC\n Power Flow Equations","summary":" We demonstrate that valid inequalities, or lifted nonlinear cuts (LNC), can\nbe projected to tighten the Second Order Cone (SOC), Convex DistFlow (CDF), and\nNetwork Flow (NF) relaxations of the AC Optimal Power Flow (AC-OPF) problem. We\nconduct experiments on 36 cases from the PGLib-OPF library for two objective\nfunctions, (1) power generation maximization and (2) generation cost\nminimization. Significant optimality gap improvements are shown for the\nmaximization problem, where the LNC strengthen the SOC and CDF relaxations in\n100% of the test cases, with average and maximum differences in the optimality\ngaps of 23.1% and 93.5% respectively. The NF relaxation is strengthened in\n79.2% of test cases, with average and maximum differences in the optimality\ngaps of 3.45% and 21.2% respectively. We also study the trade-off between\nrelaxation quality and solve time, demonstrating that the strengthened CDF\nrelaxation outperforms the strengthened SOC formulation in terms of runtime and\nnumber of iterations needed, while the strengthened NF formulation is the most\nscalable with the lowest relaxation quality provided by these LNC.\n","authors":["Sergio I. Bugosen","Robert B. Parker","Carleton Coffrin"],"pdf_url":"https://arxiv.org/pdf/2404.17541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08718v4","updated":"2024-09-25T19:20:46Z","published":"2024-03-13T17:18:39Z","title":"Probabilistic Metaplasticity for Continual Learning with Memristors","summary":" Edge devices operating in dynamic environments critically need the ability to\ncontinually learn without catastrophic forgetting. The strict resource\nconstraints in these devices pose a major challenge to achieve this, as\ncontinual learning entails memory and computational overhead. Crossbar\narchitectures using memristor devices offer energy efficiency through\ncompute-in-memory and hold promise to address this issue. However, memristors\noften exhibit low precision and high variability in conductance modulation,\nrendering them unsuitable for continual learning solutions that require precise\nmodulation of weight magnitude for consolidation. Current approaches fall short\nto address this challenge directly and rely on auxiliary high-precision memory,\nleading to frequent memory access, high memory overhead, and energy\ndissipation. In this research, we propose probabilistic metaplasticity, which\nconsolidates weights by modulating their update probability rather than\nmagnitude. The proposed mechanism eliminates high-precision modification to\nweight magnitudes and, consequently, the need for auxiliary high-precision\nmemory. We demonstrate the efficacy of the proposed mechanism by integrating\nprobabilistic metaplasticity into a spiking network trained on an error\nthreshold with low-precision memristor weights. Evaluations of continual\nlearning benchmarks show that probabilistic metaplasticity achieves performance\nequivalent to state-of-the-art continual learning models with high-precision\nweights while consuming ~ 67% lower memory for additional parameters and up to\n~ 60x lower energy during parameter updates compared to an auxiliary\nmemory-based solution. The proposed model shows potential for energy-efficient\ncontinual learning with low-precision emerging devices.\n","authors":["Fatima Tuz Zohora","Vedant Karia","Nicholas Soures","Dhireesha Kudithipudi"],"pdf_url":"https://arxiv.org/pdf/2403.08718v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17139v1","updated":"2024-09-25T17:57:04Z","published":"2024-09-25T17:57:04Z","title":"Learning with Dynamics: Autonomous Regulation of UAV Based Communication\n Networks with Dynamic UAV Crew","summary":" Unmanned Aerial Vehicle (UAV) based communication networks (UCNs) are a key\ncomponent in future mobile networking. To handle the dynamic environments in\nUCNs, reinforcement learning (RL) has been a promising solution attributed to\nits strong capability of adaptive decision-making free of the environment\nmodels. However, most existing RL-based research focus on control strategy\ndesign assuming a fixed set of UAVs. Few works have investigated how UCNs\nshould be adaptively regulated when the serving UAVs change dynamically. This\narticle discusses RL-based strategy design for adaptive UCN regulation given a\ndynamic UAV set, addressing both reactive strategies in general UCNs and\nproactive strategies in solar-powered UCNs. An overview of the UCN and the RL\nframework is first provided. Potential research directions with key challenges\nand possible solutions are then elaborated. Some of our recent works are\npresented as case studies to inspire innovative ways to handle dynamic UAV crew\nwith different RL algorithms.\n","authors":["Ran Zhang","Bowei Li","Liyuan Zhang"," Jiang"," Xie","Miao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17139v1.pdf","comment":"7 pages, 6 figures, magazine paper"},{"id":"http://arxiv.org/abs/2409.17132v1","updated":"2024-09-25T17:49:34Z","published":"2024-09-25T17:49:34Z","title":"Complex-Phase, Data-Driven Identification of Grid-Forming Inverter\n Dynamics","summary":" The increasing integration of renewable energy sources (RESs) into power\nsystems requires the deployment of grid-forming inverters to ensure a stable\noperation. Accurate modeling of these devices is necessary. In this paper, a\nsystem identification approach to obtain low-dimensional models of grid-forming\ninverters is presented. The proposed approach is based on a Hammerstein-Wiener\nparametrization of the normal-form model. The normal-form is a gray-box model\nthat utilizes complex frequency and phase to capture non-linear inverter\ndynamics. The model is validated on two well-known control strategies:\ndroop-control and dispatchable virtual oscillators. Simulations and\nhardware-in-the-loop experiments demonstrate that the normal-form accurately\nmodels inverter dynamics across various operating conditions. The approach\nshows great potential for enhancing the modeling of RES-dominated power\nsystems, especially when component models are unavailable or computationally\nexpensive.\n","authors":["Anna Büttner","Hans Würfel","Sebastian Liemann","Johannes Schiffer","Frank Hellmann"],"pdf_url":"https://arxiv.org/pdf/2409.17132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17114v1","updated":"2024-09-25T17:27:52Z","published":"2024-09-25T17:27:52Z","title":"Towards human-like kinematics in industrial robotic arms: a case study\n on a UR3 robot","summary":" Safety in industrial robotic environments is a hot research topic in the area\nof human-robot interaction (HRI). Up to now, a robotic arm on an assembly line\ninteracts with other machines away from human workers. Nowadays, robotic arm\nmanufactures are aimed to their robots could increasingly perform tasks\ncollaborating with humans. One of the ways to improve this collaboration is by\nmaking the movement of robots more humanlike. This way, it would be easier for\na human to foresee the movement of the robot and approach it without fear of\ncontact. The main difference between the movement of a human and of a robotic\narm is that the former has a bell-shaped speed profile while the latter has a\nuniform speed one. To generate this speed profile, the kinematic theory of\nrapid human movements and its Sigma-Lognormal model has been used. This model\nis widely used to explain most of the basic phenomena related to the control of\nhuman movements. Both human-like and robotic-like movements are transferred to\nthe UR3 robot. In this paper we detail the how the UR3 robot was programmed to\nproduce both kinds of movement. The dissimilarities result between the input\nmotion and output motion to the robot confirm the possibility to develop\nhuman-like velocities in the UR3 robot.\n","authors":["Adam Wolniakowski","Kanstantsin Miatliuk","Jose J. Quintana","Miguel A. Ferrer","Moises Diaz"],"pdf_url":"https://arxiv.org/pdf/2409.17114v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17100v1","updated":"2024-09-25T17:12:34Z","published":"2024-09-25T17:12:34Z","title":"Generic Diagonalizability, Structural Functional Observability and\n Output Controllability","summary":" This paper investigates the structural functional observability (SFO) and\nstructural output controllability (SOC) of a class of systems with generically\ndiagonalizable state matrices and explores the associated minimal sensor and\nactuator placement problems. The verification of SOC and the corresponding\nsensor and actuator placement problems, i.e., the problems of determining the\nminimum number of outputs and inputs required to achieve SFO and SOC,\nrespectively, are yet open for general systems, which motivates our focus on a\nclass of systems enabling polynomial-time solutions. In this line, we first\ndefine and characterize generically diagonalizable systems, referring to\nstructured systems for which almost all realizations of the state matrices are\ndiagonalizable. We then develop computationally efficient criteria for SFO and\nSOC within the context of generically diagonalizable systems. Our work expands\nthe class of systems amenable to polynomial-time SOC verification. Thanks to\nthe simplicity of the obtained criteria, we derive closed-form solutions for\ndetermining the minimal sensor placement to achieve SFO and the minimal\nactuator deployment to achieve SOC in such systems, along with efficient\nweighted maximum matching based and weighted maximum flow based algorithms. For\nmore general systems to achieve SFO, an upper bound is given by identifying a\nnon-decreasing property of SFO with respect to a specific class of edge\nadditions, which is shown to be optimal under certain circumstances.\n","authors":["Yuan Zhang","Tyrone Fernando","Mohamed Darouach"],"pdf_url":"https://arxiv.org/pdf/2409.17100v1.pdf","comment":"Under review in a Journal"},{"id":"http://arxiv.org/abs/2409.17082v1","updated":"2024-09-25T16:46:16Z","published":"2024-09-25T16:46:16Z","title":"Energy efficiency analysis as a function of the working voltages in\n supercapacitors","summary":" Supercapacitors are increasingly used as energy storage elements. Unlike\nbatteries, their state of charge has a considerable influence on their voltage\nin normal operation, allowing them to work from zero to their maximum voltage.\nIn this work, a theoretical and practical analysis is proposed of the energy\nefficiency of these devices according to their working voltages. To this end,\nseveral supercapacitors were subjected to charge and discharge cycles until the\nmeasurements of current and voltage stabilized. At this point their energy\nefficiency was calculated. These charge-discharge cycles were carried out: i)\nwithout rest between charging and discharging; and ii) with a rest of several\nminutes between the two stages. Using the information obtained from the tests,\nthe energy efficiency is shown plotted against the minimum and maximum working\nvoltages. By consulting the data and the graphs, the ideal working voltages to\noptimize the energy efficiency of these devices can be obtained.\n","authors":["Jose Quintana","Alejandro Ramos","Moises Diaz","Ignacio Nuez"],"pdf_url":"https://arxiv.org/pdf/2409.17082v1.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2402.00681v2","updated":"2024-09-25T16:26:49Z","published":"2024-02-01T15:44:17Z","title":"Sampling-based Stochastic Data-driven Predictive Control under Data\n Uncertainty","summary":" We present a stochastic constrained output-feedback data-driven predictive\ncontrol scheme for linear time-invariant systems subject to bounded additive\ndisturbances. The approach uses data-driven predictors based on an extension of\nWillems' fundamental lemma and requires only a single persistently exciting\ninput-output data trajectory. Compared to current state-of-the-art approaches,\nwe do not rely on availability of exact disturbance data. Instead, we leverage\na novel parameterization of the unknown disturbance data considering\nconsistency with the measured data and the system class. This allows for\ndeterministic approximation of the chance constraints in a sampling-based\nfashion. A robust constraint on the first predicted step enables recursive\nfeasibility, closed-loop constraint satisfaction, and robust asymptotic\nstability in expectation under standard assumptions. A numerical example\ndemonstrates the efficiency of the proposed control scheme.\n","authors":["Johannes Teutsch","Sebastian Kerz","Dirk Wollherr","Marion Leibold"],"pdf_url":"https://arxiv.org/pdf/2402.00681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17056v1","updated":"2024-09-25T16:14:36Z","published":"2024-09-25T16:14:36Z","title":"A Novel MOSFET based Single Event Latchup Detection, Current Limiting &\n Self Power Cycling circuit for Spacecraft systems","summary":" Single Event Latch-up (SEL) is one of the prime concerns for CMOS ICs used in\nspace systems. Galactic Cosmic Rays or Solar Energetic Particles (SEP) may\ntrigger the parasitic latch up circuit in CMOS ICs and cause increase in\ncurrent beyond the safe limits thereby presenting a threat of permanent failure\nof the IC. Mitigation of the SEL is always a challenging task. The conventional\nmitigation approaches inherently introduce some response time which presents an\nuncertainty because during this response time the current may exceed the safe\ncurrent limits. This paper presents a novel circuit based on MOSFETs which\nprovides end-to-end complete solution of detecting SEL, limiting the current\nbelow the set threshold and executing power cycling to restore the normal\nfunctioning of the CMOS IC. The proposed circuit has been simulated in MULTISIM\nand the simulation results match very well with the expected behavior of\n(i)current limiting and (ii) the total time duration taken in power cycling to\nbring the SEL sensitive device back to its normal operational state. This\ncircuit can be harnessed by spacecraft system designers to overcome the\ncatastrophic threat of SEL posed by space radiation environment.\n","authors":["Ishan Pandey","Kinshuk Gupta","Vinod Kumar","A. R. Khan","Sandhya V. Kamat"],"pdf_url":"https://arxiv.org/pdf/2409.17056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13191v3","updated":"2024-09-25T15:05:51Z","published":"2024-06-19T03:49:21Z","title":"GPU-Accelerated DCOPF using Gradient-Based Optimization","summary":" DC Optimal Power Flow (DCOPF) is a key operational tool for power system\noperators, and it is embedded as a subproblem in many challenging optimization\nproblems (e.g., line switching). However, traditional CPU-based solve routines\n(e.g., simplex) have saturated in speed and are hard to parallelize. This paper\nfocuses on solving DCOPF problems using gradient-based routines on Graphics\nProcessing Units (GPUs), which have massive parallelization capability. To\nformulate these problems, we pose a Lagrange dual associated with DCOPF (linear\nand quadratic cost curves), and then we explicitly solve the inner (primal)\nminimization problem with a dual norm. The resulting dual problem can be\nefficiently iterated using projected gradient ascent. After solving the dual\nproblem on both CPUs and GPUs to find tight lower bounds, we benchmark against\nGurobi and MOSEK, comparing convergence speed and tightness on the IEEE 2000,\n4601, and 10000 bus systems. We provide reliable and tight lower bounds for\nthese problems with, at best, 5.4x speedup over a conventional solver.\n","authors":["Seide Saba Rafiei","Samuel Chevalier"],"pdf_url":"https://arxiv.org/pdf/2406.13191v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01263v2","updated":"2024-09-25T14:26:15Z","published":"2024-01-02T16:09:42Z","title":"Identification of Additive Continuous-time Systems in Open and Closed\n loop","summary":" When identifying electrical, mechanical, or biological systems, parametric\ncontinuous-time identification methods can lead to interpretable and\nparsimonious models when the model structure aligns with the physical\nproperties of the system. Traditional linear system identification may not\nconsider the most parsimonious model when relying solely on unfactored transfer\nfunctions, which typically result from standard direct approaches. This paper\npresents a novel identification method that delivers additive models for both\nopen and closed-loop setups. The estimators that are derived are shown to be\ngenerically consistent, and can admit the identification of marginally stable\nadditive systems. Numerical simulations show the efficacy of the proposed\napproach, and its performance in identifying a modal representation of a\nflexible beam is verified using experimental data.\n","authors":["Rodrigo A. González","Koen Classens","Cristian R. Rojas","James S. Welsh","Tom Oomen"],"pdf_url":"https://arxiv.org/pdf/2401.01263v2.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.16948v1","updated":"2024-09-25T14:01:53Z","published":"2024-09-25T14:01:53Z","title":"The Power-Oriented Graphs Modeling Technique: From the Fundamental\n Principles to the Systematic, Step-by-Step Modeling of Complex Physical\n Systems","summary":" Modeling physical systems is an essential skill for a control engineer, since\nit enables to achieve a deep understanding of their dynamic behavior and,\nconsequently, the development of effective control strategies. The first part\nof this article provides a tutorial description of the fundamental principles\nand properties of the Power-Oriented Graphs (POG) modeling technique. Various\ncase studies in different energetic domains are then presented to consolidate\nthe fundamental principles, each highlighting different features of the POG\nmodeling technique. The latter is then compared with the other two main\ngraphical modeling techniques available in the literature, namely Bond Graph\n(BG) and Energetic Macroscopic Representation (EMR). The second part of this\narticle assumes once again a tutorial nature, in order to introduce the new\nFast Modeling POG (FMPOG) procedure. The FMPOG, which operates in the POG\nframework, is a methodical step-by-step procedure that enables the readers to\nquickly derive the power-oriented graphical model of physical systems starting\nfrom their schematics. From the power-oriented graphical model, the state-space\nmodel can then be directly determined. To ensure the FMPOG procedure is easily\nusable by the entire community, we apply it to three examples in different\nenergetic domains in this article, guiding the reader step-by-step through the\nderivation of the physical systems models.\n","authors":["Davide Tebaldi","Roberto Zanasi"],"pdf_url":"https://arxiv.org/pdf/2409.16948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12299v2","updated":"2024-09-25T13:43:29Z","published":"2023-10-18T20:05:06Z","title":"Instantaneous Frequency Estimation in Unbalanced Systems Using Affine\n Differential Geometry","summary":" The paper discusses the relationships between electrical and affine\ndifferential geometry quantities, establishing a link between frequency and\ntime derivatives of voltage, through the utilization of affine geometric\ninvariants. Based on this link, a new instantaneous frequency estimation\nformula is proposed, which is particularly suited for unbalanced and\nsingle-phase systems. Several examples as well as measurements based on two\nreal-world events illustrate the findings of the paper.\n","authors":["Ali Alshawabkeh","Georgios Tzounas","Angel Molina-Garcia","Federico Milano"],"pdf_url":"https://arxiv.org/pdf/2310.12299v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17381v2","updated":"2024-09-25T13:02:28Z","published":"2023-10-26T13:25:30Z","title":"Proactive Emergency Collision Avoidance for Automated Driving in Highway\n Scenarios","summary":" Uncertainty in the behavior of other traffic participants is a crucial factor\nin collision avoidance for automated driving; here, stochastic metrics could\navoid overly conservative decisions. This paper introduces a Stochastic Model\nPredictive Control (SMPC) planner for emergency collision avoidance in highway\nscenarios to proactively minimize collision risk while ensuring safety through\nchance constraints. To guarantee that the emergency trajectory can be attained,\nwe incorporate nonlinear tire dynamics in the prediction model of the ego\nvehicle. Further, we exploit Max-Min-Plus-Scaling (MMPS) approximations of the\nnonlinearities to avoid conservatism, enforce proactive collision avoidance,\nand improve computational efficiency in terms of performance and speed.\nConsequently, our contributions include integrating a dynamic ego vehicle model\ninto the SMPC planner, introducing the MMPS approximation for real-time\nimplementation in emergency scenarios, and integrating SMPC with hybridized\nchance constraints and risk minimization. We evaluate our SMPC formulation in\nterms of proactivity and efficiency in various hazardous scenarios. Moreover,\nwe demonstrate the effectiveness of our proposed approach by comparing it with\na state-of-the-art SMPC planner and we validate that the generated trajectories\ncan be attained using a high-fidelity vehicle model in IPG CarMaker.\n","authors":["Leila Gharavi","Azita Dabiri","Jelske Verkuijlen","Bart De Schutter","Simone Baldi"],"pdf_url":"https://arxiv.org/pdf/2310.17381v2.pdf","comment":"14 pages, 11 figures, submitted to IEEE Transactions on Control\n Systems Technology"},{"id":"http://arxiv.org/abs/2409.16875v1","updated":"2024-09-25T12:40:07Z","published":"2024-09-25T12:40:07Z","title":"Feedforward Controllers from Learned Dynamic Local Model Networks with\n Application to Excavator Assistance Functions","summary":" Complicated first principles modelling and controller synthesis can be\nprohibitively slow and expensive for high-mix, low-volume products such as\nhydraulic excavators. Instead, in a data-driven approach, recorded trajectories\nfrom the real system can be used to train local model networks (LMNs), for\nwhich feedforward controllers are derived via feedback linearization. However,\nprevious works required LMNs without zero dynamics for feedback linearization,\nwhich restricts the model structure and thus modelling capacity of LMNs. In\nthis paper, we overcome this restriction by providing a criterion for when\nfeedback linearization of LMNs with zero dynamics yields a valid controller. As\na criterion we propose the bounded-input bounded-output stability of the\nresulting controller. In two additional contributions, we extend this approach\nto consider measured disturbance signals and multiple inputs and outputs. We\nillustrate the effectiveness of our contributions in a hydraulic excavator\ncontrol application with hardware experiments. To this end, we train LMNs from\nrecorded, noisy data and derive feedforward controllers used as part of a\nleveling assistance system on the excavator. In our experiments, incorporating\ndisturbance signals and multiple inputs and outputs enhances tracking\nperformance of the learned controller. A video of our experiments is available\nat https://youtu.be/lrrWBx2ASaE.\n","authors":["Leon Greiser","Ozan Demir","Benjamin Hartmann","Henrik Hose","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2409.16875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02636v2","updated":"2024-09-25T12:25:21Z","published":"2024-09-04T11:59:53Z","title":"Mamba as a motion encoder for robotic imitation learning","summary":" Recent advancements in imitation learning, particularly with the integration\nof LLM techniques, are set to significantly improve robots' dexterity and\nadaptability. This paper proposes using Mamba, a state-of-the-art architecture\nwith potential applications in LLMs, for robotic imitation learning,\nhighlighting its ability to function as an encoder that effectively captures\ncontextual information. By reducing the dimensionality of the state space,\nMamba operates similarly to an autoencoder. It effectively compresses the\nsequential information into state variables while preserving the essential\ntemporal dynamics necessary for accurate motion prediction. Experimental\nresults in tasks such as cup placing and case loading demonstrate that despite\nexhibiting higher estimation errors, Mamba achieves superior success rates\ncompared to Transformers in practical task execution. This performance is\nattributed to Mamba's structure, which encompasses the state space model.\nAdditionally, the study investigates Mamba's capacity to serve as a real-time\nmotion generator with a limited amount of training data.\n","authors":["Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2409.02636v2.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.16825v1","updated":"2024-09-25T11:22:30Z","published":"2024-09-25T11:22:30Z","title":"Measurements and System Identification for the Characterization of\n Smooth Muscle Cell Dynamics","summary":" Biological tissue integrity is actively maintained by cells. It is essential\nto comprehend how cells accomplish this in order to stage tissue diseases.\nHowever, addressing the complexity of a cell's system of interrelated\nmechanisms poses a challenge. This necessitates a well-structured\nidentification framework and an effective integration of measurements. Here we\nintroduce the use of state-of-the-art frequency-domain system identification\ntechniques combined with an indentation measurement platform to analyze the\nunderlying mechanisms from the perspective of control system theory. The\nultimate goal is to explore how mechanical and biological factors are related\nin induced Pluripotent Stem Cell-derived vascular smooth muscle cells. We study\non the frequency-domain analysis for the investigation and characterization of\ncellular dynamics of smooth muscle cells from the measured data. The\nmeasurement model in this study exploits the availability of human tissue and\nsamples, enabling fundamental investigations of vascular tissue disease. This\napproach using human cell lines holds significant potential to decrease the\nnecessity for animal-based safety and efficacy studies. The focus of this\nreview is to investigate the cellular dynamics underlying the myogenic response\nand to demonstrate the practicability of employing a nano-indentation\nmeasurement setup for the broadband frequency-domain characterization of\ninduced Pluripotent Stem Cell-derived vascular smooth muscle cells.\n","authors":["Dilan Ozturk","Pepijn Saraber","Kevin Bielawski","Alessandro Giudici","Leon Schurgers","Koen Reesink","Maarten Schoukens"],"pdf_url":"https://arxiv.org/pdf/2409.16825v1.pdf","comment":"6 pages, 9 figures, presented in the Medical Measurements and\n Applications - MeMeA2024 conference"},{"id":"http://arxiv.org/abs/2409.16811v1","updated":"2024-09-25T11:03:15Z","published":"2024-09-25T11:03:15Z","title":"Performance Boundary Analyses for Statistical Multi-QoS Framework Over\n 6G SAGINs","summary":" To enable the cost-effective universal access and the enhancement of current\ncommunication services, the space-air-ground integrated networks (SAGINs) have\nrecently been developed due to its exceptional 3D coverage and the ability to\nguarantee rigorous and multidimensional demands for quality-of-service (QoS)\nprovisioning, including delay and reliability across vast distances. In\nresponse to the complex, heterogeneous, and dynamic serving scenarios and\nstringent performance expectations for 6G SAGINs, it is crucial to undertake\nmodeling, assurance, and analysis of the key technologies, aligned with the\ndiverse demands for QoS provisioning in the non-asymptotic regime, i.e., when\nimplementing finite blocklength coding (FBC) as a new dimension for error-rate\nbounded QoS metric. However, how to design new statistical QoS-driven\nperformance modeling approaches that accurately delineate the complex and\ndynamic behaviors of networks, particularly in terms of constraining both delay\nand error rate, persists as a significant challenge for implementing mURLLC\nwithin 6G SAGINs in the finite blocklength regime. To overcome these\ndifficulties, in this paper we propose to develop a set of analytical modeling\nframeworks for 6G SAGIN in supporting statistical delay and error-rate bounded\nQoS in the finite blocklength regime. First we establish the SAGIN system\narchitecture model. Second, the aggregate interference and decoding error\nprobability functions are modeled and examined through using Laplace transform.\nThird, we introduce modeling techniques aimed at defining\nthe$\\epsilon$-effective capacity function as a crucial metric for facilitating\nstatistical QoS standards with respect to delay and error-rate. To validate the\neffectiveness of the developed performance modeling schemes, we have executed a\nseries of simulations over SAGINs.\n","authors":["Jingqing Wang","Wenchi Cheng","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16810v1","updated":"2024-09-25T11:02:30Z","published":"2024-09-25T11:02:30Z","title":"Inline Photometrically Calibrated Hybrid Visual SLAM","summary":" This paper presents an integrated approach to Visual SLAM, merging online\nsequential photometric calibration within a Hybrid direct-indirect visual SLAM\n(H-SLAM). Photometric calibration helps normalize pixel intensity values under\ndifferent lighting conditions, and thereby improves the direct component of our\nH-SLAM. A tangential benefit also results to the indirect component of H-SLAM\ngiven that the detected features are more stable across variable lighting\nconditions. Our proposed photometrically calibrated H-SLAM is tested on several\ndatasets, including the TUM monoVO as well as on a dataset we created.\nCalibrated H-SLAM outperforms other state of the art direct, indirect, and\nhybrid Visual SLAM systems in all the experiments. Furthermore, in online SLAM\ntested at our site, it also significantly outperformed the other SLAM Systems.\n","authors":["Nicolas Abboud","Malak Sayour","Imad H. Elhajj","John Zelek","Daniel Asmar"],"pdf_url":"https://arxiv.org/pdf/2409.16810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11828v2","updated":"2024-09-25T10:58:07Z","published":"2024-09-18T09:21:55Z","title":"Model-Free Generic Robust Control for Servo-Driven Actuation Mechanisms\n with Layered Insight into Energy Conversions","summary":" To advance theoretical solutions and address limitations in modeling complex\nservo-driven actuation systems experiencing high non-linearity and load\ndisturbances, this paper aims to design a practical model-free generic robust\ncontrol (GRC) framework for these mechanisms. This framework is intended to be\napplicable across all actuator systems encompassing electrical, hydraulic, or\npneumatic servomechanisms, while also functioning within complex interactions\namong dynamic components and adhering to control input constraints. In this\nrespect, the state-space model of actuator systems is decomposed into smaller\nsubsystems that incorporate the first principle equation of actuator motion\ndynamics and interactive energy conversion equations. This decomposition\noperates under the assumption that the comprehensive model of the servo-driven\nactuator system and energy conversion, uncertainties, load disturbances, and\ntheir bounds are unknown. Then, the GRC employs subsystem-based adaptive\ncontrol strategies for each state-variant subsystem separately. Despite control\ninput constraints and the unknown interactive system model, the GRC-applied\nactuator mechanism ensures uniform exponential stability and robustness in\ntracking desired motions. It features straightforward implementation,\nexperimentally evaluated by applying it to two industrial applications.\n","authors":["Mehdi Heydari Shahna","Jouni Mattila"],"pdf_url":"https://arxiv.org/pdf/2409.11828v2.pdf","comment":"This work has been submitted for possible publication in the IEEE"},{"id":"http://arxiv.org/abs/2311.13889v3","updated":"2024-09-25T09:25:48Z","published":"2023-11-23T10:20:12Z","title":"SIMBa: System Identification Methods leveraging Backpropagation","summary":" This manuscript details and extends the SIMBa toolbox (System Identification\nMethods leveraging Backpropagation) presented in previous work, which uses\nwell-established Machine Learning tools for discrete-time linear\nmulti-step-ahead state-space System Identification (SI). SIMBa leverages\nlinear-matrix-inequality-based free parametrizations of Schur matrices to\nguarantee the stability of the identified model by design. In this paper,\nbacked up by novel free parametrizations of Schur matrices, we extend the\ntoolbox to show how SIMBa can incorporate known sparsity patterns or true\nvalues of the state-space matrices to identify without jeopardizing stability.\n We extensively investigate SIMBa's behavior when identifying diverse systems\nwith various properties from both simulated and real-world data. Overall, we\nfind it consistently outperforms traditional stable subspace identification\nmethods, and sometimes significantly, especially when enforcing desired model\nproperties. These results hint at the potential of SIMBa to pave the way for\ngeneric structured nonlinear SI. The toolbox is open-sourced on\nhttps://github.com/Cemempamoi/simba.\n","authors":["Loris Di Natale","Muhammad Zakwan","Philipp Heer","Giancarlo Ferrari-Trecate","Colin N. Jones"],"pdf_url":"https://arxiv.org/pdf/2311.13889v3.pdf","comment":"First two authors contributed equally. Submitted to IEEE TCST"},{"id":"http://arxiv.org/abs/2409.16750v1","updated":"2024-09-25T09:01:39Z","published":"2024-09-25T09:01:39Z","title":"Distributed Robust Optimization Method for AC/MTDC Hybrid Power Systems\n with DC Network Cognizance","summary":" AC/multi-terminal DC (MTDC) hybrid power systems have emerged as a solution\nfor the large-scale and longdistance accommodation of power produced by\nrenewable energy systems (RESs). To ensure the optimal operation of such hybrid\npower systems, this paper addresses three key issues: system operational\nflexibility, centralized communication limitations, and RES uncertainties.\nAccordingly, a specific AC/DC optimal power flow (OPF) model and a distributed\nrobust optimization method are proposed. Firstly, we apply a set of linear\napproximation and convex relaxation techniques to formulate the mixed-integer\nconvex AC/DC OPF model. This model incorporates the DC network-cognizant\nconstraint and enables DC topology reconfiguration. Next, generalized Benders\ndecomposition (GBD) is employed to provide distributed optimization. Enhanced\napproaches are incorporated into GBD to achieve parallel computation and\nasynchronous updating. Additionally, the extreme scenario method (ESM) is\nembedded into the AC/DC OPF model to provide robust decisions to hedge against\nRES uncertainties. ESM is further extended to align the GBD procedure.\nNumerical results are finally presented to validate the effectiveness of our\nproposed method.\n","authors":["Haixiao Li","Aleksandra Lekić"],"pdf_url":"https://arxiv.org/pdf/2409.16750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16746v1","updated":"2024-09-25T08:54:42Z","published":"2024-09-25T08:54:42Z","title":"Adaptive Single-Terminal Fault Location for DC Microgrids","summary":" Identifying faulty lines and their accurate location is key for rapidly\nrestoring distribution systems. This will become a greater challenge as the\npenetration of power electronics increases, and contingencies are seen across\nlarger areas. This paper proposes a single terminal methodology (i.e., no\ncommunication involved) that is robust to variations of key parameters (e.g.,\nsampling frequency, system parameters, etc.) and performs particularly well for\nlow resistance faults that constitute the majority of faults in low voltage DC\nsystems. The proposed method uses local measurements to estimate the current\ncaused by the other terminals affected by the contingency. This mimics the\nstrategy followed by double terminal methods that require communications and\ndecouples the accuracy of the methodology from the fault resistance. The\nalgorithm takes consecutive voltage and current samples, including the\nestimated current of the other terminal, into the analysis. This mathematical\nmethodology results in a better accuracy than other single-terminal approaches\nfound in the literature. The robustness of the proposed strategy against\ndifferent fault resistances and locations is demonstrated using MATLAB\nsimulations.\n","authors":["Vaibhav Nougain","Sukumar Mishra","Joan-Marc Rodriguez-Bernuz","Adria Junyent-Ferre","Aditya Shekhar","Aleksandra Lekic"],"pdf_url":"https://arxiv.org/pdf/2409.16746v1.pdf","comment":"SEST 2024"},{"id":"http://arxiv.org/abs/2409.16743v1","updated":"2024-09-25T08:52:07Z","published":"2024-09-25T08:52:07Z","title":"Event-Triggered Non-Linear Control of Offshore MMC Grids for\n Asymmetrical AC Faults","summary":" Fault ride-through capability studies of MMC-HVDC connected wind power plants\nhave focused primarily on the DC link and onshore AC grid faults. Offshore AC\nfaults, mainly asymmetrical faults have not gained much attention in the\nliterature despite being included in the future development at national levels\nin the ENTSO-E HVDC code. The proposed work gives an event-triggered control to\nstabilize the system once the offshore AC fault has occurred, identified, and\nisolated. Different types of control actions such as proportional-integral (PI)\ncontroller and super-twisted sliding mode control (STSMC) are used to smoothly\ntransition the post-fault system to a new steady state operating point by\nsuppressing the negative sequence control. Initially, the effect of a negative\nsequence current control scheme on the transient behavior of the power system\nwith a PI controller is discussed in this paper. Further, a non-linear control\nstrategy (STSMC) is proposed which gives quicker convergence of the system\npost-fault in comparison to PI control action. These post-fault control\noperations are only triggered in the presence of a fault in the system, i.e.,\nthey are event-triggered. The validity of the proposed strategy is demonstrated\nby simulation on a $\\pm$525 kV, three-terminal meshed MMC-HVDC system model in\nReal Time Digital Simulator (RTDS).\n","authors":["Naajein Cherat","Vaibhav Nougain","Milovan Majstorović","Peter Palensky","Aleksandra Lekić"],"pdf_url":"https://arxiv.org/pdf/2409.16743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19459v4","updated":"2024-09-25T08:48:32Z","published":"2024-07-28T10:27:35Z","title":"An Alternative to Multi-Factor Authentication with a Triple-Identity\n Authentication Scheme","summary":" The existing authentication system has two entry points (i.e., username and\npassword fields) to interact with the outside, but neither of them has a\ngatekeeper, making the system vulnerable to cyberattacks. In order to ensure\nthe authentication security, the system sets a third entry point and use an\nexternal MFA service to guard it. The crux of the problem is that the system\nhas no internal mechanism to guard its own entry points as no identifiers can\nbe defined for the username and password without using any personal\ninformation. To solve this problem, we open the hash algorithm of a\ndual-password login-authentication system to three login credentials.\nTherefore, the intermediate elements of the algorithm can be used to define an\nidentifier to verify the user identity at each entry point of the system. As a\nresult of the above setup, a triple-identity authentication is established, the\nkey of which is that the readily available user's login name and password are\nrandomly converted into a matrix of meaningless hash elements which are\nconcealed, incommunicable, inaccessible, and independent of personal\ninformation. So the identifiers defined using such elements can be used by the\nsystem to verify the identities of the user at all the entry points of the\nsystem, thereby ensuring the authentication security without relying on MFA\nservices.\n","authors":["Suyun Borjigin"],"pdf_url":"https://arxiv.org/pdf/2407.19459v4.pdf","comment":"5 pages, 2 figures, 11 conferences"},{"id":"http://arxiv.org/abs/2409.16717v1","updated":"2024-09-25T08:04:43Z","published":"2024-09-25T08:04:43Z","title":"The Bayesian Separation Principle for Data-driven Control","summary":" This paper investigates the existence of a separation principle between model\nidentification and control design in the context of model predictive control.\nFirst, we elucidate that the separation principle holds asymptotically in the\nnumber of data in a Fisherian setting, and universally in a Bayesian setting.\nThen, by formulating model predictive control within a Gaussian regression\nframework, we describe how the Bayesian separation principle can be used to\nderive explicit, uncertainty-aware expressions for the control cost and optimal\ninput sequence, thereby bridging direct and indirect data-driven approaches.\n","authors":["Riccardo Alessandro Grimaldi","Giacomo Baggio","Ruggero Carli","Gianluigi Pillonetto"],"pdf_url":"https://arxiv.org/pdf/2409.16717v1.pdf","comment":"13 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.16672v1","updated":"2024-09-25T06:58:06Z","published":"2024-09-25T06:58:06Z","title":"Stochastic Shortest Path Problem with Failure Probability","summary":" We solve a sequential decision-making problem under uncertainty that takes\ninto account the failure probability of a task. This problem cannot be handled\nby the stochastic shortest path problem, which is the standard model for\nsequential decision-making. This problem is addressed by introducing dead-ends.\nConventionally, we only consider policies that minimize the probability of task\nfailure, so the optimal policy constructed could be overly conservative. In\nthis paper, we address this issue by expanding the search range to a class of\npolicies whose failure probability is less than a desired threshold. This\nproblem can be solved by treating it as a framework of a Bayesian Markov\ndecision process and a two-person zero-sum game. Also, it can be seen that the\noptimal policy is expressed in the form of a probability distribution on a set\nof deterministic policies. We also demonstrate the effectiveness of the\nproposed methods by applying them to a motion planning problem with obstacle\navoidance for a moving robot.\n","authors":["Ritsusamuel Otsubo"],"pdf_url":"https://arxiv.org/pdf/2409.16672v1.pdf","comment":"22 pages, 5 figure"},{"id":"http://arxiv.org/abs/2409.16665v1","updated":"2024-09-25T06:50:31Z","published":"2024-09-25T06:50:31Z","title":"Multirotor Nonlinear Model Predictive Control based on Visual Servoing\n of Evolving Features","summary":" This article presents a Visual Servoing Nonlinear Model Predictive Control\n(NMPC) scheme for autonomously tracking a moving target using multirotor\nUnmanned Aerial Vehicles (UAVs). The scheme is developed for surveillance and\ntracking of contour-based areas with evolving features. NMPC is used to manage\ninput and state constraints, while additional barrier functions are\nincorporated in order to ensure system safety and optimal performance. The\nproposed control scheme is designed based on the extraction and implementation\nof the full dynamic model of the features describing the target and the state\nvariables. Real-time simulations and experiments using a quadrotor UAV equipped\nwith a camera demonstrate the effectiveness of the proposed strategy.\n","authors":["Sotirios N. Aspragkathos","Panagiotis Rousseas","George C. Karras","Kostas J. Kyriakopoulos"],"pdf_url":"https://arxiv.org/pdf/2409.16665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16643v1","updated":"2024-09-25T05:43:41Z","published":"2024-09-25T05:43:41Z","title":"A Fast Dynamic Internal Predictive Power Scheduling Approach for Power\n Management in Microgrids","summary":" This paper presents a Dynamic Internal Predictive Power Scheduling (DIPPS)\napproach for optimizing power management in microgrids, particularly focusingon\nexternal power exchanges among diverse prosumers. DIPPS utilizes a dynamic\nobjective function with a time-varying binary parameter to control the timing\nof power transfers to the external grid, facilitated by efficient usage of\nenergy storage for surplus renewable power. The microgrid power scheduling\nproblem is modeled as a mixed-integer nonlinear programmig (MINLP-PS) and\nsubsequently transformed into a mixed-integer linear programming (MILP-PS)\noptimization through McCormick's relaxation to reduce the computational\ncomplexity. A predictive window with 6 data points is solved at an average of\n0.92s, a 97.6% improvement over the 38.27s required for the MINLP-PS\nformulation, implying the numerical feasibility of the DIPPS approach for\nreal-time implementation. Finally, the approach is validated against a static\nobjective using real-world load data across three case studies with different\ntime-varying parameters, demonstrationg the ability of DIPPS to optimize power\nexchanges and efficiently utilize distributed resources whie shifting the\neexternal power transfers to specified time durations.\n","authors":["Neethu Maya","Bala Kameshwar Poolla","Seshadhri Srinivasan","Narasimman Sundararajan","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2409.16643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14183v2","updated":"2024-09-25T05:14:31Z","published":"2023-10-13T22:09:52Z","title":"Towards Autonomous Supply Chains: Definition, Characteristics,\n Conceptual Framework, and Autonomy Levels","summary":" Recent global disruptions, such as the pandemic and geopolitical conflicts,\nhave profoundly exposed vulnerabilities in traditional supply chains, requiring\nexploration of more resilient alternatives. Autonomous supply chains (ASCs)\nhave emerged as a potential solution, offering increased visibility,\nflexibility, and resilience in turbulent trade environments. Despite\ndiscussions in industry and academia over several years, ASCs lack\nwell-established theoretical foundations. This paper addresses this research\ngap by presenting a formal definition of ASC along with its defining\ncharacteristics and auxiliary concepts. We propose a layered conceptual\nframework called the MIISI model. An illustrative case study focusing on the\nmeat supply chain demonstrates an initial ASC implementation based on this\nconceptual model. Additionally, we introduce a seven-level supply chain\nautonomy reference model, delineating a trajectory towards achieving a full\nsupply chain autonomy. Recognising that this work represents an initial\nendeavour, we emphasise the need for continued exploration in this emerging\ndomain. We anticipate that this work will stimulate further research, both\ntheoretical and technical, and contribute to the continual evolution of ASCs.\n","authors":["Liming Xu","Stephen Mak","Yaniv Proselkov","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2401.14183v2.pdf","comment":"This paper includes 19 pages and 8 figures and has been accepted for\n publication in the Journal of Industrial Information Integration"},{"id":"http://arxiv.org/abs/2409.16595v1","updated":"2024-09-25T03:40:01Z","published":"2024-09-25T03:40:01Z","title":"Robo-Platform: A Robotic System for Recording Sensors and Controlling\n Robots","summary":" Mobile smartphones compactly provide sensors such as cameras, IMUs, GNSS\nmeasurement units, and wireless and wired communication channels required for\nrobotics projects. They are affordable, portable, and programmable, which makes\nthem ideal for testing, data acquisition, controlling mobile robots, and many\nother robotic applications. A robotic system is proposed in this paper,\nconsisting of an Android phone, a microcontroller board attached to the phone\nvia USB, and a remote wireless controller station. In the data acquisition\nmode, the Android device can record a dataset of a diverse configuration of\nmultiple cameras, IMUs, GNSS units, and external USB ADC channels in the rawest\nformat used for, but not limited to, pose estimation and scene reconstruction\napplications. In robot control mode, the Android phone, a microcontroller\nboard, and other peripherals constitute the mobile or stationary robotic\nsystem. This system is controlled using a remote server connected over Wi-Fi or\nBluetooth. Experiments show that although the SLAM and AR applications can\nutilize the acquired data, the proposed system can pave the way for more\nadvanced algorithms for processing these noisy and sporadic measurements.\nMoreover, the characteristics of the communication media are studied, and two\nexample robotic projects, which involve controlling a toy car and a quadcopter,\nare included.\n","authors":["Masoud Dayani Najafabadi"],"pdf_url":"https://arxiv.org/pdf/2409.16595v1.pdf","comment":"Project repository: https://github.com/m-dayani/robo-platform Youtube\n Video: https://youtu.be/BTQ4yLB1bak Dataset:\n https://drive.google.com/drive/folders/1OZqdA1xa-SyJ64qL_TibqhtwhR1fWWrx?usp=sharing"},{"id":"http://arxiv.org/abs/2409.08695v3","updated":"2024-09-25T03:34:45Z","published":"2024-09-13T10:27:27Z","title":"Precision Aquaculture: An Integrated Computer Vision and IoT Approach\n for Optimized Tilapia Feeding","summary":" Traditional fish farming practices often lead to inefficient feeding,\nresulting in environmental issues and reduced productivity. We developed an\ninnovative system combining computer vision and IoT technologies for precise\nTilapia feeding. Our solution uses real-time IoT sensors to monitor water\nquality parameters and computer vision algorithms to analyze fish size and\ncount, determining optimal feed amounts. A mobile app enables remote monitoring\nand control. We utilized YOLOv8 for keypoint detection to measure Tilapia\nweight from length, achieving \\textbf{94\\%} precision on 3,500 annotated\nimages. Pixel-based measurements were converted to centimeters using depth\nestimation for accurate feeding calculations. Our method, with data collection\nmirroring inference conditions, significantly improved results. Preliminary\nestimates suggest this approach could increase production up to 58 times\ncompared to traditional farms. Our models, code, and dataset are\nopen-source~\\footnote{The code, dataset, and models are available upon\nreasonable request.\n","authors":["Rania Hossam","Ahmed Heakl","Walid Gomaa"],"pdf_url":"https://arxiv.org/pdf/2409.08695v3.pdf","comment":"8 pages, 6 figures, 3 tables, 21th International Conference on\n Informatics in Control, Automation, and Robotics"},{"id":"http://arxiv.org/abs/2409.16583v1","updated":"2024-09-25T03:24:25Z","published":"2024-09-25T03:24:25Z","title":"$\\mathcal{L}_{1}$ Adaptive Optimizer for Uncertain Time-Varying Convex\n Optimization","summary":" We propose an adaptive method for uncertain time-varying (TV) convex\noptimization, termed as $\\mathcal{L}_{1}$ adaptive optimization\n($\\mathcal{L}_{1}$-AO). The proposed method uses a baseline TV optimizer with a\nprediction model, designed for the gradient dynamics to exploit the underlying\nstructure of the temporal correlation. Inspired by $\\mathcal{L}_{1}$ adaptive\ncontrol, the proposed method augments an adaptive update law to estimate and\ncompensate for the uncertainty from the inaccurate prediction in the online\nimplementation. The proposed method provides the performance bounds of the\nerror in the optimization variables and cost function, allowing efficient and\nreliable optimization for uncertain TV problems.\n","authors":["Jinrae Kim","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2409.16583v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.15177v2","updated":"2024-09-25T02:29:36Z","published":"2023-12-23T06:50:06Z","title":"Stochastic Data-Driven Predictive Control with Equivalence to Stochastic\n MPC","summary":" We propose a data-driven receding-horizon control method dealing with the\nchance-constrained output-tracking problem of unknown stochastic linear\ntime-invariant (LTI) systems with partial state observation. The proposed\nmethod takes into account the statistics of the process noise, the measurement\nnoise and the uncertain initial condition, following an analogous framework to\nStochastic Model Predictive Control (SMPC), but does not rely on the use of a\nparametric system model. As such, our receding-horizon algorithm produces a\nsequence of closed-loop control policies for predicted time steps, as opposed\nto a sequence of open-loop control actions. Under certain conditions, we\nestablish that our proposed data-driven control method produces identical\ncontrol inputs as that produced by the associated model-based SMPC. Simulation\nresults on a grid-connected power converter are provided to illustrate the\nperformance benefits of our methodology.\n","authors":["Ruiqi Li","John W. Simpson-Porco","Stephen L. Smith"],"pdf_url":"https://arxiv.org/pdf/2312.15177v2.pdf","comment":"20 pages, 4 figures. The extended version of a submission to IEEE\n Transactions on Automatic Control"},{"id":"http://arxiv.org/abs/2409.16552v1","updated":"2024-09-25T01:52:10Z","published":"2024-09-25T01:52:10Z","title":"Device for detection of activity-dependent changes in neural spheroids\n at MHz and GHz frequencies","summary":" Intracellular processes triggered by neural activity include changes in ionic\nconcentrations, protein release, and synaptic vesicle cycling. These processes\nplay significant roles in neurological disorders. The beneficial effects of\nbrain stimulation may also be mediated through intracellular changes. There is\na lack of label-free techniques for monitoring activity-dependent intracellular\nchanges. Electromagnetic (EM) waves at frequencies larger than 1x10^6 Hz (1\nMHz) were previously used to probe intracellular contents of cells, as cell\nmembrane becomes transparent at this frequency range. EM waves interact with\nmembranes of intracellular organelles, proteins, and water in the MHz-GHz\nrange. In this work, we developed a device for probing the interaction between\nintracellular contents of active neurons and EM waves. The device used an array\nof grounded coplanar waveguides (GCPWs) to deliver EM waves to a\nthree-dimensional (3D) spheroid of rat cortical neurons. Neural activity was\nevoked using optogenetics, with synchronous detection of propagation of EM\nwaves. Broadband measurements were conducted in the MHz-GHz range to track\nchanges in transmission coefficients. Neuronal activity was found to reversibly\nalter EM wave transmission. Pharmacological suppression of neuronal activity\nabolished changes in transmission. Time constants of changes in transmission\nwere in the range of seconds to tens of seconds, suggesting the presence of\nrelatively slow, activity-dependent intracellular processes. This study\nprovides the first evidence that EM transmission through neuronal tissue is\nactivity-dependent in MHz-GHz range. Device developed in this work may find\nfuture applications in studies of the mechanisms of neurological disorders and\nthe development of new therapies.\n","authors":["Saeed Omidi","Gianluca Fabi","Xiaopeng Wang","James C. M. Hwang","Yevgeny Berdichevsky"],"pdf_url":"https://arxiv.org/pdf/2409.16552v1.pdf","comment":null}],"Computation and Language":[{"id":"http://arxiv.org/abs/2409.17433v1","updated":"2024-09-25T23:52:17Z","published":"2024-09-25T23:52:17Z","title":"HDFlow: Enhancing LLM Complex Problem-Solving with Hybrid Thinking and\n Dynamic Workflows","summary":" Despite recent advancements in large language models (LLMs), their\nperformance on complex reasoning problems requiring multi-step thinking and\ncombining various skills is still limited. To address this, we propose a novel\nframework HDFlow for complex reasoning with LLMs that combines fast and slow\nthinking modes in an adaptive manner. Our approach consists of two key\ncomponents: 1) a new approach for slow, deliberate reasoning called Dynamic\nWorkflow, which automatically decomposes complex problems into more manageable\nsub-tasks and dynamically designs a workflow to assemble specialized LLM or\nsymbolic reasoning tools to solve sub-tasks; 2) Hybrid Thinking, a general\nframework that dynamically combines fast and slow thinking based on problem\ncomplexity. Finally, we propose an easy-to-scale method for automatically\nsynthesizing a large-scale dataset of 27K challenging reasoning problems for\ncomplex reasoning and a hybrid thinking tuning method that trains smaller LLMs\non this dataset to internalize the fast/slow hybrid reasoning strategies.\nExperiments on four reasoning benchmark datasets demonstrate that our slow\nthinking with dynamic workflows significantly outperforms Chain-of-Thought, and\nhybrid thinking achieves the highest accuracy while providing an effective\nbalance between computational efficiency and performance. Fine-tuning using our\nhybrid thinking approach also significantly boosts the complex reasoning\ncapabilities of open-source language models. The results showcase the promise\nof slow thinking, dynamic workflows, and hybrid thinking in expanding the\nfrontier of complex problem-solving with LLMs\\footnote{Code and data will be\nreleased at \\url{https://github.com/wenlinyao/HDFlow}.}.\n","authors":["Wenlin Yao","Haitao Mi","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2409.17433v1.pdf","comment":"27 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17431v1","updated":"2024-09-25T23:38:15Z","published":"2024-09-25T23:38:15Z","title":"On Extending Direct Preference Optimization to Accommodate Ties","summary":" We derive and investigate two DPO variants that explicitly model the\npossibility of declaring a tie in pair-wise comparisons. We replace the\nBradley-Terry model in DPO with two well-known modeling extensions, by Rao and\nKupper and by Davidson, that assign probability to ties as alternatives to\nclear preferences. Our experiments in neural machine translation and\nsummarization show that explicitly labeled ties can be added to the datasets\nfor these DPO variants without the degradation in task performance that is\nobserved when the same tied pairs are presented to DPO. We find empirically\nthat the inclusion of ties leads to stronger regularization with respect to the\nreference policy as measured by KL divergence, and we see this even for DPO in\nits original form. These findings motivate and enable the inclusion of tied\npairs in preference optimization as opposed to simply discarding them.\n","authors":["Jinghong Chen","Guangyu Yang","Weizhe Lin","Jingbiao Mei","Bill Byrne"],"pdf_url":"https://arxiv.org/pdf/2409.17431v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2409.17422v1","updated":"2024-09-25T23:14:47Z","published":"2024-09-25T23:14:47Z","title":"Discovering the Gems in Early Layers: Accelerating Long-Context LLMs\n with 1000x Input Token Reduction","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nhandling long context inputs, but this comes at the cost of increased\ncomputational resources and latency. Our research introduces a novel approach\nfor the long context bottleneck to accelerate LLM inference and reduce GPU\nmemory consumption. Our research demonstrates that LLMs can identify relevant\ntokens in the early layers before generating answers to a query. Leveraging\nthis insight, we propose an algorithm that uses early layers of an LLM as\nfilters to select and compress input tokens, significantly reducing the context\nlength for subsequent processing. Our method, GemFilter, demonstrates\nsubstantial improvements in both speed and memory efficiency compared to\nexisting techniques, such as standard attention and SnapKV/H2O. Notably, it\nachieves a 2.4$\\times$ speedup and 30\\% reduction in GPU memory usage compared\nto SOTA methods. Evaluation on the Needle in a Haystack task shows that\nGemFilter significantly outperforms standard attention, SnapKV and demonstrates\ncomparable performance on the LongBench challenge. GemFilter is simple,\ntraining-free, and broadly applicable across different LLMs. Crucially, it\nprovides interpretability by allowing humans to inspect the selected input\nsequence. These findings not only offer practical benefits for LLM deployment,\nbut also enhance our understanding of LLM internal mechanisms, paving the way\nfor further optimizations in LLM design and inference. Our code is available at\n\\url{https://github.com/SalesforceAIResearch/GemFilter}.\n","authors":["Zhenmei Shi","Yifei Ming","Xuan-Phi Nguyen","Yingyu Liang","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2409.17422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17419v1","updated":"2024-09-25T23:06:55Z","published":"2024-09-25T23:06:55Z","title":"Pre-Finetuning with Impact Duration Awareness for Stock Movement\n Prediction","summary":" Understanding the duration of news events' impact on the stock market is\ncrucial for effective time-series forecasting, yet this facet is largely\noverlooked in current research. This paper addresses this research gap by\nintroducing a novel dataset, the Impact Duration Estimation Dataset (IDED),\nspecifically designed to estimate impact duration based on investor opinions.\nOur research establishes that pre-finetuning language models with IDED can\nenhance performance in text-based stock movement predictions. In addition, we\njuxtapose our proposed pre-finetuning task with sentiment analysis\npre-finetuning, further affirming the significance of learning impact duration.\nOur findings highlight the promise of this novel research direction in stock\nmovement prediction, offering a new avenue for financial forecasting. We also\nprovide the IDED and pre-finetuned language models under the CC BY-NC-SA 4.0\nlicense for academic use, fostering further exploration in this field.\n","authors":["Chr-Jr Chiu","Chung-Chi Chen","Hen-Hsen Huang","Hsin-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17419v1.pdf","comment":"NTCIR-18 FinArg-2 Dataset"},{"id":"http://arxiv.org/abs/2409.17417v1","updated":"2024-09-25T23:00:20Z","published":"2024-09-25T23:00:20Z","title":"Enhancing Investment Opinion Ranking through Argument-Based Sentiment\n Analysis","summary":" In the era of rapid Internet and social media platform development,\nindividuals readily share their viewpoints online. The overwhelming quantity of\nthese posts renders comprehensive analysis impractical. This necessitates an\nefficient recommendation system to filter and present significant, relevant\nopinions. Our research introduces a dual-pronged argument mining technique to\nimprove recommendation system effectiveness, considering both professional and\namateur investor perspectives. Our first strategy involves using the\ndiscrepancy between target and closing prices as an opinion indicator. The\nsecond strategy applies argument mining principles to score investors'\nopinions, subsequently ranking them by these scores. Experimental results\nconfirm the effectiveness of our approach, demonstrating its ability to\nidentify opinions with higher profit potential. Beyond profitability, our\nresearch extends to risk analysis, examining the relationship between\nrecommended opinions and investor behaviors. This offers a holistic view of\npotential outcomes following the adoption of these recommended opinions.\n","authors":["Chung-Chi Chen","Hen-Hsen Huang","Hsin-Hsi Chen","Hiroya Takamura","Ichiro Kobayashi","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2409.17417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17416v1","updated":"2024-09-25T22:57:29Z","published":"2024-09-25T22:57:29Z","title":"From Deception to Detection: The Dual Roles of Large Language Models in\n Fake News","summary":" Fake news poses a significant threat to the integrity of information\necosystems and public trust. The advent of Large Language Models (LLMs) holds\nconsiderable promise for transforming the battle against fake news. Generally,\nLLMs represent a double-edged sword in this struggle. One major concern is that\nLLMs can be readily used to craft and disseminate misleading information on a\nlarge scale. This raises the pressing questions: Can LLMs easily generate\nbiased fake news? Do all LLMs have this capability? Conversely, LLMs offer\nvaluable prospects for countering fake news, thanks to their extensive\nknowledge of the world and robust reasoning capabilities. This leads to other\ncritical inquiries: Can we use LLMs to detect fake news, and do they outperform\ntypical detection models? In this paper, we aim to address these pivotal\nquestions by exploring the performance of various LLMs. Our objective is to\nexplore the capability of various LLMs in effectively combating fake news,\nmarking this as the first investigation to analyze seven such models. Our\nresults reveal that while some models adhere strictly to safety protocols,\nrefusing to generate biased or misleading content, other models can readily\nproduce fake news across a spectrum of biases. Additionally, our results show\nthat larger models generally exhibit superior detection abilities and that\nLLM-generated fake news are less likely to be detected than human-written ones.\nFinally, our findings demonstrate that users can benefit from LLM-generated\nexplanations in identifying fake news.\n","authors":["Dorsaf Sallami","Yuan-Chen Chang","Esma Aïmeur"],"pdf_url":"https://arxiv.org/pdf/2409.17416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17407v1","updated":"2024-09-25T22:30:42Z","published":"2024-09-25T22:30:42Z","title":"Post-hoc Reward Calibration: A Case Study on Length Bias","summary":" Reinforcement Learning from Human Feedback aligns the outputs of Large\nLanguage Models with human values and preferences. Central to this process is\nthe reward model (RM), which translates human feedback into training signals\nfor optimising LLM behaviour. However, RMs can develop biases by exploiting\nspurious correlations in their training data, such as favouring outputs based\non length or style rather than true quality. These biases can lead to incorrect\noutput rankings, sub-optimal model evaluations, and the amplification of\nundesirable behaviours in LLMs alignment. This paper addresses the challenge of\ncorrecting such biases without additional data and training, introducing the\nconcept of Post-hoc Reward Calibration. We first propose an intuitive approach\nto estimate the bias term and, thus, remove it to approximate the underlying\ntrue reward. We then extend the approach to a more general and robust form with\nthe Locally Weighted Regression. Focusing on the prevalent length bias, we\nvalidate our proposed approaches across three experimental settings,\ndemonstrating consistent improvements: (1) a 3.11 average performance gain\nacross 33 reward models on the RewardBench dataset; (2) enhanced alignment of\nRM rankings with GPT-4 evaluations and human preferences based on the\nAlpacaEval benchmark; and (3) improved Length-Controlled win rate of the RLHF\nprocess in multiple LLM--RM combinations. Our method is computationally\nefficient and generalisable to other types of bias and RMs, offering a scalable\nand robust solution for mitigating biases in LLM alignment. Our code and\nresults are available at https://github.com/ZeroYuHuang/Reward-Calibration.\n","authors":["Zeyu Huang","Zihan Qiu","Zili Wang","Edoardo M. Ponti","Ivan Titov"],"pdf_url":"https://arxiv.org/pdf/2409.17407v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.13221v2","updated":"2024-09-25T22:28:06Z","published":"2024-09-20T05:15:38Z","title":"RLHFuse: Efficient RLHF Training for Large Language Models with Inter-\n and Intra-Stage Fusion","summary":" Reinforcement Learning from Human Feedback (RLHF) enhances the alignment\nbetween LLMs and human preference. The workflow of RLHF typically involves\nseveral models and tasks in a series of distinct stages. Existing RLHF training\nsystems view each task as the smallest execution unit thus overlooking the\nopportunities for subtask-level optimizations. Due to the intrinsic nature of\nRLHF training, i.e., the data skewness in the generation stage, and the\npipeline bubbles in the training stage, existing RLHF systems suffer from low\nGPU utilization in production deployments.\n RLHFuse breaks the traditional view of RLHF workflow as a composition of\nindividual tasks, splitting each task into finer-grained subtasks, and\nperforming stage fusion to improve GPU utilization. RLHFuse contains two key\nideas. First, for generation and inference tasks, RLHFuse splits them into\nsample-level subtasks, enabling efficient inter-stage fusion to mitigate the\noriginal generation bottleneck dominated by long-tailed samples. Second, for\ntraining tasks, RLHFuse breaks them into subtasks of micro-batches. By\nleveraging the intuition that pipeline execution can be essentially\ncomplemented by another pipeline, RLHFuse performs intra-stage fusion to\nconcurrently execute these subtasks in the training stage with a fused pipeline\nschedule, resulting in fewer pipeline bubbles. In addition, RLHFuse\nincorporates a series of system optimizations tailored for each stage of RLHF,\nmaking it efficient and scalable for our internal product usage. We evaluate\nRLHFuse on various popular LLMs and the results show that RLHFuse increases the\ntraining throughput by up to 3.7x, compared to existing state-of-the-art\nsystems.\n","authors":["Yinmin Zhong","Zili Zhang","Bingyang Wu","Shengyu Liu","Yukun Chen","Changyi Wan","Hanpeng Hu","Lei Xia","Ranchen Ming","Yibo Zhu","Xin Jin"],"pdf_url":"https://arxiv.org/pdf/2409.13221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17397v1","updated":"2024-09-25T22:14:34Z","published":"2024-09-25T22:14:34Z","title":"Severity Prediction in Mental Health: LLM-based Creation, Analysis,\n Evaluation of a Novel Multilingual Dataset","summary":" Large Language Models (LLMs) are increasingly integrated into various medical\nfields, including mental health support systems. However, there is a gap in\nresearch regarding the effectiveness of LLMs in non-English mental health\nsupport applications. To address this problem, we present a novel multilingual\nadaptation of widely-used mental health datasets, translated from English into\nsix languages (Greek, Turkish, French, Portuguese, German, and Finnish). This\ndataset enables a comprehensive evaluation of LLM performance in detecting\nmental health conditions and assessing their severity across multiple\nlanguages. By experimenting with GPT and Llama, we observe considerable\nvariability in performance across languages, despite being evaluated on the\nsame translated dataset. This inconsistency underscores the complexities\ninherent in multilingual mental health support, where language-specific nuances\nand mental health data coverage can affect the accuracy of the models. Through\ncomprehensive error analysis, we emphasize the risks of relying exclusively on\nlarge language models (LLMs) in medical settings (e.g., their potential to\ncontribute to misdiagnoses). Moreover, our proposed approach offers significant\ncost savings for multilingual tasks, presenting a major advantage for\nbroad-scale implementation.\n","authors":["Konstantinos Skianis","John Pavlopoulos","A. Seza Doğruöz"],"pdf_url":"https://arxiv.org/pdf/2409.17397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17373v1","updated":"2024-09-25T21:32:57Z","published":"2024-09-25T21:32:57Z","title":"data2lang2vec: Data Driven Typological Features Completion","summary":" Language typology databases enhance multi-lingual Natural Language Processing\n(NLP) by improving model adaptability to diverse linguistic structures. The\nwidely-used lang2vec toolkit integrates several such databases, but its\ncoverage remains limited at 28.9\\%. Previous work on automatically increasing\ncoverage predicts missing values based on features from other languages or\nfocuses on single features, we propose to use textual data for better-informed\nfeature prediction. To this end, we introduce a multi-lingual Part-of-Speech\n(POS) tagger, achieving over 70\\% accuracy across 1,749 languages, and\nexperiment with external statistical features and a variety of machine learning\nalgorithms. We also introduce a more realistic evaluation setup, focusing on\nlikely to be missing typology features, and show that our approach outperforms\nprevious work in both setups.\n","authors":["Hamidreza Amirzadeh","Sadegh Jafari","Anika Harju","Rob van der Goot"],"pdf_url":"https://arxiv.org/pdf/2409.17373v1.pdf","comment":"9 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.05840v3","updated":"2024-09-25T20:50:00Z","published":"2024-08-11T18:22:12Z","title":"Iterative Improvement of an Additively Regularized Topic Model","summary":" Topic modelling is fundamentally a soft clustering problem (of known objects\n-- documents, over unknown clusters -- topics). That is, the task is\nincorrectly posed. In particular, the topic models are unstable and incomplete.\nAll this leads to the fact that the process of finding a good topic model\n(repeated hyperparameter selection, model training, and topic quality\nassessment) can be particularly long and labor-intensive. We aim to simplify\nthe process, to make it more deterministic and provable. To this end, we\npresent a method for iterative training of a topic model. The essence of the\nmethod is that a series of related topic models are trained so that each\nsubsequent model is at least as good as the previous one, i.e., that it retains\nall the good topics found earlier. The connection between the models is\nachieved by additive regularization. The result of this iterative training is\nthe last topic model in the series, which we call the iteratively updated\nadditively regularized topic model (ITAR). Experiments conducted on several\ncollections of natural language texts show that the proposed ITAR model\nperforms better than other popular topic models (LDA, ARTM, BERTopic), its\ntopics are diverse, and its perplexity (ability to \"explain\" the underlying\ndata) is moderate.\n","authors":["Alex Gorbulev","Vasiliy Alekseev","Konstantin Vorontsov"],"pdf_url":"https://arxiv.org/pdf/2408.05840v3.pdf","comment":"Make the last little additions to the draft"},{"id":"http://arxiv.org/abs/2409.17326v1","updated":"2024-09-25T20:05:45Z","published":"2024-09-25T20:05:45Z","title":"How Transliterations Improve Crosslingual Alignment","summary":" Recent studies have shown that post-aligning multilingual pretrained language\nmodels (mPLMs) using alignment objectives on both original and transliterated\ndata can improve crosslingual alignment. This improvement further leads to\nbetter crosslingual transfer performance. However, it remains unclear how and\nwhy a better crosslingual alignment is achieved, as this technique only\ninvolves transliterations, and does not use any parallel data. This paper\nattempts to explicitly evaluate the crosslingual alignment and identify the key\nelements in transliteration-based approaches that contribute to better\nperformance. For this, we train multiple models under varying setups for two\npairs of related languages: (1) Polish and Ukrainian and (2) Hindi and Urdu. To\nassess alignment, we define four types of similarities based on sentence\nrepresentations. Our experiments show that adding transliterations alone\nimproves the overall similarities, even for random sentence pairs. With the\nhelp of auxiliary alignment objectives, especially the contrastive objective,\nthe model learns to distinguish matched from random pairs, leading to better\nalignments. However, we also show that better alignment does not always yield\nbetter downstream performance, suggesting that further research is needed to\nclarify the connection between alignment and performance.\n","authors":["Yihong Liu","Mingyang Wang","Amir Hossein Kargaran","Ayyoob Imani","Orgest Xhelili","Haotian Ye","Chunlan Ma","François Yvon","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2409.17326v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.17313v1","updated":"2024-09-25T19:49:39Z","published":"2024-09-25T19:49:39Z","title":"Navigating the Nuances: A Fine-grained Evaluation of Vision-Language\n Navigation","summary":" This study presents a novel evaluation framework for the Vision-Language\nNavigation (VLN) task. It aims to diagnose current models for various\ninstruction categories at a finer-grained level. The framework is structured\naround the context-free grammar (CFG) of the task. The CFG serves as the basis\nfor the problem decomposition and the core premise of the instruction\ncategories design. We propose a semi-automatic method for CFG construction with\nthe help of Large-Language Models (LLMs). Then, we induct and generate data\nspanning five principal instruction categories (i.e. direction change, landmark\nrecognition, region recognition, vertical movement, and numerical\ncomprehension). Our analysis of different models reveals notable performance\ndiscrepancies and recurrent issues. The stagnation of numerical comprehension,\nheavy selective biases over directional concepts, and other interesting\nfindings contribute to the development of future language-guided navigation\nsystems.\n","authors":["Zehao Wang","Minye Wu","Yixin Cao","Yubo Ma","Meiqi Chen","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2409.17313v1.pdf","comment":"EMNLP 2024 Findings; project page:\n https://zehao-wang.github.io/navnuances"},{"id":"http://arxiv.org/abs/2409.17312v1","updated":"2024-09-25T19:46:49Z","published":"2024-09-25T19:46:49Z","title":"BabyLlama-2: Ensemble-Distilled Models Consistently Outperform Teachers\n With Limited Data","summary":" We present BabyLlama-2, a 345 million parameter model distillation-pretrained\nfrom two teachers on a 10 million word corpus for the BabyLM competition. On\nBLiMP and SuperGLUE benchmarks, BabyLlama-2 outperforms baselines trained on\nboth 10 and 100 million word datasets with the same data mix, as well as its\nteacher models. Through an extensive hyperparameter sweep, we demonstrate that\nthe advantages of distillation cannot be attributed to suboptimal\nhyperparameter selection of the teachers. Our findings underscore the need for\nfurther investigation into distillation techniques, particularly in\ndata-limited settings.\n","authors":["Jean-Loup Tastet","Inar Timiryasov"],"pdf_url":"https://arxiv.org/pdf/2409.17312v1.pdf","comment":"9 pages, 3 figures, 5 tables, submitted to the BabyLM Challenge\n (CoNLL 2024 Shared Task)"},{"id":"http://arxiv.org/abs/2404.18923v3","updated":"2024-09-25T19:18:16Z","published":"2024-04-29T17:58:36Z","title":"Holmes: A Benchmark to Assess the Linguistic Competence of Language\n Models","summary":" We introduce Holmes, a new benchmark designed to assess language models (LMs)\nlinguistic competence - their unconscious understanding of linguistic\nphenomena. Specifically, we use classifier-based probing to examine LMs'\ninternal representations regarding distinct linguistic phenomena (e.g.,\npart-of-speech tagging). As a result, we meet recent calls to disentangle LMs'\nlinguistic competence from other cognitive abilities, such as following\ninstructions in prompting-based evaluations. Composing Holmes, we review over\n270 probing studies and include more than 200 datasets to assess syntax,\nmorphology, semantics, reasoning, and discourse phenomena. Analyzing over 50\nLMs reveals that, aligned with known trends, their linguistic competence\ncorrelates with model size. However, surprisingly, model architecture and\ninstruction tuning also significantly influence performance, particularly in\nmorphology and syntax. Finally, we propose FlashHolmes, a streamlined version\nthat reduces the computation load while maintaining high-ranking precision.\n","authors":["Andreas Waldis","Yotam Perlitz","Leshem Choshen","Yufang Hou","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2404.18923v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10147v2","updated":"2024-09-25T19:16:16Z","published":"2024-08-19T16:47:46Z","title":"In-Context Learning with Representations: Contextual Generalization of\n Trained Transformers","summary":" In-context learning (ICL) refers to a remarkable capability of pretrained\nlarge language models, which can learn a new task given a few examples during\ninference. However, theoretical understanding of ICL is largely under-explored,\nparticularly whether transformers can be trained to generalize to unseen\nexamples in a prompt, which will require the model to acquire contextual\nknowledge of the prompt for generalization. This paper investigates the\ntraining dynamics of transformers by gradient descent through the lens of\nnon-linear regression tasks. The contextual generalization here can be attained\nvia learning the template function for each task in-context, where all template\nfunctions lie in a linear space with $m$ basis functions. We analyze the\ntraining dynamics of one-layer multi-head transformers to in-contextly predict\nunlabeled inputs given partially labeled prompts, where the labels contain\nGaussian noise and the number of examples in each prompt are not sufficient to\ndetermine the template. Under mild assumptions, we show that the training loss\nfor a one-layer multi-head transformer converges linearly to a global minimum.\nMoreover, the transformer effectively learns to perform ridge regression over\nthe basis functions. To our knowledge, this study is the first provable\ndemonstration that transformers can learn contextual (i.e., template)\ninformation to generalize to both unseen examples and tasks when prompts\ncontain only a small number of query-answer pairs.\n","authors":["Tong Yang","Yu Huang","Yingbin Liang","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2408.10147v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.12327v2","updated":"2024-09-25T19:11:20Z","published":"2024-07-17T05:53:20Z","title":"Spectra: A Comprehensive Study of Ternary, Quantized, and FP16 Language\n Models","summary":" Post-training quantization is the leading method for addressing\nmemory-related bottlenecks in LLM inference, but unfortunately, it suffers from\nsignificant performance degradation below 4-bit precision. An alternative\napproach involves training compressed models directly at a low bitwidth (e.g.,\nbinary or ternary models). However, the performance, training dynamics, and\nscaling trends of such models are not yet well understood. To address this\nissue, we train and openly release the Spectra LLM suite consisting of 54\nlanguage models ranging from 99M to 3.9B parameters, trained on 300B tokens.\nSpectra includes FloatLMs, post-training quantized QuantLMs (3, 4, 6, and 8\nbits), and ternary LLMs (TriLMs) - our improved architecture for ternary\nlanguage modeling, which significantly outperforms previously proposed ternary\nmodels of a given size (in bits), matching half-precision models at scale. For\nexample, TriLM 3.9B is (bit-wise) smaller than the half-precision FloatLM 830M,\nbut matches half-precision FloatLM 3.9B in commonsense reasoning and knowledge\nbenchmarks. However, TriLM 3.9B is also as toxic and stereotyping as FloatLM\n3.9B, a model six times larger in size. Additionally, TriLM 3.9B lags behind\nFloatLM in perplexity on validation splits and web-based corpora but performs\nbetter on less noisy datasets like Lambada and PennTreeBank.\n To enhance understanding of low-bitwidth models, we are releasing 500+\nintermediate checkpoints of the Spectra suite at\n\\href{https://github.com/NolanoOrg/SpectraSuite}{https://github.com/NolanoOrg/SpectraSuite}.\n","authors":["Ayush Kaushal","Tejas Vaidhya","Tejas Pandey","Aaryan Bhagat","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2407.12327v2.pdf","comment":"32 pages, 12 figures, and 10 tables"},{"id":"http://arxiv.org/abs/2409.15567v2","updated":"2024-09-25T18:50:52Z","published":"2024-09-23T21:48:32Z","title":"Asking an AI for salary negotiation advice is a matter of concern:\n Controlled experimental perturbation of ChatGPT for protected and\n non-protected group discrimination on a contextual task with no clear ground\n truth answers","summary":" We conducted controlled experimental bias audits for four versions of\nChatGPT, which we asked to recommend an opening offer in salary negotiations\nfor a new hire. We submitted 98,800 prompts to each version, systematically\nvarying the employee's gender, university, and major, and tested prompts in\nvoice of each side of the negotiation: the employee versus employer. We find\nChatGPT as a multi-model platform is not robust and consistent enough to be\ntrusted for such a task. We observed statistically significant salary offers\nwhen varying gender for all four models, although with smaller gaps than for\nother attributes tested. The largest gaps were different model versions and\nbetween the employee- vs employer-voiced prompts. We also observed substantial\ngaps when varying university and major, but many of the biases were not\nconsistent across model versions. We tested for fictional and fraudulent\nuniversities and found wildly inconsistent results across cases and model\nversions. We make broader contributions to the AI/ML fairness literature. Our\nscenario and our experimental design differ from mainstream AI/ML auditing\nefforts in key ways. Bias audits typically test discrimination for protected\nclasses like gender, which we contrast with testing non-protected classes of\nuniversity and major. Asking for negotiation advice includes how aggressive one\nought to be in a negotiation relative to known empirical salary distributions\nand scales, which is a deeply contextual and personalized task that has no\nobjective ground truth to validate. These results raise concerns for the\nspecific model versions we tested and ChatGPT as a multi-model platform in\ncontinuous development. Our epistemology does not permit us to definitively\ncertify these models as either generally biased or unbiased on the attributes\nwe test, but our study raises matters of concern for stakeholders to further\ninvestigate.\n","authors":["R. Stuart Geiger","Flynn O'Sullivan","Elsie Wang","Jonathan Lo"],"pdf_url":"https://arxiv.org/pdf/2409.15567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17270v1","updated":"2024-09-25T18:35:45Z","published":"2024-09-25T18:35:45Z","title":"Proof of Thought : Neurosymbolic Program Synthesis allows Robust and\n Interpretable Reasoning","summary":" Large Language Models (LLMs) have revolutionized natural language processing,\nyet they struggle with inconsistent reasoning, particularly in novel domains\nand complex logical sequences. This research introduces Proof of Thought, a\nframework that enhances the reliability and transparency of LLM outputs. Our\napproach bridges LLM-generated ideas with formal logic verification, employing\na custom interpreter to convert LLM outputs into First Order Logic constructs\nfor theorem prover scrutiny. Central to our method is an intermediary\nJSON-based Domain-Specific Language, which by design balances precise logical\nstructures with intuitive human concepts. This hybrid representation enables\nboth rigorous validation and accessible human comprehension of LLM reasoning\nprocesses. Key contributions include a robust type system with sort management\nfor enhanced logical integrity, explicit representation of rules for clear\ndistinction between factual and inferential knowledge, and a flexible\narchitecture that allows for easy extension to various domain-specific\napplications. We demonstrate Proof of Thought's effectiveness through\nbenchmarking on StrategyQA and a novel multimodal reasoning task, showing\nimproved performance in open-ended scenarios. By providing verifiable and\ninterpretable results, our technique addresses critical needs for AI system\naccountability and sets a foundation for human-in-the-loop oversight in\nhigh-stakes domains.\n","authors":["Debargha Ganguly","Srinivasan Iyengar","Vipin Chaudhary","Shivkumar Kalyanaraman"],"pdf_url":"https://arxiv.org/pdf/2409.17270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09580v3","updated":"2024-09-25T18:30:28Z","published":"2023-11-16T05:31:21Z","title":"MMoE: Enhancing Multimodal Models with Mixtures of Multimodal\n Interaction Experts","summary":" Advances in multimodal models have greatly improved how interactions relevant\nto various tasks are modeled. Today's multimodal models mainly focus on the\ncorrespondence between images and text, using this for tasks like image-text\nmatching. However, this covers only a subset of real-world interactions. Novel\ninteractions, such as sarcasm expressed through opposing spoken words and\ngestures or humor expressed through utterances and tone of voice, remain\nchallenging. In this paper, we introduce an approach to enhance multimodal\nmodels, which we call Multimodal Mixtures of Experts (MMoE). The key idea in\nMMoE is to train separate expert models for each type of multimodal\ninteraction, such as redundancy present in both modalities, uniqueness in one\nmodality, or synergy that emerges when both modalities are fused. On a sarcasm\ndetection task (MUStARD) and a humor detection task (URFUNNY), we obtain new\nstate-of-the-art results. MMoE is also able to be applied to various types of\nmodels to gain improvement.\n","authors":["Haofei Yu","Zhengyang Qi","Lawrence Jang","Ruslan Salakhutdinov","Louis-Philippe Morency","Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2311.09580v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17146v1","updated":"2024-09-25T17:59:51Z","published":"2024-09-25T17:59:51Z","title":"Molmo and PixMo: Open Weights and Open Data for State-of-the-Art\n Multimodal Models","summary":" Today's most advanced multimodal models remain proprietary. The strongest\nopen-weight models rely heavily on synthetic data from proprietary VLMs to\nachieve good performance, effectively distilling these closed models into open\nones. As a result, the community is still missing foundational knowledge about\nhow to build performant VLMs from scratch. We present Molmo, a new family of\nVLMs that are state-of-the-art in their class of openness. Our key innovation\nis a novel, highly detailed image caption dataset collected entirely from human\nannotators using speech-based descriptions. To enable a wide array of user\ninteractions, we also introduce a diverse dataset mixture for fine-tuning that\nincludes in-the-wild Q&A and innovative 2D pointing data. The success of our\napproach relies on careful choices for the model architecture details, a\nwell-tuned training pipeline, and, most critically, the quality of our newly\ncollected datasets, all of which will be released. The best-in-class 72B model\nwithin the Molmo family not only outperforms others in the class of open weight\nand data models but also compares favorably against proprietary systems like\nGPT-4o, Claude 3.5, and Gemini 1.5 on both academic benchmarks and human\nevaluation.\n We will be releasing all of our model weights, captioning and fine-tuning\ndata, and source code in the near future. Select model weights, inference code,\nand demo are available at https://molmo.allenai.org.\n","authors":["Matt Deitke","Christopher Clark","Sangho Lee","Rohun Tripathi","Yue Yang","Jae Sung Park","Mohammadreza Salehi","Niklas Muennighoff","Kyle Lo","Luca Soldaini","Jiasen Lu","Taira Anderson","Erin Bransom","Kiana Ehsani","Huong Ngo","YenSung Chen","Ajay Patel","Mark Yatskar","Chris Callison-Burch","Andrew Head","Rose Hendrix","Favyen Bastani","Eli VanderBilt","Nathan Lambert","Yvonne Chou","Arnavi Chheda","Jenna Sparks","Sam Skjonsberg","Michael Schmitz","Aaron Sarnat","Byron Bischoff","Pete Walsh","Chris Newell","Piper Wolters","Tanmay Gupta","Kuo-Hao Zeng","Jon Borchardt","Dirk Groeneveld","Jen Dumas","Crystal Nam","Sophie Lebrecht","Caitlin Wittlif","Carissa Schoenick","Oscar Michel","Ranjay Krishna","Luca Weihs","Noah A. Smith","Hannaneh Hajishirzi","Ross Girshick","Ali Farhadi","Aniruddha Kembhavi"],"pdf_url":"https://arxiv.org/pdf/2409.17146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17141v1","updated":"2024-09-25T17:58:35Z","published":"2024-09-25T17:58:35Z","title":"FineZip : Pushing the Limits of Large Language Models for Practical\n Lossless Text Compression","summary":" While the language modeling objective has been shown to be deeply connected\nwith compression, it is surprising that modern LLMs are not employed in\npractical text compression systems. In this paper, we provide an in-depth\nanalysis of neural network and transformer-based compression techniques to\nanswer this question. We compare traditional text compression systems with\nneural network and LLM-based text compression methods. Although LLM-based\nsystems significantly outperform conventional compression methods, they are\nhighly impractical. Specifically, LLMZip, a recent text compression system\nusing Llama3-8B requires 9.5 days to compress just 10 MB of text, although with\nhuge improvements in compression ratios. To overcome this, we present FineZip -\na novel LLM-based text compression system that combines ideas of online\nmemorization and dynamic context to reduce the compression time immensely.\nFineZip can compress the above corpus in approximately 4 hours compared to 9.5\ndays, a 54 times improvement over LLMZip and comparable performance. FineZip\noutperforms traditional algorithmic compression methods with a large margin,\nimproving compression ratios by approximately 50\\%. With this work, we take the\nfirst step towards making lossless text compression with LLMs a reality. While\nFineZip presents a significant step in that direction, LLMs are still not a\nviable solution for large-scale text compression. We hope our work paves the\nway for future research and innovation to solve this problem.\n","authors":["Fazal Mittu","Yihuan Bu","Akshat Gupta","Ashok Devireddy","Alp Eren Ozdarendeli","Anant Singh","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2409.17141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16201v2","updated":"2024-09-25T17:58:21Z","published":"2023-11-27T07:19:26Z","title":"Pre-trained Language Models Do Not Help Auto-regressive Text-to-Image\n Generation","summary":" Recent advances in image tokenizers, such as VQ-VAE, have enabled\ntext-to-image generation using auto-regressive methods, similar to language\nmodeling. However, these methods have yet to leverage pre-trained language\nmodels, despite their adaptability to various downstream tasks. In this work,\nwe explore this gap by adapting a pre-trained language model for\nauto-regressive text-to-image generation, and find that pre-trained language\nmodels offer limited help. We provide a two-fold explanation by analyzing\ntokens from each modality. First, we demonstrate that image tokens possess\nsignificantly different semantics compared to text tokens, rendering\npre-trained language models no more effective in modeling them than randomly\ninitialized ones. Second, the text tokens in the image-text datasets are too\nsimple compared to normal language model pre-training data, which causes the\ncatastrophic degradation of language models' capability.\n","authors":["Yuhui Zhang","Brandon McKinzie","Zhe Gan","Vaishaal Shankar","Alexander Toshev"],"pdf_url":"https://arxiv.org/pdf/2311.16201v2.pdf","comment":"Published at EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2409.17130v1","updated":"2024-09-25T17:48:59Z","published":"2024-09-25T17:48:59Z","title":"Assessing the Level of Toxicity Against Distinct Groups in Bangla Social\n Media Comments: A Comprehensive Investigation","summary":" Social media platforms have a vital role in the modern world, serving as\nconduits for communication, the exchange of ideas, and the establishment of\nnetworks. However, the misuse of these platforms through toxic comments, which\ncan range from offensive remarks to hate speech, is a concerning issue. This\nstudy focuses on identifying toxic comments in the Bengali language targeting\nthree specific groups: transgender people, indigenous people, and migrant\npeople, from multiple social media sources. The study delves into the intricate\nprocess of identifying and categorizing toxic language while considering the\nvarying degrees of toxicity: high, medium, and low. The methodology involves\ncreating a dataset, manual annotation, and employing pre-trained transformer\nmodels like Bangla-BERT, bangla-bert-base, distil-BERT, and\nBert-base-multilingual-cased for classification. Diverse assessment metrics\nsuch as accuracy, recall, precision, and F1-score are employed to evaluate the\nmodel's effectiveness. The experimental findings reveal that Bangla-BERT\nsurpasses alternative models, achieving an F1-score of 0.8903. This research\nexposes the complexity of toxicity in Bangla social media dialogues, revealing\nits differing impacts on diverse demographic groups.\n","authors":["Mukaffi Bin Moin","Pronay Debnath","Usafa Akther Rifa","Rijeet Bin Anis"],"pdf_url":"https://arxiv.org/pdf/2409.17130v1.pdf","comment":"Accepted for publication in \"18th International Conference on\n Information Technology and Applications (ICITA 2024)\""},{"id":"http://arxiv.org/abs/2409.17120v1","updated":"2024-09-25T17:31:45Z","published":"2024-09-25T17:31:45Z","title":"Deep Learning and Machine Learning, Advancing Big Data Analytics and\n Management: Handy Appetizer","summary":" This book explores the role of Artificial Intelligence (AI), Machine Learning\n(ML), and Deep Learning (DL) in driving the progress of big data analytics and\nmanagement. The book focuses on simplifying the complex mathematical concepts\nbehind deep learning, offering intuitive visualizations and practical case\nstudies to help readers understand how neural networks and technologies like\nConvolutional Neural Networks (CNNs) work. It introduces several classic models\nand technologies such as Transformers, GPT, ResNet, BERT, and YOLO,\nhighlighting their applications in fields like natural language processing,\nimage recognition, and autonomous driving. The book also emphasizes the\nimportance of pre-trained models and how they can enhance model performance and\naccuracy, with instructions on how to apply these models in various real-world\nscenarios. Additionally, it provides an overview of key big data management\ntechnologies like SQL and NoSQL databases, as well as distributed computing\nframeworks such as Apache Hadoop and Spark, explaining their importance in\nmanaging and processing vast amounts of data. Ultimately, the book underscores\nthe value of mastering deep learning and big data management skills as critical\ntools for the future workforce, making it an essential resource for both\nbeginners and experienced professionals.\n","authors":["Benji Peng","Xuanhe Pan","Yizhu Wen","Ziqian Bi","Keyu Chen","Ming Li","Ming Liu","Qian Niu","Junyu Liu","Jinlang Wang","Sen Zhang","Jiawei Xu","Pohsun Feng"],"pdf_url":"https://arxiv.org/pdf/2409.17120v1.pdf","comment":"This book contains 93 pages and 60 figures"},{"id":"http://arxiv.org/abs/2409.17115v1","updated":"2024-09-25T17:28:13Z","published":"2024-09-25T17:28:13Z","title":"Programming Every Example: Lifting Pre-training Data Quality like\n Experts at Scale","summary":" Large language model pre-training has traditionally relied on human experts\nto craft heuristics for improving the corpora quality, resulting in numerous\nrules developed to date. However, these rules lack the flexibility to address\nthe unique characteristics of individual example effectively. Meanwhile,\napplying tailored rules to every example is impractical for human experts. In\nthis paper, we demonstrate that even small language models, with as few as 0.3B\nparameters, can exhibit substantial data refining capabilities comparable to\nthose of human experts. We introduce Programming Every Example (ProX), a novel\nframework that treats data refinement as a programming task, enabling models to\nrefine corpora by generating and executing fine-grained operations, such as\nstring normalization, for each individual example at scale. Experimental\nresults show that models pre-trained on ProX-curated data outperform either\noriginal data or data filtered by other selection methods by more than 2%\nacross various downstream benchmarks. Its effectiveness spans various model\nsizes and pre-training corpora, including C4, RedPajama-V2, and FineWeb.\nFurthermore, ProX exhibits significant potential in domain-specific continual\npre-training: without domain specific design, models trained on OpenWebMath\nrefined by ProX outperform human-crafted rule-based methods, improving average\naccuracy by 7.6% over Mistral-7B, with 14.6% for Llama-2-7B and 20.3% for\nCodeLlama-7B, all within 10B tokens to be comparable to models like Llemma-7B\ntrained on 200B tokens. Further analysis highlights that ProX significantly\nsaves training FLOPs, offering a promising path for efficient LLM\npre-training.We are open-sourcing ProX with >100B corpus, models, and sharing\nall training and implementation details for reproducible research and future\ninnovation. Code: https://github.com/GAIR-NLP/ProX\n","authors":["Fan Zhou","Zengzhi Wang","Qian Liu","Junlong Li","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17115v1.pdf","comment":"45 pages, 13 figures, 34 tables"},{"id":"http://arxiv.org/abs/2309.17012v3","updated":"2024-09-25T16:57:20Z","published":"2023-09-29T06:53:10Z","title":"Benchmarking Cognitive Biases in Large Language Models as Evaluators","summary":" Large Language Models are cognitively biased judges. Large Language Models\n(LLMs) have recently been shown to be effective as automatic evaluators with\nsimple prompting and in-context learning. In this work, we assemble 15 LLMs of\nfour different size ranges and evaluate their output responses by preference\nranking from the other LLMs as evaluators, such as System Star is better than\nSystem Square. We then evaluate the quality of ranking outputs introducing the\nCognitive Bias Benchmark for LLMs as Evaluators (CoBBLEr), a benchmark to\nmeasure six different cognitive biases in LLM evaluation outputs, such as the\nEgocentric bias where a model prefers to rank its own outputs highly in\nevaluation. We find that LLMs are biased text quality evaluators, exhibiting\nstrong indications on our bias benchmark (average of 40% of comparisons across\nall models) within each of their evaluations that question their robustness as\nevaluators. Furthermore, we examine the correlation between human and machine\npreferences and calculate the average Rank-Biased Overlap (RBO) score to be\n49.6%, indicating that machine preferences are misaligned with humans.\nAccording to our findings, LLMs may still be unable to be utilized for\nautomatic annotation aligned with human preferences. Our project page is at:\nhttps://minnesotanlp.github.io/cobbler.\n","authors":["Ryan Koo","Minhwa Lee","Vipul Raheja","Jong Inn Park","Zae Myung Kim","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2309.17012v3.pdf","comment":"Publishsed at ACL 2024. 29 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2409.17080v1","updated":"2024-09-25T16:45:02Z","published":"2024-09-25T16:45:02Z","title":"Can Vision Language Models Learn from Visual Demonstrations of Ambiguous\n Spatial Reasoning?","summary":" Large vision-language models (VLMs) have become state-of-the-art for many\ncomputer vision tasks, with in-context learning (ICL) as a popular adaptation\nstrategy for new ones. But can VLMs learn novel concepts purely from visual\ndemonstrations, or are they limited to adapting to the output format of ICL\nexamples? We propose a new benchmark we call Spatial Visual Ambiguity Tasks\n(SVAT) that challenges state-of-the-art VLMs to learn new visuospatial tasks\nin-context. We find that VLMs fail to do this zero-shot, and sometimes continue\nto fail after finetuning. However, adding simpler data to the training by\ncurriculum learning leads to improved ICL performance.\n","authors":["Bowen Zhao","Leo Parker Dirac","Paulina Varshavskaya"],"pdf_url":"https://arxiv.org/pdf/2409.17080v1.pdf","comment":"13 pages, 4 figures. Code released at\n https://github.com/groundlight/vlm-visual-demonstrations"},{"id":"http://arxiv.org/abs/2406.14829v2","updated":"2024-09-25T16:27:50Z","published":"2024-06-21T02:18:03Z","title":"Is This a Bad Table? A Closer Look at the Evaluation of Table Generation\n from Text","summary":" Understanding whether a generated table is of good quality is important to be\nable to use it in creating or editing documents using automatic methods. In\nthis work, we underline that existing measures for table quality evaluation\nfail to capture the overall semantics of the tables, and sometimes unfairly\npenalize good tables and reward bad ones. We propose TabEval, a novel table\nevaluation strategy that captures table semantics by first breaking down a\ntable into a list of natural language atomic statements and then compares them\nwith ground truth statements using entailment-based measures. To validate our\napproach, we curate a dataset comprising of text descriptions for 1,250 diverse\nWikipedia tables, covering a range of topics and structures, in contrast to the\nlimited scope of existing datasets. We compare TabEval with existing metrics\nusing unsupervised and supervised text-to-table generation methods,\ndemonstrating its stronger correlation with human judgments of table quality\nacross four datasets.\n","authors":["Pritika Ramu","Aparna Garimella","Sambaran Bandyopadhyay"],"pdf_url":"https://arxiv.org/pdf/2406.14829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17054v1","updated":"2024-09-25T16:13:42Z","published":"2024-09-25T16:13:42Z","title":"Using LLM for Real-Time Transcription and Summarization of\n Doctor-Patient Interactions into ePuskesmas in Indonesia","summary":" One of the key issues contributing to inefficiency in Puskesmas is the\ntime-consuming nature of doctor-patient interactions. Doctors need to conduct\nthorough consultations, which include diagnosing the patient's condition,\nproviding treatment advice, and transcribing detailed notes into medical\nrecords. In regions with diverse linguistic backgrounds, doctors often have to\nask clarifying questions, further prolonging the process. While diagnosing is\nessential, transcription and summarization can often be automated using AI to\nimprove time efficiency and help doctors enhance care quality and enable early\ndiagnosis and intervention. This paper proposes a solution using a localized\nlarge language model (LLM) to transcribe, translate, and summarize\ndoctor-patient conversations. We utilize the Whisper model for transcription\nand GPT-3 to summarize them into the ePuskemas medical records format. This\nsystem is implemented as an add-on to an existing web browser extension,\nallowing doctors to fill out patient forms while talking. By leveraging this\nsolution for real-time transcription, translation, and summarization, doctors\ncan improve the turnaround time for patient care while enhancing the quality of\nrecords, which become more detailed and insightful for future visits. This\ninnovation addresses challenges like overcrowded facilities and the\nadministrative burden on healthcare providers in Indonesia. We believe this\nsolution will help doctors save time, provide better care, and produce more\naccurate medical records, representing a significant step toward modernizing\nhealthcare and ensuring patients receive timely, high-quality care, even in\nresource-constrained settings.\n","authors":["Azmul Asmar Irfan","Nur Ahmad Khatim","Mansur M. Arief"],"pdf_url":"https://arxiv.org/pdf/2409.17054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17046v1","updated":"2024-09-25T15:59:58Z","published":"2024-09-25T15:59:58Z","title":"Detecting Temporal Ambiguity in Questions","summary":" Detecting and answering ambiguous questions has been a challenging task in\nopen-domain question answering. Ambiguous questions have different answers\ndepending on their interpretation and can take diverse forms. Temporally\nambiguous questions are one of the most common types of such questions. In this\npaper, we introduce TEMPAMBIQA, a manually annotated temporally ambiguous QA\ndataset consisting of 8,162 open-domain questions derived from existing\ndatasets. Our annotations focus on capturing temporal ambiguity to study the\ntask of detecting temporally ambiguous questions. We propose a novel approach\nby using diverse search strategies based on disambiguated versions of the\nquestions. We also introduce and test non-search, competitive baselines for\ndetecting temporal ambiguity using zero-shot and few-shot approaches.\n","authors":["Bhawna Piryani","Abdelrahman Abdallah","Jamshid Mozafari","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2409.17046v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17044v1","updated":"2024-09-25T15:54:29Z","published":"2024-09-25T15:54:29Z","title":"How to Connect Speech Foundation Models and Large Language Models? What\n Matters and What Does Not","summary":" The remarkable performance achieved by Large Language Models (LLM) has driven\nresearch efforts to leverage them for a wide range of tasks and input\nmodalities. In speech-to-text (S2T) tasks, the emerging solution consists of\nprojecting the output of the encoder of a Speech Foundational Model (SFM) into\nthe LLM embedding space through an adapter module. However, no work has yet\ninvestigated how much the downstream-task performance depends on each component\n(SFM, adapter, LLM) nor whether the best design of the adapter depends on the\nchosen SFM and LLM. To fill this gap, we evaluate the combination of 5 adapter\nmodules, 2 LLMs (Mistral and Llama), and 2 SFMs (Whisper and SeamlessM4T) on\ntwo widespread S2T tasks, namely Automatic Speech Recognition and Speech\nTranslation. Our results demonstrate that the SFM plays a pivotal role in\ndownstream performance, while the adapter choice has moderate impact and\ndepends on the SFM and LLM.\n","authors":["Francesco Verdini","Pierfrancesco Melucci","Stefano Perna","Francesco Cariaggi","Marco Gaido","Sara Papi","Szymon Mazurek","Marek Kasztelnik","Luisa Bentivogli","Sébastien Bratières","Paolo Merialdo","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2409.17044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14507v3","updated":"2024-09-25T15:50:51Z","published":"2024-09-22T16:11:02Z","title":"A is for Absorption: Studying Feature Splitting and Absorption in Sparse\n Autoencoders","summary":" Sparse Autoencoders (SAEs) have emerged as a promising approach to decompose\nthe activations of Large Language Models (LLMs) into human-interpretable\nlatents. In this paper, we pose two questions. First, to what extent do SAEs\nextract monosemantic and interpretable latents? Second, to what extent does\nvarying the sparsity or the size of the SAE affect monosemanticity /\ninterpretability? By investigating these questions in the context of a simple\nfirst-letter identification task where we have complete access to ground truth\nlabels for all tokens in the vocabulary, we are able to provide more detail\nthan prior investigations. Critically, we identify a problematic form of\nfeature-splitting we call feature absorption where seemingly monosemantic\nlatents fail to fire in cases where they clearly should. Our investigation\nsuggests that varying SAE size or sparsity is insufficient to solve this issue,\nand that there are deeper conceptual issues in need of resolution.\n","authors":["David Chanin","James Wilken-Smith","Tomáš Dulka","Hardik Bhatnagar","Joseph Bloom"],"pdf_url":"https://arxiv.org/pdf/2409.14507v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17027v1","updated":"2024-09-25T15:30:24Z","published":"2024-09-25T15:30:24Z","title":"Counterfactual Token Generation in Large Language Models","summary":" \"Sure, I am happy to generate a story for you: Captain Lyra stood at the helm\nof her trusty ship, the Maelstrom's Fury, gazing out at the endless sea. [...]\nLyra's eyes welled up with tears as she realized the bitter truth - she had\nsacrificed everything for fleeting riches, and lost the love of her crew, her\nfamily, and herself.\" Although this story, generated by a large language model,\nis captivating, one may wonder -- how would the story have unfolded if the\nmodel had chosen \"Captain Maeve\" as the protagonist instead? We cannot know.\nState-of-the-art large language models are stateless -- they maintain no\ninternal memory or state. Given a prompt, they generate a sequence of tokens as\nan output using an autoregressive process. As a consequence, they cannot reason\nabout counterfactual alternatives to tokens they have generated in the past. In\nthis work, our goal is to enhance them with this functionality. To this end, we\ndevelop a causal model of token generation that builds upon the Gumbel-Max\nstructural causal model. Our model allows any large language model to perform\ncounterfactual token generation at almost no cost in comparison with vanilla\ntoken generation, it is embarrassingly simple to implement, and it does not\nrequire any fine-tuning nor prompt engineering. We implement our model on Llama\n3 8B-instruct and conduct both qualitative and quantitative analyses of\ncounterfactually generated text. We conclude with a demonstrative application\nof counterfactual token generation for bias detection, unveiling interesting\ninsights about the model of the world constructed by large language models.\n","authors":["Ivi Chatzi","Nina Corvelo Benz","Eleni Straitouri","Stratis Tsirtsis","Manuel Gomez-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2409.17027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17011v1","updated":"2024-09-25T15:15:57Z","published":"2024-09-25T15:15:57Z","title":"LLM-CARD: Towards a Description and Landscape of Large Language Models","summary":" With the rapid growth of the Natural Language Processing (NLP) field, a vast\nvariety of Large Language Models (LLMs) continue to emerge for diverse NLP\ntasks. As an increasing number of papers are presented, researchers and\ndevelopers face the challenge of information overload. Thus, it is particularly\nimportant to develop a system that can automatically extract and organise key\ninformation about LLMs from academic papers (\\textbf{LLM model card}). This\nwork is to develop such a pioneer system by using Named Entity Recognition\n(\\textbf{NER}) and Relation Extraction (\\textbf{RE}) methods that automatically\nextract key information about large language models from the papers, helping\nresearchers to efficiently access information about LLMs. These features\ninclude model \\textit{licence}, model \\textit{name}, and model\n\\textit{application}. With these features, we can form a model card for each\npaper. \\textbf{Data-contribution} wise, 106 academic papers were processed by\ndefining three dictionaries - LLMs name, licence, and application. 11,051\nsentences were extracted through dictionary lookup, and the dataset was\nconstructed through manual review of the final selection of 129 sentences that\nhave a link between the name and the licence, and 106 sentences that have a\nlink between the model name and the application.\n","authors":["Shengwei Tian","Lifeng Han","Erick Mendez Guzman","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2409.17011v1.pdf","comment":"ongoing work, 16 pages"},{"id":"http://arxiv.org/abs/2409.17005v1","updated":"2024-09-25T15:08:08Z","published":"2024-09-25T15:08:08Z","title":"Models Can and Should Embrace the Communicative Nature of\n Human-Generated Math","summary":" Math is constructed by people for people: just as natural language corpora\nreflect not just propositions but the communicative goals of language users,\nthe math data that models are trained on reflects not just idealized\nmathematical entities but rich communicative intentions. While there are\nimportant advantages to treating math in a purely symbolic manner, we here\nhypothesize that there are benefits to treating math as situated linguistic\ncommunication and that language models are well suited for this goal, in ways\nthat are not fully appreciated. We illustrate these points with two case\nstudies. First, we ran an experiment in which we found that language models\ninterpret the equals sign in a humanlike way -- generating systematically\ndifferent word problems for the same underlying equation arranged in different\nways. Second, we found that language models prefer proofs to be ordered in\nnaturalistic ways, even though other orders would be logically equivalent. We\nadvocate for AI systems that learn from and represent the communicative\nintentions latent in human-generated math.\n","authors":["Sasha Boguraev","Ben Lipkin","Leonie Weissweiler","Kyle Mahowald"],"pdf_url":"https://arxiv.org/pdf/2409.17005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03589v3","updated":"2024-09-25T14:59:24Z","published":"2024-06-05T19:14:21Z","title":"Ranking Manipulation for Conversational Search Engines","summary":" Major search engine providers are rapidly incorporating Large Language Model\n(LLM)-generated content in response to user queries. These conversational\nsearch engines operate by loading retrieved website text into the LLM context\nfor summarization and interpretation. Recent research demonstrates that LLMs\nare highly vulnerable to jailbreaking and prompt injection attacks, which\ndisrupt the safety and quality goals of LLMs using adversarial strings. This\nwork investigates the impact of prompt injections on the ranking order of\nsources referenced by conversational search engines. To this end, we introduce\na focused dataset of real-world consumer product websites and formalize\nconversational search ranking as an adversarial problem. Experimentally, we\nanalyze conversational search rankings in the absence of adversarial injections\nand show that different LLMs vary significantly in prioritizing product name,\ndocument content, and context position. We then present a tree-of-attacks-based\njailbreaking technique which reliably promotes low-ranked products.\nImportantly, these attacks transfer effectively to state-of-the-art\nconversational search engines such as perplexity$.$ai. Given the strong\nfinancial incentive for website owners to boost their search ranking, we argue\nthat our problem formulation is of critical importance for future robustness\nwork.\n","authors":["Samuel Pfrommer","Yatong Bai","Tanmay Gautam","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2406.03589v3.pdf","comment":"2024 Conference on Empirical Methods in Natural Language Processing\n (Main)"},{"id":"http://arxiv.org/abs/2409.16984v1","updated":"2024-09-25T14:45:52Z","published":"2024-09-25T14:45:52Z","title":"AXCEL: Automated eXplainable Consistency Evaluation using LLMs","summary":" Large Language Models (LLMs) are widely used in both industry and academia\nfor various tasks, yet evaluating the consistency of generated text responses\ncontinues to be a challenge. Traditional metrics like ROUGE and BLEU show a\nweak correlation with human judgment. More sophisticated metrics using Natural\nLanguage Inference (NLI) have shown improved correlations but are complex to\nimplement, require domain-specific training due to poor cross-domain\ngeneralization, and lack explainability. More recently, prompt-based metrics\nusing LLMs as evaluators have emerged; while they are easier to implement, they\nstill lack explainability and depend on task-specific prompts, which limits\ntheir generalizability. This work introduces Automated eXplainable Consistency\nEvaluation using LLMs (AXCEL), a prompt-based consistency metric which offers\nexplanations for the consistency scores by providing detailed reasoning and\npinpointing inconsistent text spans. AXCEL is also a generalizable metric which\ncan be adopted to multiple tasks without changing the prompt. AXCEL outperforms\nboth non-prompt and prompt-based state-of-the-art (SOTA) metrics in detecting\ninconsistencies across summarization by 8.7%, free text generation by 6.2%, and\ndata-to-text conversion tasks by 29.4%. We also evaluate the influence of\nunderlying LLMs on prompt based metric performance and recalibrate the SOTA\nprompt-based metrics with the latest LLMs for fair comparison. Further, we show\nthat AXCEL demonstrates strong performance using open source LLMs.\n","authors":["P Aditya Sreekar","Sahil Verma","Suransh Chopra","Sarik Ghazarian","Abhishek Persad","Narayanan Sadagopan"],"pdf_url":"https://arxiv.org/pdf/2409.16984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12997v5","updated":"2024-09-25T14:37:39Z","published":"2024-02-20T13:25:16Z","title":"Towards Trustworthy Reranking: A Simple yet Effective Abstention\n Mechanism","summary":" Neural Information Retrieval (NIR) has significantly improved upon\nheuristic-based Information Retrieval (IR) systems. Yet, failures remain\nfrequent, the models used often being unable to retrieve documents relevant to\nthe user's query. We address this challenge by proposing a lightweight\nabstention mechanism tailored for real-world constraints, with particular\nemphasis placed on the reranking phase. We introduce a protocol for evaluating\nabstention strategies in black-box scenarios (typically encountered when\nrelying on API services), demonstrating their efficacy, and propose a simple\nyet effective data-driven mechanism. We provide open-source code for experiment\nreplication and abstention implementation, fostering wider adoption and\napplication in diverse contexts.\n","authors":["Hippolyte Gisserot-Boukhlef","Manuel Faysse","Emmanuel Malherbe","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2402.12997v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16974v1","updated":"2024-09-25T14:36:30Z","published":"2024-09-25T14:36:30Z","title":"Decoding Large-Language Models: A Systematic Overview of Socio-Technical\n Impacts, Constraints, and Emerging Questions","summary":" There have been rapid advancements in the capabilities of large language\nmodels (LLMs) in recent years, greatly revolutionizing the field of natural\nlanguage processing (NLP) and artificial intelligence (AI) to understand and\ninteract with human language. Therefore, in this work, we conduct a systematic\ninvestigation of the literature to identify the prominent themes and directions\nof LLM developments, impacts, and limitations. Our findings illustrate the\naims, methodologies, limitations, and future directions of LLM research. It\nincludes responsible development considerations, algorithmic improvements,\nethical challenges, and societal implications of LLM development. Overall, this\npaper provides a rigorous and comprehensive overview of current research in LLM\nand identifies potential directions for future development. The article\nhighlights the application areas that could have a positive impact on society\nalong with the ethical considerations.\n","authors":["Zeyneb N. Kaya","Souvick Ghosh"],"pdf_url":"https://arxiv.org/pdf/2409.16974v1.pdf","comment":"28 pages, 5 figures, preprint submitted to journal"},{"id":"http://arxiv.org/abs/2409.16973v1","updated":"2024-09-25T14:35:06Z","published":"2024-09-25T14:35:06Z","title":"Adaptive Self-Supervised Learning Strategies for Dynamic On-Device LLM\n Personalization","summary":" Large language models (LLMs) have revolutionized how we interact with\ntechnology, but their personalization to individual user preferences remains a\nsignificant challenge, particularly in on-device applications. Traditional\nmethods often depend heavily on labeled datasets and can be resource-intensive.\nTo address these issues, we present Adaptive Self-Supervised Learning\nStrategies (ASLS), which utilizes self-supervised learning techniques to\npersonalize LLMs dynamically. The framework comprises a user profiling layer\nfor collecting interaction data and a neural adaptation layer for real-time\nmodel fine-tuning. This innovative approach enables continuous learning from\nuser feedback, allowing the model to generate responses that align closely with\nuser-specific contexts. The adaptive mechanisms of ASLS minimize computational\ndemands and enhance personalization efficiency. Experimental results across\nvarious user scenarios illustrate the superior performance of ASLS in boosting\nuser engagement and satisfaction, highlighting its potential to redefine LLMs\nas highly responsive and context-aware systems on-device.\n","authors":["Rafael Mendoza","Isabella Cruz","Richard Liu","Aarav Deshmukh","David Williams","Jesscia Peng","Rohan Iyer"],"pdf_url":"https://arxiv.org/pdf/2409.16973v1.pdf","comment":"First ASLS"},{"id":"http://arxiv.org/abs/2409.16954v1","updated":"2024-09-25T14:09:09Z","published":"2024-09-25T14:09:09Z","title":"Weighted Cross-entropy for Low-Resource Languages in Multilingual Speech\n Recognition","summary":" This paper addresses the challenge of integrating low-resource languages into\nmultilingual automatic speech recognition (ASR) systems. We introduce a novel\napplication of weighted cross-entropy, typically used for unbalanced datasets,\nto facilitate the integration of low-resource languages into pre-trained\nmultilingual ASR models within the context of continual multilingual learning.\nWe fine-tune the Whisper multilingual ASR model on five high-resource languages\nand one low-resource language, employing language-weighted dynamic\ncross-entropy and data augmentation. The results show a remarkable 6.69% word\nerror rate (WER) reduction for the low-resource language compared to the\nfine-tuned model without applying our approach, and a 48.86% WER reduction\ncompared to the original Whisper model. In addition, our approach yields an\naverage WER reduction of 3.29% across the six languages, showing no degradation\nfor the high-resource languages.\n","authors":["Andrés Piñeiro-Martín","Carmen García-Mateo","Laura Docío-Fernández","María del Carmen López-Pérez","Georg Rehm"],"pdf_url":"https://arxiv.org/pdf/2409.16954v1.pdf","comment":"5 pages, 1 figure. Presented at Interspeech 2024"},{"id":"http://arxiv.org/abs/2305.12620v2","updated":"2024-09-25T14:06:31Z","published":"2023-05-22T01:02:45Z","title":"Keeping Up with the Language Models: Systematic Benchmark Extension for\n Bias Auditing","summary":" Bias auditing of language models (LMs) has received considerable attention as\nLMs are becoming widespread. As such, several benchmarks for bias auditing have\nbeen proposed. At the same time, the rapid evolution of LMs can make these\nbenchmarks irrelevant in no time. Bias auditing is further complicated by LM\nbrittleness: when a presumably biased outcome is observed, is it due to model\nbias or model brittleness? We propose enlisting the models themselves to help\nconstruct bias auditing datasets that remain challenging, and introduce bias\nmeasures that distinguish between different types of model errors. First, we\nextend an existing bias benchmark for NLI (BBNLI) using a combination of\nLM-generated lexical variations, adversarial filtering, and human validation.\nWe demonstrate that the newly created dataset BBNLI-next is more challenging\nthan BBNLI: on average, BBNLI-next reduces the accuracy of state-of-the-art NLI\nmodels from 95.3%, as observed by BBNLI, to a strikingly low 57.5%. Second, we\nemploy BBNLI-next to showcase the interplay between robustness and bias: we\npoint out shortcomings in current bias scores and propose bias measures that\ntake into account both bias and model brittleness. Third, despite the fact that\nBBNLI-next was designed with non-generative models in mind, we show that the\nnew dataset is also able to uncover bias in state-of-the-art open-source\ngenerative LMs.\n Note: All datasets included in this work are in English and they address\nUS-centered social biases. In the spirit of efficient NLP research, no model\ntraining or fine-tuning was performed to conduct this research.\n Warning: This paper contains offensive text examples.\n","authors":["Ioana Baldini","Chhavi Yadav","Manish Nagireddy","Payel Das","Kush R. Varshney"],"pdf_url":"https://arxiv.org/pdf/2305.12620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02135v2","updated":"2024-09-25T14:00:18Z","published":"2024-06-04T09:24:04Z","title":"Robust Interaction-Based Relevance Modeling for Online e-Commerce Search","summary":" Semantic relevance calculation is crucial for e-commerce search engines, as\nit ensures that the items selected closely align with customer intent.\nInadequate attention to this aspect can detrimentally affect user experience\nand engagement. Traditional text-matching techniques are prevalent but often\nfail to capture the nuances of search intent accurately, so neural networks now\nhave become a preferred solution to processing such complex text matching.\nExisting methods predominantly employ representation-based architectures, which\nstrike a balance between high traffic capacity and low latency. However, they\nexhibit significant shortcomings in generalization and robustness when compared\nto interaction-based architectures. In this work, we introduce a robust\ninteraction-based modeling paradigm to address these shortcomings. It\nencompasses 1) a dynamic length representation scheme for expedited inference,\n2) a professional terms recognition method to identify subjects and core\nattributes from complex sentence structures, and 3) a contrastive adversarial\ntraining protocol to bolster the model's robustness and matching capabilities.\nExtensive offline evaluations demonstrate the superior robustness and\neffectiveness of our approach, and online A/B testing confirms its ability to\nimprove relevance in the same exposure position, resulting in more clicks and\nconversions. To the best of our knowledge, this method is the first\ninteraction-based approach for large e-commerce search relevance calculation.\nNotably, we have deployed it for the entire search traffic on alibaba.com, the\nlargest B2B e-commerce platform in the world.\n","authors":["Ben Chen","Huangyu Dai","Xiang Ma","Wen Jiang","Wei Ning"],"pdf_url":"https://arxiv.org/pdf/2406.02135v2.pdf","comment":"Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7\n tables"},{"id":"http://arxiv.org/abs/2406.19280v3","updated":"2024-09-25T13:36:27Z","published":"2024-06-27T15:50:41Z","title":"HuatuoGPT-Vision, Towards Injecting Medical Visual Knowledge into\n Multimodal LLMs at Scale","summary":" The rapid development of multimodal large language models (MLLMs), such as\nGPT-4V, has led to significant advancements. However, these models still face\nchallenges in medical multimodal capabilities due to limitations in the\nquantity and quality of medical vision-text data, stemming from data privacy\nconcerns and high annotation costs. While pioneering approaches utilize\nPubMed's large-scale, de-identified medical image-text pairs to address these\nlimitations, they still fall short due to inherent data noise. To tackle this,\nwe refined medical image-text pairs from PubMed and employed MLLMs (GPT-4V) in\nan 'unblinded' capacity to denoise and reformat the data, resulting in the\ncreation of the PubMedVision dataset with 1.3 million medical VQA samples. Our\nvalidation demonstrates that: (1) PubMedVision can significantly enhance the\nmedical multimodal capabilities of current MLLMs, showing significant\nimprovement in benchmarks including the MMMU Health & Medicine track; (2)\nmanual checks by medical experts and empirical results validate the superior\ndata quality of our dataset compared to other data construction methods. Using\nPubMedVision, we train a 34B medical MLLM HuatuoGPT-Vision, which shows\nsuperior performance in medical multimodal scenarios among open-source MLLMs.\n","authors":["Junying Chen","Chi Gui","Ruyi Ouyang","Anningzhe Gao","Shunian Chen","Guiming Hardy Chen","Xidong Wang","Ruifei Zhang","Zhenyang Cai","Ke Ji","Guangjun Yu","Xiang Wan","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2406.19280v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16920v1","updated":"2024-09-25T13:27:17Z","published":"2024-09-25T13:27:17Z","title":"Cross-lingual Speech Emotion Recognition: Humans vs. Self-Supervised\n Models","summary":" Utilizing Self-Supervised Learning (SSL) models for Speech Emotion\nRecognition (SER) has proven effective, yet limited research has explored\ncross-lingual scenarios. This study presents a comparative analysis between\nhuman performance and SSL models, beginning with a layer-wise analysis and an\nexploration of parameter-efficient fine-tuning strategies in monolingual,\ncross-lingual, and transfer learning contexts. We further compare the SER\nability of models and humans at both utterance- and segment-levels.\nAdditionally, we investigate the impact of dialect on cross-lingual SER through\nhuman evaluation. Our findings reveal that models, with appropriate knowledge\ntransfer, can adapt to the target language and achieve performance comparable\nto native speakers. We also demonstrate the significant effect of dialect on\nSER for individuals without prior linguistic and paralinguistic background.\nMoreover, both humans and models exhibit distinct behaviors across different\nemotions. These results offer new insights into the cross-lingual SER\ncapabilities of SSL models, underscoring both their similarities to and\ndifferences from human emotion perception.\n","authors":["Zhichen Han","Tianqi Geng","Hui Feng","Jiahong Yuan","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.16920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16914v1","updated":"2024-09-25T13:18:57Z","published":"2024-09-25T13:18:57Z","title":"Zero-Shot Detection of LLM-Generated Text using Token Cohesiveness","summary":" The increasing capability and widespread usage of large language models\n(LLMs) highlight the desirability of automatic detection of LLM-generated text.\nZero-shot detectors, due to their training-free nature, have received\nconsiderable attention and notable success. In this paper, we identify a new\nfeature, token cohesiveness, that is useful for zero-shot detection, and we\ndemonstrate that LLM-generated text tends to exhibit higher token cohesiveness\nthan human-written text. Based on this observation, we devise TOCSIN, a generic\ndual-channel detection paradigm that uses token cohesiveness as a plug-and-play\nmodule to improve existing zero-shot detectors. To calculate token\ncohesiveness, TOCSIN only requires a few rounds of random token deletion and\nsemantic difference measurement, making it particularly suitable for a\npractical black-box setting where the source model used for generation is not\naccessible. Extensive experiments with four state-of-the-art base detectors on\nvarious datasets, source models, and evaluation settings demonstrate the\neffectiveness and generality of the proposed approach. Code available at:\n\\url{https://github.com/Shixuan-Ma/TOCSIN}.\n","authors":["Shixuan Ma","Quan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.16914v1.pdf","comment":"To appear at the main conference of EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.16911v1","updated":"2024-09-25T13:15:50Z","published":"2024-09-25T13:15:50Z","title":"Pruning Multilingual Large Language Models for Multilingual Inference","summary":" Multilingual large language models (MLLMs), trained on multilingual balanced\ndata, demonstrate better zero-shot learning performance in non-English\nlanguages compared to large language models trained on English-dominant data.\nHowever, the disparity in performance between English and non-English languages\nremains a challenge yet to be fully addressed. A distinctive characteristic of\nMLLMs is their high-quality translation capabilities, indicating an acquired\nproficiency in aligning between languages. This study explores how to enhance\nthe zero-shot performance of MLLMs in non-English languages by leveraging their\nalignment capability between English and non-English languages. To achieve\nthis, we first analyze the behavior of MLLMs when performing translation and\nreveal that there are large magnitude features that play a critical role in the\ntranslation process. Inspired by these findings, we retain the weights\nassociated with operations involving the large magnitude features and prune\nother weights to force MLLMs to rely on these features for tasks beyond\ntranslation. We empirically demonstrate that this pruning strategy can enhance\nthe MLLMs' performance in non-English language.\n","authors":["Hwichan Kim","Jun Suzuki","Tosho Hirasawa","Mamoru Komachi"],"pdf_url":"https://arxiv.org/pdf/2409.16911v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.16909v1","updated":"2024-09-25T13:13:21Z","published":"2024-09-25T13:13:21Z","title":"Enhancing Temporal Sensitivity and Reasoning for Time-Sensitive Question\n Answering","summary":" Time-Sensitive Question Answering (TSQA) demands the effective utilization of\nspecific temporal contexts, encompassing multiple time-evolving facts, to\naddress time-sensitive questions. This necessitates not only the parsing of\ntemporal information within questions but also the identification and\nunderstanding of time-evolving facts to generate accurate answers. However,\ncurrent large language models still have limited sensitivity to temporal\ninformation and their inadequate temporal reasoning capabilities.In this paper,\nwe propose a novel framework that enhances temporal awareness and reasoning\nthrough Temporal Information-Aware Embedding and Granular Contrastive\nReinforcement Learning. Experimental results on four TSQA datasets demonstrate\nthat our framework significantly outperforms existing LLMs in TSQA tasks,\nmarking a step forward in bridging the performance gap between machine and\nhuman temporal understanding and reasoning.\n","authors":["Wanqi Yang","Yanda Li","Meng Fang","Ling Chen"],"pdf_url":"https://arxiv.org/pdf/2409.16909v1.pdf","comment":"Accepted by EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.16900v1","updated":"2024-09-25T13:09:23Z","published":"2024-09-25T13:09:23Z","title":"A Roadmap for Embodied and Social Grounding in LLMs","summary":" The fusion of Large Language Models (LLMs) and robotic systems has led to a\ntransformative paradigm in the robotic field, offering unparalleled\ncapabilities not only in the communication domain but also in skills like\nmultimodal input handling, high-level reasoning, and plan generation. The\ngrounding of LLMs knowledge into the empirical world has been considered a\ncrucial pathway to exploit the efficiency of LLMs in robotics. Nevertheless,\nconnecting LLMs' representations to the external world with multimodal\napproaches or with robots' bodies is not enough to let them understand the\nmeaning of the language they are manipulating. Taking inspiration from humans,\nthis work draws attention to three necessary elements for an agent to grasp and\nexperience the world. The roadmap for LLMs grounding is envisaged in an active\nbodily system as the reference point for experiencing the environment, a\ntemporally structured experience for a coherent, self-related interaction with\nthe external world, and social skills to acquire a common-grounded shared\nexperience.\n","authors":["Sara Incao","Carlo Mazzola","Giulia Belgiovine","Alessandra Sciutti"],"pdf_url":"https://arxiv.org/pdf/2409.16900v1.pdf","comment":"Accepted Version of a conference paper presented at Robophilosophy\n Conference 2024"}]},"2024-09-24T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2307.15061v2","updated":"2024-09-24T23:49:07Z","published":"2023-07-27T17:59:56Z","title":"The RoboDepth Challenge: Methods and Advancements Towards Robust Depth\n Estimation","summary":" Accurate depth estimation under out-of-distribution (OoD) scenarios, such as\nadverse weather conditions, sensor failure, and noise contamination, is\ndesirable for safety-critical applications. Existing depth estimation systems,\nhowever, suffer inevitably from real-world corruptions and perturbations and\nare struggled to provide reliable depth predictions under such cases. In this\npaper, we summarize the winning solutions from the RoboDepth Challenge -- an\nacademic competition designed to facilitate and advance robust OoD depth\nestimation. This challenge was developed based on the newly established KITTI-C\nand NYUDepth2-C benchmarks. We hosted two stand-alone tracks, with an emphasis\non robust self-supervised and robust fully-supervised depth estimation,\nrespectively. Out of more than two hundred participants, nine unique and\ntop-performing solutions have appeared, with novel designs ranging from the\nfollowing aspects: spatial- and frequency-domain augmentations, masked image\nmodeling, image restoration and super-resolution, adversarial training,\ndiffusion-based noise suppression, vision-language pre-training, learned model\nensembling, and hierarchical feature enhancement. Extensive experimental\nanalyses along with insightful observations are drawn to better understand the\nrationale behind each design. We hope this challenge could lay a solid\nfoundation for future research on robust and reliable depth estimation and\nbeyond. The datasets, competition toolkit, workshop recordings, and source code\nfrom the winning teams are publicly available on the challenge website.\n","authors":["Lingdong Kong","Yaru Niu","Shaoyuan Xie","Hanjiang Hu","Lai Xing Ng","Benoit R. Cottereau","Liangjun Zhang","Hesheng Wang","Wei Tsang Ooi","Ruijie Zhu","Ziyang Song","Li Liu","Tianzhu Zhang","Jun Yu","Mohan Jing","Pengwei Li","Xiaohua Qi","Cheng Jin","Yingfeng Chen","Jie Hou","Jie Zhang","Zhen Kan","Qiang Ling","Liang Peng","Minglei Li","Di Xu","Changpeng Yang","Yuanqi Yao","Gang Wu","Jian Kuai","Xianming Liu","Junjun Jiang","Jiamian Huang","Baojun Li","Jiale Chen","Shuang Zhang","Sun Ao","Zhenyu Li","Runze Chen","Haiyong Luo","Fang Zhao","Jingze Yu"],"pdf_url":"https://arxiv.org/pdf/2307.15061v2.pdf","comment":"Technical Report; 65 pages, 34 figures, 24 tables; Code at\n https://github.com/ldkong1205/RoboDepth"},{"id":"http://arxiv.org/abs/2409.16502v1","updated":"2024-09-24T23:18:32Z","published":"2024-09-24T23:18:32Z","title":"GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for\n Improved Visual Localization","summary":" Although various visual localization approaches exist, such as scene\ncoordinate and pose regression, these methods often struggle with high memory\nconsumption or extensive optimization requirements. To address these\nchallenges, we utilize recent advancements in novel view synthesis,\nparticularly 3D Gaussian Splatting (3DGS), to enhance localization. 3DGS allows\nfor the compact encoding of both 3D geometry and scene appearance with its\nspatial features. Our method leverages the dense description maps produced by\nXFeat's lightweight keypoint detection and description model. We propose\ndistilling these dense keypoint descriptors into 3DGS to improve the model's\nspatial understanding, leading to more accurate camera pose predictions through\n2D-3D correspondences. After estimating an initial pose, we refine it using a\nphotometric warping loss. Benchmarking on popular indoor and outdoor datasets\nshows that our approach surpasses state-of-the-art Neural Render Pose (NRP)\nmethods, including NeRFMatch and PNeRFLoc.\n","authors":["Gennady Sidorov","Malik Mohrat","Ksenia Lebedeva","Ruslan Rakhimov","Sergey Kolyubin"],"pdf_url":"https://arxiv.org/pdf/2409.16502v1.pdf","comment":"Project website at https://gsplatloc.github.io/"},{"id":"http://arxiv.org/abs/2409.16501v1","updated":"2024-09-24T23:14:44Z","published":"2024-09-24T23:14:44Z","title":"Clarke Transform -- A Fundamental Tool for Continuum Robotics","summary":" This article introduces the Clarke transform and Clarke coordinates, which\npresent a solution to the disengagement of an arbitrary number of coupled\ndisplacement actuation of continuum and soft robots. The Clarke transform\nutilizes the generalized Clarke transformation and its inverse to reduce any\nnumber of joint values to a two-dimensional space without sacrificing any\nsignificant information. This space is the manifold of the joint space and is\ndescribed by two orthogonal Clarke coordinates. Application to kinematics,\nsampling, and control are presented. By deriving the solution to the previously\nunknown forward robot-dependent mapping for an arbitrary number of joints, the\nforward and inverse kinematics formulations are branchless, closed-form, and\nsingular-free. Sampling is used as a proxy for gauging the performance\nimplications for various methods and frameworks, leading to a branchless,\nclosed-form, and vectorizable sampling method with a 100 percent success rate\nand the possibility to shape desired distributions. Due to the utilization of\nthe manifold, the fairly simple constraint-informed, two-dimensional, and\nlinear controller always provides feasible control outputs. On top of that, the\nrelations to improved representations in continuum and soft robotics are\nestablished, where the Clarke coordinates are their generalizations.\n The Clarke transform offers valuable geometric insights and paves the way for\ndeveloping approaches directly on the two-dimensional manifold within the\nhigh-dimensional joint space, ensuring compliance with the constraint. While\nbeing an easy-to-construct linear map, the proposed Clarke transform is\nmathematically consistent, physically meaningful, as well as interpretable and\ncontributes to the unification of frameworks across continuum and soft robots.\n","authors":["Reinhard Grassmann","Anastasiia Senyk","Jessica Burgner-Kahrs"],"pdf_url":"https://arxiv.org/pdf/2409.16501v1.pdf","comment":"27 pages, 11 figures, 5 tables"},{"id":"http://arxiv.org/abs/2409.16484v1","updated":"2024-09-24T22:15:24Z","published":"2024-09-24T22:15:24Z","title":"BehAV: Behavioral Rule Guided Autonomy Using VLMs for Robot Navigation\n in Outdoor Scenes","summary":" We present BehAV, a novel approach for autonomous robot navigation in outdoor\nscenes guided by human instructions and leveraging Vision Language Models\n(VLMs). Our method interprets human commands using a Large Language Model (LLM)\nand categorizes the instructions into navigation and behavioral guidelines.\nNavigation guidelines consist of directional commands (e.g., \"move forward\nuntil\") and associated landmarks (e.g., \"the building with blue windows\"),\nwhile behavioral guidelines encompass regulatory actions (e.g., \"stay on\") and\ntheir corresponding objects (e.g., \"pavements\"). We use VLMs for their\nzero-shot scene understanding capabilities to estimate landmark locations from\nRGB images for robot navigation. Further, we introduce a novel scene\nrepresentation that utilizes VLMs to ground behavioral rules into a behavioral\ncost map. This cost map encodes the presence of behavioral objects within the\nscene and assigns costs based on their regulatory actions. The behavioral cost\nmap is integrated with a LiDAR-based occupancy map for navigation. To navigate\noutdoor scenes while adhering to the instructed behaviors, we present an\nunconstrained Model Predictive Control (MPC)-based planner that prioritizes\nboth reaching landmarks and following behavioral guidelines. We evaluate the\nperformance of BehAV on a quadruped robot across diverse real-world scenarios,\ndemonstrating a 22.49% improvement in alignment with human-teleoperated\nactions, as measured by Frechet distance, and achieving a 40% higher navigation\nsuccess rate compared to state-of-the-art methods.\n","authors":["Kasun Weerakoon","Mohamed Elnoor","Gershom Seneviratne","Vignesh Rajagopal","Senthil Hariharan Arul","Jing Liang","Mohamed Khalid M Jaffar","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2409.16484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16473v1","updated":"2024-09-24T21:48:14Z","published":"2024-09-24T21:48:14Z","title":"KinScene: Model-Based Mobile Manipulation of Articulated Scenes","summary":" Sequentially interacting with articulated objects is crucial for a mobile\nmanipulator to operate effectively in everyday environments. To enable\nlong-horizon tasks involving articulated objects, this study explores building\nscene-level articulation models for indoor scenes through autonomous\nexploration. While previous research has studied mobile manipulation with\narticulated objects by considering object kinematic constraints, it primarily\nfocuses on individual-object scenarios and lacks extension to a scene-level\ncontext for task-level planning. To manipulate multiple object parts\nsequentially, the robot needs to reason about the resultant motion of each part\nand anticipate its impact on future actions.We introduce \\ourtool{}, a\nfull-stack approach for long-horizon manipulation tasks with articulated\nobjects. The robot maps the scene, detects and physically interacts with\narticulated objects, collects observations, and infers the articulation\nproperties. For sequential tasks, the robot plans a feasible series of object\ninteractions based on the inferred articulation model. We demonstrate that our\napproach repeatably constructs accurate scene-level kinematic and geometric\nmodels, enabling long-horizon mobile manipulation in a real-world scene. Code\nand additional results are available at\nhttps://chengchunhsu.github.io/KinScene/\n","authors":["Cheng-Chun Hsu","Ben Abbatematteo","Zhenyu Jiang","Yuke Zhu","Roberto Martín-Martín","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2409.16473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16470v1","updated":"2024-09-24T21:44:26Z","published":"2024-09-24T21:44:26Z","title":"Frequency-based View Selection in Gaussian Splatting Reconstruction","summary":" Three-dimensional reconstruction is a fundamental problem in robotics\nperception. We examine the problem of active view selection to perform 3D\nGaussian Splatting reconstructions with as few input images as possible.\nAlthough 3D Gaussian Splatting has made significant progress in image rendering\nand 3D reconstruction, the quality of the reconstruction is strongly impacted\nby the selection of 2D images and the estimation of camera poses through\nStructure-from-Motion (SfM) algorithms. Current methods to select views that\nrely on uncertainties from occlusions, depth ambiguities, or neural network\npredictions directly are insufficient to handle the issue and struggle to\ngeneralize to new scenes. By ranking the potential views in the frequency\ndomain, we are able to effectively estimate the potential information gain of\nnew viewpoints without ground truth data. By overcoming current constraints on\nmodel architecture and efficacy, our method achieves state-of-the-art results\nin view selection, demonstrating its potential for efficient image-based 3D\nreconstruction.\n","authors":["Monica M. Q. Li","Pierre-Yves Lajoie","Giovanni Beltrame"],"pdf_url":"https://arxiv.org/pdf/2409.16470v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.16467v1","updated":"2024-09-24T21:37:58Z","published":"2024-09-24T21:37:58Z","title":"Learning Dynamics of a Ball with Differentiable Factor Graph and\n Roto-Translational Invariant Representations","summary":" Robots in dynamic environments need fast, accurate models of how objects move\nin their environments to support agile planning. In sports such as ping pong,\nanalytical models often struggle to accurately predict ball trajectories with\nspins due to complex aerodynamics, elastic behaviors, and the challenges of\nmodeling sliding and rolling friction. On the other hand, despite the promise\nof data-driven methods, machine learning struggles to make accurate, consistent\npredictions without precise input. In this paper, we propose an end-to-end\nlearning framework that can jointly train a dynamics model and a factor graph\nestimator. Our approach leverages a Gram-Schmidt (GS) process to extract\nroto-translational invariant representations to improve the model performance,\nwhich can further reduce the validation error compared to data augmentation\nmethod. Additionally, we propose a network architecture that enhances\nnonlinearity by using self-multiplicative bypasses in the layer connections. By\nleveraging these novel methods, our proposed approach predicts the ball's\nposition with an RMSE of 37.2 mm of the paddle radius at the apex after the\nfirst bounce, and 71.5 mm after the second bounce.\n","authors":["Qingyu Xiao","Zixuan Wu","Matthew Gombolay"],"pdf_url":"https://arxiv.org/pdf/2409.16467v1.pdf","comment":"ICRA 2025"},{"id":"http://arxiv.org/abs/2409.16465v1","updated":"2024-09-24T21:33:14Z","published":"2024-09-24T21:33:14Z","title":"Initialization of Monocular Visual Navigation for Autonomous Agents\n Using Modified Structure from Small Motion","summary":" We propose a standalone monocular visual Simultaneous Localization and\nMapping (vSLAM) initialization pipeline for autonomous robots in space. Our\nmethod, a state-of-the-art factor graph optimization pipeline, enhances\nclassical Structure from Small Motion (SfSM) to robustly initialize a monocular\nagent in weak-perspective projection scenes. Furthermore, it overcomes visual\nestimation challenges introduced by spacecraft inspection trajectories, such\nas: center-pointing motion, which exacerbates the bas-relief ambiguity, and the\npresence of a dominant plane in the scene, which causes motion estimation\ndegeneracies in classical Structure from Motion (SfM). We validate our method\non realistic, simulated satellite inspection images exhibiting weak-perspective\nprojection, and we demonstrate its effectiveness and improved performance\ncompared to other monocular initialization procedures.\n","authors":["Juan-Diego Florez","Mehregan Dor","Panagiotis Tsiotras"],"pdf_url":"https://arxiv.org/pdf/2409.16465v1.pdf","comment":"6 pages, 1 page for references, 6 figures, 1 table, IEEEtran format\n This work has been submitted to the IEEE for possible publication. Copyright\n may be transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2409.16460v1","updated":"2024-09-24T21:21:47Z","published":"2024-09-24T21:21:47Z","title":"MBC: Multi-Brain Collaborative Control for Quadruped Robots","summary":" In the field of locomotion task of quadruped robots, Blind Policy and\nPerceptive Policy each have their own advantages and limitations. The Blind\nPolicy relies on preset sensor information and algorithms, suitable for known\nand structured environments, but it lacks adaptability in complex or unknown\nenvironments. The Perceptive Policy uses visual sensors to obtain detailed\nenvironmental information, allowing it to adapt to complex terrains, but its\neffectiveness is limited under occluded conditions, especially when perception\nfails. Unlike the Blind Policy, the Perceptive Policy is not as robust under\nthese conditions. To address these challenges, we propose a MBC:Multi-Brain\ncollaborative system that incorporates the concepts of Multi-Agent\nReinforcement Learning and introduces collaboration between the Blind Policy\nand the Perceptive Policy. By applying this multi-policy collaborative model to\na quadruped robot, the robot can maintain stable locomotion even when the\nperceptual system is impaired or observational data is incomplete. Our\nsimulations and real-world experiments demonstrate that this system\nsignificantly improves the robot's passability and robustness against\nperception failures in complex environments, validating the effectiveness of\nmulti-policy collaboration in enhancing robotic motion performance.\n","authors":["Hang Liu","Yi Cheng","Rankun Li","Xiaowen Hu","Linqi Ye","Houde Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16460v1.pdf","comment":"18 pages, 9 figures, Website and Videos: https://quad-mbc.github.io/"},{"id":"http://arxiv.org/abs/2407.01468v2","updated":"2024-09-24T21:11:44Z","published":"2024-07-01T17:00:40Z","title":"Active Shadowing (ASD): Manipulating Visual Perception of Robotics\n Behaviors via Implicit Communication","summary":" Explicit communication is often valued for its directness during interaction.\nImplicit communication, on the other hand, is indirect in that its\ncommunicative content must be inferred. Implicit communication is considered\nmore desirable in teaming situations that requires reduced interruptions for\nimproved fluency. In this paper, we investigate another unique advantage of\nimplicit communication: its ability to manipulate the perception of object or\nbehavior of interest. When communication results in the perception of an object\nor behavior to deviate from other information (about the object or behavior)\navailable via observation, it introduces a discrepancy between perception and\nobservation. We show that such a discrepancy in visual perception can benefit\nhuman-robot interaction in a controlled manner and introduce an approach\nreferred to as active shadowing (ASD). Through user studies, we demonstrate the\neffectiveness of active shadowing in creating a misaligned perception of the\nrobot's behavior and its execution in the real-world, resulting in more\nefficient task completion without sacrificing its understandability. We also\nanalyze conditions under which such visual manipulation is effective.\n","authors":["Andrew Boateng","Prakhar Bhartiya","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16455v1","updated":"2024-09-24T20:54:21Z","published":"2024-09-24T20:54:21Z","title":"MultiTalk: Introspective and Extrospective Dialogue for\n Human-Environment-LLM Alignment","summary":" LLMs have shown promising results in task planning due to their strong\nnatural language understanding and reasoning capabilities. However, issues such\nas hallucinations, ambiguities in human instructions, environmental\nconstraints, and limitations in the executing agent's capabilities often lead\nto flawed or incomplete plans. This paper proposes MultiTalk, an LLM-based task\nplanning methodology that addresses these issues through a framework of\nintrospective and extrospective dialogue loops. This approach helps ground\ngenerated plans in the context of the environment and the agent's capabilities,\nwhile also resolving uncertainties and ambiguities in the given task. These\nloops are enabled by specialized systems designed to extract and predict\ntask-specific states, and flag mismatches or misalignments among the human\nuser, the LLM agent, and the environment. Effective feedback pathways between\nthese systems and the LLM planner foster meaningful dialogue. The efficacy of\nthis methodology is demonstrated through its application to robotic\nmanipulation tasks. Experiments and ablations highlight the robustness and\nreliability of our method, and comparisons with baselines further illustrate\nthe superiority of MultiTalk in task planning for embodied agents.\n","authors":["Venkata Naren Devarakonda","Ali Umut Kaypak","Shuaihang Yuan","Prashanth Krishnamurthy","Yi Fang","Farshad Khorrami"],"pdf_url":"https://arxiv.org/pdf/2409.16455v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.16451v1","updated":"2024-09-24T20:42:42Z","published":"2024-09-24T20:42:42Z","title":"Hierarchical Hybrid Learning for Long-Horizon Contact-Rich Robotic\n Assembly","summary":" Generalizable long-horizon robotic assembly requires reasoning at multiple\nlevels of abstraction. End-to-end imitation learning (IL) has been proven a\npromising approach, but it requires a large amount of demonstration data for\ntraining and often fails to meet the high-precision requirement of assembly\ntasks. Reinforcement Learning (RL) approaches have succeeded in high-precision\nassembly tasks, but suffer from sample inefficiency and hence, are less\ncompetent at long-horizon tasks. To address these challenges, we propose a\nhierarchical modular approach, named ARCH (Adaptive Robotic Composition\nHierarchy), which enables long-horizon high-precision assembly in contact-rich\nsettings. ARCH employs a hierarchical planning framework, including a low-level\nprimitive library of continuously parameterized skills and a high-level policy.\nThe low-level primitive library includes essential skills for assembly tasks,\nsuch as grasping and inserting. These primitives consist of both RL and\nmodel-based controllers. The high-level policy, learned via imitation learning\nfrom a handful of demonstrations, selects the appropriate primitive skills and\ninstantiates them with continuous input parameters. We extensively evaluate our\napproach on a real robot manipulation platform. We show that while trained on a\nsingle task, ARCH generalizes well to unseen tasks and outperforms baseline\nmethods in terms of success rate and data efficiency. Videos can be found at\nhttps://long-horizon-assembly.github.io.\n","authors":["Jiankai Sun","Aidan Curtis","Yang You","Yan Xu","Michael Koehle","Leonidas Guibas","Sachin Chitta","Mac Schwager","Hui Li"],"pdf_url":"https://arxiv.org/pdf/2409.16451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07776v2","updated":"2024-09-24T20:24:33Z","published":"2024-08-14T19:07:28Z","title":"Knowledge-based Neural Ordinary Differential Equations for Cosserat\n Rod-based Soft Robots","summary":" Soft robots have many advantages over rigid robots thanks to their compliant\nand passive nature. However, it is generally challenging to model the dynamics\nof soft robots due to their high spatial dimensionality, making it difficult to\nuse model-based methods to accurately control soft robots. It often requires\ndirect numerical simulation of partial differential equations to simulate soft\nrobots. This not only requires an accurate numerical model, but also makes soft\nrobot modeling slow and expensive. Deep learning algorithms have shown promises\nin data-driven modeling of soft robots. However, these algorithms usually\nrequire a large amount of data, which are difficult to obtain in either\nsimulation or real-world experiments of soft robots. In this work, we propose\nKNODE-Cosserat, a framework that combines first-principle physics models and\nneural ordinary differential equations. We leverage the best from both worlds\n-- the generalization ability of physics-based models and the fast speed of\ndeep learning methods. We validate our framework in both simulation and\nreal-world experiments. In both cases, we show that the robot model\nsignificantly improves over the baseline models under different metrics.\n","authors":["Tom Z. Jiahao","Ryan Adolf","Cynthia Sung","M. Ani Hsieh"],"pdf_url":"https://arxiv.org/pdf/2408.07776v2.pdf","comment":"8 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.12338v2","updated":"2024-09-24T20:17:37Z","published":"2024-09-18T22:09:46Z","title":"Can I Pet Your Robot? Incorporating Capacitive Touch Sensing into a Soft\n Socially Assistive Robot Platform","summary":" This work presents a method of incorporating low-cost capacitive tactile\nsensors on a soft socially assistive robot platform. By embedding conductive\nthread into the robot's crocheted exterior, we formed a set of low-cost,\nflexible capacitive tactile sensors that do not disrupt the robot's soft,\nzoomorphic embodiment. We evaluated the sensors' performance through a user\nstudy (N=20) and found that the sensors reliably detected user touch events and\nlocalized touch inputs to one of three regions on the robot's exterior.\n","authors":["Amy O'Connell","Bailey Cislowski","Heather Culbertson","Maja Matarić"],"pdf_url":"https://arxiv.org/pdf/2409.12338v2.pdf","comment":"Accepted as a Work-In-Progress submission at the 2024 IEEE Haptics\n Symposium"},{"id":"http://arxiv.org/abs/2409.16431v1","updated":"2024-09-24T19:51:41Z","published":"2024-09-24T19:51:41Z","title":"Hand Gesture Classification Based on Forearm Ultrasound Video Snippets\n Using 3D Convolutional Neural Networks","summary":" Ultrasound based hand movement estimation is a crucial area of research with\napplications in human-machine interaction. Forearm ultrasound offers detailed\ninformation about muscle morphology changes during hand movement which can be\nused to estimate hand gestures. Previous work has focused on analyzing\n2-Dimensional (2D) ultrasound image frames using techniques such as\nconvolutional neural networks (CNNs). However, such 2D techniques do not\ncapture temporal features from segments of ultrasound data corresponding to\ncontinuous hand movements. This study uses 3D CNN based techniques to capture\nspatio-temporal patterns within ultrasound video segments for gesture\nrecognition. We compared the performance of a 2D convolution-based network with\n(2+1)D convolution-based, 3D convolution-based, and our proposed network. Our\nmethodology enhanced the gesture classification accuracy to 98.8 +/- 0.9%, from\n96.5 +/- 2.3% compared to a network trained with 2D convolution layers. These\nresults demonstrate the advantages of using ultrasound video snippets for\nimproving hand gesture classification performance.\n","authors":["Keshav Bimbraw","Ankit Talele","Haichong K. Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16431v1.pdf","comment":"Accepted to IUS 2024"},{"id":"http://arxiv.org/abs/2409.16415v1","updated":"2024-09-24T19:26:21Z","published":"2024-09-24T19:26:21Z","title":"Improving Intersession Reproducibility for Forearm Ultrasound based Hand\n Gesture Classification through an Incremental Learning Approach","summary":" Ultrasound images of the forearm can be used to classify hand gestures\ntowards developing human machine interfaces. In our previous work, we have\ndemonstrated gesture classification using ultrasound on a single subject\nwithout removing the probe before evaluation. This has limitations in usage as\nonce the probe is removed and replaced, the accuracy declines since the\nclassifier performance is sensitive to the probe location on the arm. In this\npaper, we propose training a model on multiple data collection sessions to\ncreate a generalized model, utilizing incremental learning through fine tuning.\nUltrasound data was acquired for 5 hand gestures within a session (without\nremoving and putting the probe back on) and across sessions. A convolutional\nneural network (CNN) with 5 cascaded convolution layers was used for this\nstudy. A pre-trained CNN was fine tuned with the convolution blocks acting as a\nfeature extractor, and the parameters of the remaining layers updated in an\nincremental fashion. Fine tuning was done using different session splits within\na session and between multiple sessions. We found that incremental fine tuning\ncan help enhance classification accuracy with more fine tuning sessions. After\n2 fine tuning sessions for each experiment, we found an approximate 10%\nincrease in classification accuracy. This work demonstrates that incremental\nlearning through fine tuning on ultrasound based hand gesture classification\ncan be used improves accuracy while saving storage, processing power, and time.\nIt can be expanded to generalize between multiple subjects and towards\ndeveloping personalized wearable devices.\n","authors":["Keshav Bimbraw","Jack Rothenberg","Haichong K. Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16415v1.pdf","comment":"Accepted to IUS 2024"},{"id":"http://arxiv.org/abs/2409.16412v1","updated":"2024-09-24T19:24:04Z","published":"2024-09-24T19:24:04Z","title":"Vision-based Xylem Wetness Classification in Stem Water Potential\n Determination","summary":" Water is often overused in irrigation, making efficient management of it\ncrucial. Precision Agriculture emphasizes tools like stem water potential (SWP)\nanalysis for better plant status determination. However, such tools often\nrequire labor-intensive in-situ sampling. Automation and machine learning can\nstreamline this process and enhance outcomes. This work focused on automating\nstem detection and xylem wetness classification using the Scholander Pressure\nChamber, a widely used but demanding method for SWP measurement. The aim was to\nrefine stem detection and develop computer-vision-based methods to better\nclassify water emergence at the xylem. To this end, we collected and manually\nannotated video data, applying vision- and learning-based methods for detection\nand classification. Additionally, we explored data augmentation and fine-tuned\nparameters to identify the most effective models. The identified\nbest-performing models for stem detection and xylem wetness classification were\nevaluated end-to-end over 20 SWP measurements. Learning-based stem detection\nvia YOLOv8n combined with ResNet50-based classification achieved a Top-1\naccuracy of 80.98%, making it the best-performing approach for xylem wetness\nclassification.\n","authors":["Pamodya Peiris","Aritra Samanta","Caio Mucchiani","Cody Simons","Amit Roy-Chowdhury","Konstantinos Karydis"],"pdf_url":"https://arxiv.org/pdf/2409.16412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16392v1","updated":"2024-09-24T18:46:50Z","published":"2024-09-24T18:46:50Z","title":"Rao-Blackwellized POMDP Planning","summary":" Partially Observable Markov Decision Processes (POMDPs) provide a structured\nframework for decision-making under uncertainty, but their application requires\nefficient belief updates. Sequential Importance Resampling Particle Filters\n(SIRPF), also known as Bootstrap Particle Filters, are commonly used as belief\nupdaters in large approximate POMDP solvers, but they face challenges such as\nparticle deprivation and high computational costs as the system's state\ndimension grows. To address these issues, this study introduces\nRao-Blackwellized POMDP (RB-POMDP) approximate solvers and outlines generic\nmethods to apply Rao-Blackwellization in both belief updates and online\nplanning. We compare the performance of SIRPF and Rao-Blackwellized Particle\nFilters (RBPF) in a simulated localization problem where an agent navigates\ntoward a target in a GPS-denied environment using POMCPOW and RB-POMCPOW\nplanners. Our results not only confirm that RBPFs maintain accurate belief\napproximations over time with fewer particles, but, more surprisingly, RBPFs\ncombined with quadrature-based integration improve planning quality\nsignificantly compared to SIRPF-based planning under the same computational\nlimits.\n","authors":["Jiho Lee","Nisar R. Ahmed","Kyle H. Wray","Zachary N. Sunberg"],"pdf_url":"https://arxiv.org/pdf/2409.16392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16385v1","updated":"2024-09-24T18:35:51Z","published":"2024-09-24T18:35:51Z","title":"Embedded IPC: Fast and Intersection-free Simulation in Reduced Subspace\n for Robot Manipulation","summary":" Physics-based simulation is essential for developing and evaluating robot\nmanipulation policies, particularly in scenarios involving deformable objects\nand complex contact interactions. However, existing simulators often struggle\nto balance computational efficiency with numerical accuracy, especially when\nmodeling deformable materials with frictional contact constraints. We introduce\nan efficient subspace representation for the Incremental Potential Contact\n(IPC) method, leveraging model reduction to decrease the number of degrees of\nfreedom. Our approach decouples simulation complexity from the resolution of\nthe input model by representing elasticity in a low-resolution subspace while\nmaintaining collision constraints on an embedded high-resolution surface. Our\nbarrier formulation ensures intersection-free trajectories and configurations\nregardless of material stiffness, time step size, or contact severity. We\nvalidate our simulator through quantitative experiments with a soft bubble\ngripper grasping and qualitative demonstrations of placing a plate on a dish\nrack. The results demonstrate our simulator's efficiency, physical accuracy,\ncomputational stability, and robust handling of frictional contact, making it\nwell-suited for generating demonstration data and evaluating downstream robot\ntraining applications.\n","authors":["Wenxin Du","Chang Yu","Siyu Ma","Ying Jiang","Zeshun Zong","Yin Yang","Joe Masterjohn","Alejandro Castro","Xuchen Han","Chenfanfu Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.16385v1.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2409.16499v1","updated":"2024-09-24T23:11:47Z","published":"2024-09-24T23:11:47Z","title":"Learning Linear Dynamics from Bilinear Observations","summary":" We consider the problem of learning a realization of a partially observed\ndynamical system with linear state transitions and bilinear observations. Under\nvery mild assumptions on the process and measurement noises, we provide a\nfinite time analysis for learning the unknown dynamics matrices (up to a\nsimilarity transform). Our analysis involves a regression problem with\nheavy-tailed and dependent data. Moreover, each row of our design matrix\ncontains a Kronecker product of current input with a history of inputs, making\nit difficult to guarantee persistence of excitation. We overcome these\nchallenges, first providing a data-dependent high probability error bound for\narbitrary but fixed inputs. Then, we derive a data-independent error bound for\ninputs chosen according to a simple random design. Our main results provide an\nupper bound on the statistical error rates and sample complexity of learning\nthe unknown dynamics matrices from a single finite trajectory of bilinear\nobservations.\n","authors":["Yahya Sattar","Yassir Jedra","Sarah Dean"],"pdf_url":"https://arxiv.org/pdf/2409.16499v1.pdf","comment":"35 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.16475v1","updated":"2024-09-24T21:52:10Z","published":"2024-09-24T21:52:10Z","title":"Interaction Techniques for User-friendly Interfaces for Gate-based\n Quantum Computing","summary":" Quantum computers offer promising approaches to various fields. To use\ncurrent noisy quantum computers, developers need to examine the compilation of\na logical circuit, the status of available hardware, and noises in results. As\nthose tasks are less common in classical computing, quantum developers may not\nbe familiar with performing them. Therefore, easier and more intuitive\ninterfaces are necessary to make quantum computers more approachable. While\nexisting notebook-based toolkits like Qiskit offer application programming\ninterfaces and visualization techniques, it is still difficult to navigate the\nvast space of quantum program design and hardware status.\n Inspired by human-computer interaction (HCI) work in data science and\nvisualization, our work introduces four user interaction techniques that can\naugment existing notebook-based toolkits for gate-based quantum computing: (1)\na circuit writer that lets users provide high-level information about a circuit\nand generates a code snippet to build it; (2) a machine explorer that provides\ndetailed properties and configurations of a hardware with a code to load\nselected information; (3) a circuit viewer that allows for comparing logical\ncircuit, compiled circuit, and hardware configurations; and (4) a visualization\nfor adjusting measurement outcomes with hardware error rates.\n","authors":["Hyeok Kim","Kaitlin N. Smith"],"pdf_url":"https://arxiv.org/pdf/2409.16475v1.pdf","comment":"A poster accepted to IEEE QCE 2024"},{"id":"http://arxiv.org/abs/2409.16460v1","updated":"2024-09-24T21:21:47Z","published":"2024-09-24T21:21:47Z","title":"MBC: Multi-Brain Collaborative Control for Quadruped Robots","summary":" In the field of locomotion task of quadruped robots, Blind Policy and\nPerceptive Policy each have their own advantages and limitations. The Blind\nPolicy relies on preset sensor information and algorithms, suitable for known\nand structured environments, but it lacks adaptability in complex or unknown\nenvironments. The Perceptive Policy uses visual sensors to obtain detailed\nenvironmental information, allowing it to adapt to complex terrains, but its\neffectiveness is limited under occluded conditions, especially when perception\nfails. Unlike the Blind Policy, the Perceptive Policy is not as robust under\nthese conditions. To address these challenges, we propose a MBC:Multi-Brain\ncollaborative system that incorporates the concepts of Multi-Agent\nReinforcement Learning and introduces collaboration between the Blind Policy\nand the Perceptive Policy. By applying this multi-policy collaborative model to\na quadruped robot, the robot can maintain stable locomotion even when the\nperceptual system is impaired or observational data is incomplete. Our\nsimulations and real-world experiments demonstrate that this system\nsignificantly improves the robot's passability and robustness against\nperception failures in complex environments, validating the effectiveness of\nmulti-policy collaboration in enhancing robotic motion performance.\n","authors":["Hang Liu","Yi Cheng","Rankun Li","Xiaowen Hu","Linqi Ye","Houde Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16460v1.pdf","comment":"18 pages, 9 figures, Website and Videos: https://quad-mbc.github.io/"},{"id":"http://arxiv.org/abs/2409.16439v1","updated":"2024-09-24T20:18:18Z","published":"2024-09-24T20:18:18Z","title":"Active Perception with Initial-State Uncertainty: A Policy Gradient\n Method","summary":" This paper studies the synthesis of an active perception policy that\nmaximizes the information leakage of the initial state in a stochastic system\nmodeled as a hidden Markov model (HMM). Specifically, the emission function of\nthe HMM is controllable with a set of perception or sensor query actions. Given\nthe goal is to infer the initial state from partial observations in the HMM, we\nuse Shannon conditional entropy as the planning objective and develop a novel\npolicy gradient method with convergence guarantees. By leveraging a variant of\nobservable operators in HMMs, we prove several important properties of the\ngradient of the conditional entropy with respect to the policy parameters,\nwhich allow efficient computation of the policy gradient and stable and fast\nconvergence. We demonstrate the effectiveness of our solution by applying it to\nan inference problem in a stochastic grid world environment.\n","authors":["Chongyang Shi","Shuo Han","Michael Dorothy","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2409.16439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16389v1","updated":"2024-09-24T18:45:14Z","published":"2024-09-24T18:45:14Z","title":"Willems' Fundamental Lemma for Nonlinear Systems with Koopman Linear\n Embedding","summary":" Koopman operator theory and Willems' fundamental lemma both can provide\n(approximated) data-driven linear representation for nonlinear systems.\nHowever, choosing lifting functions for the Koopman operator is challenging,\nand the quality of the data-driven model from Willems' fundamental lemma has no\nguarantee for general nonlinear systems. In this paper, we extend Willems'\nfundamental lemma for a class of nonlinear systems that admit a Koopman linear\nembedding. We first characterize the relationship between the trajectory space\nof a nonlinear system and that of its Koopman linear embedding. We then prove\nthat the trajectory space of Koopman linear embedding can be formed by a linear\ncombination of rich-enough trajectories from the nonlinear system. Combining\nthese two results leads to a data-driven representation of the nonlinear\nsystem, which bypasses the need for the lifting functions and thus eliminates\nthe associated bias errors. Our results illustrate that both the width (more\ntrajectories) and depth (longer trajectories) of the trajectory library are\nimportant to ensure the accuracy of the data-driven model.\n","authors":["Xu Shang","Jorge Cortés","Yang Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.16389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16285v1","updated":"2024-09-24T17:58:26Z","published":"2024-09-24T17:58:26Z","title":"Age of Gossip in Networks with Multiple Views of a Source","summary":" We consider the version age of information (AoI) in a network where a subset\nof nodes act as sensing nodes, sampling a source that in general can follow a\ncontinuous distribution. Any sample of the source constitutes a new version of\nthe information and the version age of the information is defined with respect\nto the most recent version of the information available for the whole network.\nWe derive a recursive expression for the average version AoI between different\nsubsets of the nodes which can be used to evaluate the average version AoI for\nany subset of the nodes including any single node. We derive asymptotic\nbehavior of the average AoI on any single node of the network for various\ntopologies including line, ring, and fully connected networks. The prior art\nresult on version age of a network by Yates [ISIT'21] can be interpreted as in\nour derivation as a network with a single view of the source, e.g., through a\nPoisson process with rate $\\lambda_{00}$. Our result indicates that there is no\nloss in the average version AoI performance by replacing a single view of the\nsource with distributed sensing across multiple nodes by splitting the same\nrate $\\lambda_{00}$. Particularly, we show that asymptotically, the average AoI\nscales with $O(\\log(n))$ and $O(\\sqrt{n})$ for fully connected and ring\nnetworks, respectively. More interestingly, we show that for the ring network\nthe same $O(\\sqrt{n})$ asymptotical performance on average AoI is still\nachieved with distributed sensing if the number of sensing nodes only scales\nwith $O(\\sqrt{n})$ instead of prior known result which requires $O(n)$. Our\nresults indicate that the sensing nodes can be arbitrarily chosen as long as\nthe maximum number of consecutive non-sensing nodes also scales as\n$O(\\sqrt{n})$.\n","authors":["Kian J. Khojastepour","Matin Mortaheb","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2409.16285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16342v1","updated":"2024-09-24T17:26:55Z","published":"2024-09-24T17:26:55Z","title":"Transformer based time series prediction of the maximum power point for\n solar photovoltaic cells","summary":" This paper proposes an improved deep learning based maximum power point\ntracking (MPPT) in solar photovoltaic cells considering various time series\nbased environmental inputs. Generally, artificial neural network based MPPT\nalgorithms use basic neural network architectures and inputs which do not\nrepresent the ambient conditions in a comprehensive manner. In this article,\nthe ambient conditions of a location are represented through a comprehensive\nset of environmental features. Furthermore, the inclusion of time based\nfeatures in the input data is considered to model cyclic patterns temporally\nwithin the atmospheric conditions leading to robust modeling of the MPPT\nalgorithm. A transformer based deep learning architecture is trained as a time\nseries prediction model using multidimensional time series input features. The\nmodel is trained on a dataset containing typical meteorological year data\npoints of ambient weather conditions from 50 locations. The attention mechanism\nin the transformer modules allows the model to learn temporal patterns in the\ndata efficiently. The proposed model achieves a 0.47% mean average percentage\nerror of prediction on non zero operating voltage points in a test dataset\nconsisting of data collected over a period of 200 consecutive hours resulting\nin the average power efficiency of 99.54% and peak power efficiency of 99.98%.\nThe proposed model is validated through real time simulations. The proposed\nmodel performs power point tracking in a robust, dynamic, and nonlatent manner,\nover a wide range of atmospheric conditions.\n","authors":["Palaash Agrawal","Hari Om Bansal","Aditya R. Gautam","Om Prakash Mahela","Baseem Khan"],"pdf_url":"https://arxiv.org/pdf/2409.16342v1.pdf","comment":"Published June 2022, in Energy Science and Engineering, Volume10,\n Issue9, Pages 3397-3410"},{"id":"http://arxiv.org/abs/2409.16256v1","updated":"2024-09-24T17:25:13Z","published":"2024-09-24T17:25:13Z","title":"A Critical Review of Safe Reinforcement Learning Techniques in Smart\n Grid Applications","summary":" The high penetration of distributed energy resources (DERs) in modern smart\npower systems introduces unforeseen uncertainties for the electricity sector,\nleading to increased complexity and difficulty in the operation and control of\npower systems. As a cutting-edge machine learning technology, deep\nreinforcement learning (DRL) has been widely implemented in recent years to\nhandle the uncertainty in power systems. However, in critical infrastructures\nsuch as power systems, safety issues always receive top priority, while DRL may\nnot always meet the safety requirements of power system operators. The concept\nof safe reinforcement learning (safe RL) is emerging as a potential solution to\novercome the shortcomings of conventional DRL in the operation and control of\npower systems. This study provides a rigorous review of the latest research\nefforts focused on safe RL to derive power system control policies while\naccounting for the unique safety requirements of power grids. Furthermore, this\nstudy highlights various safe RL algorithms applied in diverse applications\nwithin the power system sector, from single grid-connected power converters,\nresidential smart homes, and buildings to large power distribution networks.\nFor all methods outlined, a discussion on their bottlenecks, research\nchallenges, and potential opportunities in the operation and control of power\nsystem applications is also presented. This review aims to support research in\nthe area of safe RL algorithms, embracing smart power system operation with\nsafety constraints amid high uncertainty from DERs.\n","authors":["Van-Hai Bui","Srijita Das","Akhtar Hussain","Guilherme Vieira Hollweg","Wencong Su"],"pdf_url":"https://arxiv.org/pdf/2409.16256v1.pdf","comment":"16 pages, 7 figures, 9 tables"},{"id":"http://arxiv.org/abs/2409.16214v1","updated":"2024-09-24T16:20:28Z","published":"2024-09-24T16:20:28Z","title":"TE-PINN: Quaternion-Based Orientation Estimation using\n Transformer-Enhanced Physics-Informed Neural Networks","summary":" This paper introduces a Transformer-Enhanced Physics-Informed Neural Network\n(TE-PINN) designed for accurate quaternion-based orientation estimation in\nhigh-dynamic environments, particularly within the field of robotics. By\nintegrating transformer networks with physics-informed learning, our approach\ninnovatively captures temporal dependencies in sensor data while enforcing the\nfundamental physical laws governing rotational motion. TE-PINN leverages a\nmulti-head attention mechanism to handle sequential data from inertial sensors,\nsuch as accelerometers and gyroscopes, ensuring temporal consistency.\nSimultaneously, the model embeds quaternion kinematics and rigid body dynamics\ninto the learning process, aligning the network's predictions with mechanical\nprinciples like Euler's laws of motion. The physics-informed loss function\nincorporates the dynamics of angular velocity and external forces, enhancing\nthe network's ability to generalize in complex scenarios. Our experimental\nevaluation demonstrates that TE-PINN consistently outperforms traditional\nmethods such as Extended Kalman Filters (EKF) and LSTM-based estimators,\nparticularly in scenarios characterized by high angular velocities and noisy\nsensor data. The results show a significant reduction in mean quaternion error\nand improved gyroscope bias estimation compared to the state-of-the-art. An\nablation study further isolates the contributions of both the transformer\narchitecture and the physics-informed constraints, highlighting the synergistic\neffect of both components in improving model performance. The proposed model\nachieves real-time performance on embedded systems typical of mobile robots,\noffering a scalable and efficient solution for orientation estimation in\nautonomous systems.\n","authors":["Arman Asgharpoor Golroudbari"],"pdf_url":"https://arxiv.org/pdf/2409.16214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16186v1","updated":"2024-09-24T15:34:01Z","published":"2024-09-24T15:34:01Z","title":"System-Level Performance Metrics Sensitivity of an Electrified\n Heavy-Duty Mobile Manipulator","summary":" The shift to electric and hybrid powertrains in vehicular systems has\npropelled advancements in mobile robotics and autonomous vehicles. This paper\nexamines the sensitivity of key performance metrics in a electrified heavy-duty\nmobile manipulator (HDMM) driven by electromechanical linear actuators (EMLAs)\npowered by permanent magnet synchronous motors (PMSMs). The study evaluates\npower delivery, force dynamics, energy consumption, and overall efficiency of\nthe actuation mechanisms. By computing partial derivatives (PD) with respect to\nthe payload mass at the tool center point (TCP), it provides insights into\nthese factors under various loading conditions. This research aids in the\nappropriate choice or design of EMLAs for HDMM electrification, addressing\nactuation mechanism selection challenge in vehicular system with mounted\nmanipulator and determines the necessary battery capacity requirements.\n","authors":["Mohammad Bahari","Alvaro Paz","Jouni Mattila"],"pdf_url":"https://arxiv.org/pdf/2409.16186v1.pdf","comment":"This work is submitted to IEEE VTC 2024"},{"id":"http://arxiv.org/abs/2404.07837v2","updated":"2024-09-24T15:25:59Z","published":"2024-04-11T15:25:13Z","title":"Data-Driven System Identification of Quadrotors Subject to Motor Delays","summary":" Recently non-linear control methods like Model Predictive Control (MPC) and\nReinforcement Learning (RL) have attracted increased interest in the quadrotor\ncontrol community. In contrast to classic control methods like cascaded PID\ncontrollers, MPC and RL heavily rely on an accurate model of the system\ndynamics. The process of quadrotor system identification is notoriously tedious\nand is often pursued with additional equipment like a thrust stand.\nFurthermore, low-level details like motor delays which are crucial for accurate\nend-to-end control are often neglected. In this work, we introduce a\ndata-driven method to identify a quadrotor's inertia parameters, thrust curves,\ntorque coefficients, and first-order motor delay purely based on proprioceptive\ndata. The estimation of the motor delay is particularly challenging as usually,\nthe RPMs can not be measured. We derive a Maximum A Posteriori (MAP)-based\nmethod to estimate the latent time constant. Our approach only requires about a\nminute of flying data that can be collected without any additional equipment\nand usually consists of three simple maneuvers. Experimental results\ndemonstrate the ability of our method to accurately recover the parameters of\nmultiple quadrotors. It also facilitates the deployment of RL-based, end-to-end\nquadrotor control of a large quadrotor under harsh, outdoor conditions.\n","authors":["Jonas Eschmann","Dario Albani","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2404.07837v2.pdf","comment":"Accepted at IROS 2024"},{"id":"http://arxiv.org/abs/2409.16115v1","updated":"2024-09-24T14:20:28Z","published":"2024-09-24T14:20:28Z","title":"Mean Age of Information in Partial Offloading Mobile Edge Computing\n Networks","summary":" The age of information (AoI) performance analysis is essential for evaluating\nthe information freshness in the large-scale mobile edge computing (MEC)\nnetworks. This work proposes the earliest analysis of the mean AoI (MAoI)\nperformance of large-scale partial offloading MEC networks. Firstly, we derive\nand validate the closed-form expressions of MAoI by using queueing theory and\nstochastic geometry. Based on these expressions, we analyse the effects of\ncomputing offloading ratio (COR) and task generation rate (TGR) on the MAoI\nperformance and compare the MAoI performance under the local computing, remote\ncomputing, and partial offloading schemes. The results show that by jointly\noptimising the COR and TGR, the partial offloading scheme outperforms the local\nand remote computing schemes in terms of the MAoI, which can be improved by up\nto 51% and 61%, respectively. This encourages the MEC networks to adopt the\npartial offloading scheme to improve the MAoI performance.\n","authors":["Ying Dong","Hang Xiao","Haonan Hu","Jiliang Zhang","Qianbin Chen","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16110v1","updated":"2024-09-24T14:16:26Z","published":"2024-09-24T14:16:26Z","title":"Wind lulls and slews; consequences for the stability of future UK\n electricity systems","summary":" As the United Kingdom wind fleet increases in size, wind lulls and slews will\nincreasingly challenge the stability of its electricity system. The paper\ndescribes the use of models based on real time records and including solar\nslews, to investigate the most extreme wind variations likely to be encountered\nin future, enabling strategies to be devised to mitigate them. Wind lulls are\nsurprisingly frequent, occasionally lasting a week or more, and are always\nlikely to be beyond the capabilities of stored or imported electrical energy to\nmitigate them. The models indicate that there will be a continuing need for gas\npowered generation to mitigate wind lulls. Currently, Combined Cycle Gas\nTurbines (CCGTs) provide most of the dispatchable generation. However, CCGTs\nare not sufficiently fast acting to cope with the wind and solar slews\nanticipated in future. The paper suggests that a range of already proven\nfast-acting sources of dispatchable generation, including Open Cycle Gas\nTurbines (OCGTs), Internal Combustion Gas-Fired Reciprocating engines (ICGRs)\nand stored electrical energy systems, should be capable of coping with the\nlargest wind and solar slews likely to be encountered up to the year 2035.\nExamples are given of the recent introduction of these fast-acting sources of\ngeneration which, it is suggested, will progressively replace CCGTs as the wind\nand solar fleets increase in size. Moreover, we see the pattern of recent\ninvestments, summarised in the paper, as a good indication of likely future\ninvestments, with OCGT investments mainly serving the 440 kV grid, and ICGRs\nand stored electrical energy more local networks.\n","authors":["Anthony D Stephens","David R Walwyn"],"pdf_url":"https://arxiv.org/pdf/2409.16110v1.pdf","comment":"13 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2409.16078v1","updated":"2024-09-24T13:24:14Z","published":"2024-09-24T13:24:14Z","title":"Assessing strategies to manage distributed photovoltaics in Swiss\n low-voltage networks: An analysis of curtailment, export tariffs, and\n resource sharing","summary":" The integration of photovoltaic systems poses several challenges for the\ndistribution grid, mainly due to the infrastructure not being designed to\nhandle the upstream flow and being dimensioned for consumption only,\npotentially leading to reliability and stability issues. This study\ninvestigates the use of capacity-based tariffs, export tariffs, and curtailment\npolicies to reduce negative grid impacts without hampering PV deployment. We\nanalyze the effect of such export tariffs on three typical Swiss low-voltage\nnetworks (rural, semi-urban, and urban), using power flow analysis to evaluate\nthe power exchanges at the transformer station, as well as line overloading and\nvoltage violations. Finally, a simple case of mutualization of resources is\nanalyzed to assess its potential contribution to relieving network constraints\nand the economic costs of managing LV networks. We found that the tariff with\ncapacity-based components on the export (CT export daily) severely penalizes PV\npenetration. This applies to other tariffs as well (e.g. IRR monthly,\nCurtailment 30, and DT variable) but to a lesser extent. However, the inclusion\nof curtailment at 50\\% and 70\\%, as well as mixed tariffs with capacity-based\ncomponents at import and curtailment, allow for a high degree of PV\ninstallations in the three zones studied and help to mitigate the impact of PV\non the distributed network.\n","authors":["Alejandro Pena-Bello","Gerard Marias Gonzalez","Nicolas Wyrsch","Christophe Ballif"],"pdf_url":"https://arxiv.org/pdf/2409.16078v1.pdf","comment":"Preprint version. 25 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.16048v1","updated":"2024-09-24T12:51:32Z","published":"2024-09-24T12:51:32Z","title":"Whole-body end-effector pose tracking","summary":" Combining manipulation with the mobility of legged robots is essential for a\nwide range of robotic applications. However, integrating an arm with a mobile\nbase significantly increases the system's complexity, making precise\nend-effector control challenging. Existing model-based approaches are often\nconstrained by their modeling assumptions, leading to limited robustness.\nMeanwhile, recent Reinforcement Learning (RL) implementations restrict the\narm's workspace to be in front of the robot or track only the position to\nobtain decent tracking accuracy. In this work, we address these limitations by\nintroducing a whole-body RL formulation for end-effector pose tracking in a\nlarge workspace on rough, unstructured terrains. Our proposed method involves a\nterrain-aware sampling strategy for the robot's initial configuration and\nend-effector pose commands, as well as a game-based curriculum to extend the\nrobot's operating range. We validate our approach on the ANYmal quadrupedal\nrobot with a six DoF robotic arm. Through our experiments, we show that the\nlearned controller achieves precise command tracking over a large workspace and\nadapts across varying terrains such as stairs and slopes. On deployment, it\nachieves a pose-tracking error of 2.64 cm and 3.64 degrees, outperforming\nexisting competitive baselines.\n","authors":["Tifanny Portela","Andrei Cramariuc","Mayank Mittal","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2409.16048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16041v1","updated":"2024-09-24T12:44:23Z","published":"2024-09-24T12:44:23Z","title":"Safe Output Feedback Improvement with Baselines","summary":" In data-driven control design, an important problem is to deal with\nuncertainty due to limited and noisy data. One way to do this is to use a\nmin-max approach, which aims to minimize some design criteria for the\nworst-case scenario. However, a strategy based on this approach can lead to\noverly conservative controllers. To overcome this issue, we apply the idea of\nbaseline regret, and it is seen that minimizing the baseline regret under model\nuncertainty can guarantee safe controller improvement with less conservatism\nand variance in the resulting controllers. To exemplify the use of baseline\ncontrollers, we focus on the output feedback setting and propose a two-step\ncontrol design method; first, an uncertainty set is constructed by a\ndata-driven system identification approach based on finite impulse response\nmodels; then a control design criterion based on model reference control is\nused. To solve the baseline regret optimization problem efficiently, we use a\nconvex approximation of the criterion and apply the scenario approach in\noptimization. The numerical examples show that the inclusion of baseline regret\nindeed improves the performance and reduces the variance of the resulting\ncontroller.\n","authors":["Ruoqi Zhang","Per Mattsson","Dave Zachariah"],"pdf_url":"https://arxiv.org/pdf/2409.16041v1.pdf","comment":"Accepted by The 63rd IEEE Conference on Decision and Control"},{"id":"http://arxiv.org/abs/2409.16008v1","updated":"2024-09-24T12:08:27Z","published":"2024-09-24T12:08:27Z","title":"Robust Neural IDA-PBC: passivity-based stabilization under\n approximations","summary":" In this paper, we restructure the Neural Interconnection and Damping\nAssignment - Passivity Based Control (Neural IDA-PBC) design methodology, and\nwe formally analyze its closed-loop properties. Neural IDA-PBC redefines the\nIDA-PBC design approach as an optimization problem by building on the framework\nof Physics Informed Neural Networks (PINNs). However, the closed-loop stability\nand robustness properties under Neural IDA-PBC remain unexplored. To address\nthe issue, we study the behavior of classical IDA-PBC under approximations. Our\ntheoretical analysis allows deriving conditions for practical and asymptotic\nstability of the desired equilibrium point. Moreover, it extends the Neural\nIDA-PBC applicability to port-Hamiltonian systems where the matching conditions\ncannot be solved exactly. Our renewed optimization-based design introduces\nthree significant aspects: i) it involves a novel optimization objective\nincluding stability and robustness constraints issued from our theoretical\nanalysis; ii) it employs separate Neural Networks (NNs), which can be\nstructured to reduce the search space to relevant functions; iii) it does not\nrequire knowledge about the port-Hamiltonian formulation of the system's model.\nOur methodology is validated with simulations on three standard benchmarks: a\ndouble pendulum, a nonlinear mass-spring-damper and a cartpole. Notably,\nclassical IDA-PBC designs cannot be analytically derived for the latter.\n","authors":["Santiago Sanchez-Escalonilla","Samuele Zoboli","Bayu Jayawardhana"],"pdf_url":"https://arxiv.org/pdf/2409.16008v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.15780v2","updated":"2024-09-24T09:24:11Z","published":"2024-03-23T09:32:23Z","title":"A Fairness-Oriented Reinforcement Learning Approach for the Operation\n and Control of Shared Micromobility Services","summary":" As Machine Learning grows in popularity across various fields, equity has\nbecome a key focus for the AI community. However fairness-oriented approaches\nare still underexplored in smart mobility. Addressing this gap, our study\ninvestigates the balance between performance optimization and algorithmic\nfairness in shared micromobility services providing a novel framework based on\nReinforcement Learning. Exploiting Q-Learning, the proposed methodology\nachieves equitable outcomes in terms of the Gini index across different areas\ncharacterized by their distance from central hubs. Through vehicle rebalancing,\nthe provided scheme maximizes operator performance while ensuring fairness\nprinciples for users, reducing iniquity by up to 80% while only increasing\ncosts by 30% (w.r.t. applying no equity adjustment). A case study with\nsynthetic data validates our insights and highlights the importance of fairness\nin urban micromobility.\n","authors":["Matteo Cederle","Luca Vittorio Piron","Marina Ceccon","Federico Chiariotti","Alessandro Fabris","Marco Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2403.15780v2.pdf","comment":"6 pages, 2 figures, jointly submitted to IEEE L-CSS and ACC 2025"},{"id":"http://arxiv.org/abs/2409.15858v1","updated":"2024-09-24T08:31:22Z","published":"2024-09-24T08:31:22Z","title":"Identification For Control Based on Neural Networks: Approximately\n Linearizable Models","summary":" This work presents a control-oriented identification scheme for efficient\ncontrol design and stability analysis of nonlinear systems. Neural networks are\nused to identify a discrete-time nonlinear state-space model to approximate\ntime-domain input-output behavior of a nonlinear system. The network is\nconstructed such that the identified model is approximately linearizable by\nfeedback, ensuring that the control law trivially follows from the learning\nstage. After the identification and quasi-linearization procedures, linear\ncontrol theory comes at hand to design robust controllers and study stability\nof the closed-loop system. The effectiveness and interest of the methodology\nare illustrated throughout the paper on popular benchmarks for system\nidentification.\n","authors":["Maxime Thieffry","Alexandre Hache","Mohamed Yagoubi","Philippe Chevrel"],"pdf_url":"https://arxiv.org/pdf/2409.15858v1.pdf","comment":"15 pages, 3 figures, 6 tables, accepted as a poster in SysDO 2024,\n Stuttgart, Germany"},{"id":"http://arxiv.org/abs/2409.02444v2","updated":"2024-09-24T08:04:30Z","published":"2024-09-04T04:44:21Z","title":"USV-AUV Collaboration Framework for Underwater Tasks under Extreme Sea\n Conditions","summary":" Autonomous underwater vehicles (AUVs) are valuable for ocean exploration due\nto their flexibility and ability to carry communication and detection units.\nNevertheless, AUVs alone often face challenges in harsh and extreme sea\nconditions. This study introduces a unmanned surface vehicle (USV)-AUV\ncollaboration framework, which includes high-precision multi-AUV positioning\nusing USV path planning via Fisher information matrix optimization and\nreinforcement learning for multi-AUV cooperative tasks. Applied to a multi-AUV\nunderwater data collection task scenario, extensive simulations validate the\nframework's feasibility and superior performance, highlighting exceptional\ncoordination and robustness under extreme sea conditions. To accelerate\nrelevant research in this field, we have made the simulation code available as\nopen-source.\n","authors":["Jingzehua Xu","Guanwen Xie","Xinqi Wang","Yimian Ding","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14605v2","updated":"2024-09-24T08:02:21Z","published":"2024-09-22T21:52:58Z","title":"First Field Trial of LLM-Powered AI Agent for Lifecycle Management of\n Autonomous Driving Optical Networks","summary":" We design and demonstrate the first field trial of LLM-powered AI Agent for\nADON. Three operation modes of the Agent are proposed for network lifecycle\nmanagement. The Agent efficiently processes wavelength add/drop and soft/hard\nfailures, and achieves comparable performance to human-designed algorithms for\npower optimization.\n","authors":["Xiaomin Liu","Qizhi Qiu","Yihao Zhang","Yuming Cheng","Lilin Yi","Weisheng Hu","Qunbi Zhuge"],"pdf_url":"https://arxiv.org/pdf/2409.14605v2.pdf","comment":"Version submitted to ECOC PDP 2024 on September 6th"},{"id":"http://arxiv.org/abs/2409.15802v1","updated":"2024-09-24T06:52:07Z","published":"2024-09-24T06:52:07Z","title":"A Multi-Level Approach for Class Imbalance Problem in Federated Learning\n for Remote Industry 4.0 Applications","summary":" Deep neural network (DNN) models are effective solutions for industry 4.0\napplications (\\eg oil spill detection, fire detection, anomaly detection).\nHowever, training a DNN network model needs a considerable amount of data\ncollected from various sources and transferred to the central cloud server that\ncan be expensive and sensitive to privacy. For instance, in the remote offshore\noil field where network connectivity is vulnerable, a federated fog environment\ncan be a potential computing platform. Hence it is feasible to perform\ncomputation within the federation. On the contrary, performing a DNN model\ntraining using fog systems poses a security issue that the federated learning\n(FL) technique can resolve. In this case, the new challenge is the class\nimbalance problem that can be inherited in local data sets and can degrade the\nperformance of the global model. Therefore, FL training needs to be performed\nconsidering the class imbalance problem locally. In addition, an efficient\ntechnique to select the relevant worker model needs to be adopted at the global\nlevel to increase the robustness of the global model. Accordingly, we utilize\none of the suitable loss functions addressing the class imbalance in workers at\nthe local level. In addition, we employ a dynamic threshold mechanism with\nuser-defined worker's weight to efficiently select workers for aggregation that\nimprove the global model's robustness. Finally, we perform an extensive\nempirical evaluation to explore the benefits of our solution and find up to\n3-5% performance improvement than baseline federated learning methods.\n","authors":["Razin Farhan Hussain","Mohsen Amini Salehi"],"pdf_url":"https://arxiv.org/pdf/2409.15802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15792v1","updated":"2024-09-24T06:39:36Z","published":"2024-09-24T06:39:36Z","title":"Regional stability conditions for recurrent neural network-based control\n systems","summary":" In this paper we propose novel global and regional stability analysis\nconditions based on linear matrix inequalities for a general class of recurrent\nneural networks. These conditions can be also used for state-feedback control\ndesign and a suitable optimization problem enforcing H2 norm minimization\nproperties is defined. The theoretical results are corroborated by numerical\nsimulations, showing the advantages and limitations of the methods presented\nherein.\n","authors":["Alessio La Bella","Marcello Farina","William D'Amico","Luca Zaccarian"],"pdf_url":"https://arxiv.org/pdf/2409.15792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15737v1","updated":"2024-09-24T04:44:09Z","published":"2024-09-24T04:44:09Z","title":"Reinforcement Leaning for Infinite-Dimensional Systems","summary":" Interest in reinforcement learning (RL) for massive-scale systems consisting\nof large populations of intelligent agents interacting with heterogeneous\nenvironments has witnessed a significant surge in recent years across diverse\nscientific domains. However, due to the large-scale nature of the system, the\nmajority of state-of-the-art RL techniques either encounter high computational\ncost or exhibit compromised performance. To mitigate these challenges, we\npropose a novel RL architecture along with the derivation of effective\nalgorithms to learn optimal policies for any arbitrarily large system of\nagents. Specifically, we model such a system as a parameterized control system\ndefined on an infinite-dimensional function space. We then develop a moment\nkernel transform to map the parameterized system and the value function of an\nRL problem into a reproducing kernel Hilbert space. This transformation\nsubsequently generates a finite-dimensional moment representation for this RL\nproblem. Leveraging this representation, we develop a hierarchical algorithm\nfor learning optimal policies for the infinite-dimensional parameterized\nsystem. We further enhance efficiency of the algorithm by exploiting early\nstopping at each hierarchy, by which we show the fast convergence property of\nthe algorithm through constructing a convergent spectral sequence. The\nperformance and efficiency of the proposed algorithm are validated using\npractical examples.\n","authors":["Wei Zhang","Jr-Shin Li"],"pdf_url":"https://arxiv.org/pdf/2409.15737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15720v1","updated":"2024-09-24T04:10:27Z","published":"2024-09-24T04:10:27Z","title":"Optimization of partially isolated quantum harmonic oscillator memory\n systems by mean square decoherence time criteria","summary":" This paper is concerned with open quantum harmonic oscillators with\nposition-momentum system variables, whose internal dynamics and interaction\nwith the environment are governed by linear quantum stochastic differential\nequations. A recently proposed approach to such systems as Heisenberg picture\nquantum memories exploits their ability to approximately retain initial\nconditions over a decoherence horizon. Using the quantum memory decoherence\ntime defined previously in terms of a fidelity threshold on a weighted\nmean-square deviation of the system variables from their initial values, we\napply this approach to a partially isolated subsystem of the oscillator, which\nis not directly affected by the external fields. The partial isolation leads to\nan appropriate system decomposition and a qualitatively different short-horizon\nasymptotic behaviour of the deviation, which yields a longer decoherence time\nin the high-fidelity limit. The resulting approximate decoherence time\nmaximization over the energy parameters for improving the quantum memory\nperformance is discussed for a coherent feedback interconnection of such\nsystems.\n","authors":["Igor G. Vladimirov","Ian R. Petersen"],"pdf_url":"https://arxiv.org/pdf/2409.15720v1.pdf","comment":"9 pages, 3 figures, submitted to ANZCC 2025"},{"id":"http://arxiv.org/abs/2409.15717v1","updated":"2024-09-24T04:06:01Z","published":"2024-09-24T04:06:01Z","title":"Autonomous Wheel Loader Navigation Using Goal-Conditioned Actor-Critic\n MPC","summary":" This paper proposes a novel control method for an autonomous wheel loader,\nenabling time-efficient navigation to an arbitrary goal pose. Unlike prior\nworks that combine high-level trajectory planners with Model Predictive Control\n(MPC), we directly enhance the planning capabilities of MPC by integrating a\ncost function derived from Actor-Critic Reinforcement Learning (RL).\nSpecifically, we train an RL agent to solve the pose reaching task in\nsimulation, then incorporate the trained neural network critic as both the\nstage and terminal cost of an MPC. We show through comprehensive simulations\nthat the resulting MPC inherits the time-efficient behavior of the RL agent,\ngenerating trajectories that compare favorably against those found using\ntrajectory optimization. We also deploy our method on a real wheel loader,\nwhere we successfully navigate to various goal poses. In contrast, the RL actor\nrisked damaging the machine and was unsuitable for real-world use.\n","authors":["Aleksi Mäki-Penttilä","Naeim Ebrahimi Toulkani","Reza Ghabcheloo"],"pdf_url":"https://arxiv.org/pdf/2409.15717v1.pdf","comment":"Submitted to International Conference on Robotics and Automation\n (ICRA) 2025"},{"id":"http://arxiv.org/abs/2409.15710v1","updated":"2024-09-24T03:58:18Z","published":"2024-09-24T03:58:18Z","title":"Autotuning Bipedal Locomotion MPC with GRFM-Net for Efficient\n Sim-to-Real Transfer","summary":" Bipedal locomotion control is essential for humanoid robots to navigate\ncomplex, human-centric environments. While optimization-based control designs\nare popular for integrating sophisticated models of humanoid robots, they often\nrequire labor-intensive manual tuning. In this work, we address the challenges\nof parameter selection in bipedal locomotion control using DiffTune, a\nmodel-based autotuning method that leverages differential programming for\nefficient parameter learning. A major difficulty lies in balancing model\nfidelity with differentiability. We address this difficulty using a\nlow-fidelity model for differentiability, enhanced by a Ground Reaction\nForce-and-Moment Network (GRFM-Net) to capture discrepancies between MPC\ncommands and actual control effects. We validate the parameters learned by\nDiffTune with GRFM-Net in hardware experiments, which demonstrates the\nparameters' optimality in a multi-objective setting compared with baseline\nparameters, reducing the total loss by up to 40.5$\\%$ compared with the\nexpert-tuned parameters. The results confirm the GRFM-Net's effectiveness in\nmitigating the sim-to-real gap, improving the transferability of\nsimulation-learned parameters to real hardware.\n","authors":["Qianzhong Chen","Junheng Li","Sheng Cheng","Naira Hovakimyan","Quan Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.15710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15708v1","updated":"2024-09-24T03:53:35Z","published":"2024-09-24T03:53:35Z","title":"Open-/Closed-loop Active Learning for Data-driven Predictive Control","summary":" An important question in data-driven control is how to obtain an informative\ndataset. In this work, we consider the problem of effective data acquisition of\nan unknown linear system with bounded disturbance for both open-loop and\nclosed-loop stages. The learning objective is to minimize the volume of the set\nof admissible systems. First, a performance measure based on historical data\nand the input sequence is introduced to characterize the upper bound of the\nvolume of the set of admissible systems. On the basis of this performance\nmeasure, an open-loop active learning strategy is proposed to minimize the\nvolume by actively designing inputs during the open-loop stage. For the\nclosed-loop stage, an closed-loop active learning strategy is designed to\nselect and learn from informative closed-loop data. The efficiency of the\nproposed closed-loop active learning strategy is proved by showing that the\nunselected data cannot benefit the learning performance. Furthermore, an\nadaptive predictive controller is designed in accordance with the proposed data\nacquisition approach. The recursive feasibility and the stability of the\ncontroller are proved by analyzing the effect of the closed-loop active\nlearning strategy. Finally, numerical examples and comparisons illustrate the\neffectiveness of the proposed data acquisition strategy.\n","authors":["Shilun Feng","Dawei Shi","Yang Shi","Kaikai Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.15708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15703v1","updated":"2024-09-24T03:32:10Z","published":"2024-09-24T03:32:10Z","title":"Agent-state based policies in POMDPs: Beyond belief-state MDPs","summary":" The traditional approach to POMDPs is to convert them into fully observed\nMDPs by considering a belief state as an information state. However, a\nbelief-state based approach requires perfect knowledge of the system dynamics\nand is therefore not applicable in the learning setting where the system model\nis unknown. Various approaches to circumvent this limitation have been proposed\nin the literature. We present a unified treatment of some of these approaches\nby viewing them as models where the agent maintains a local recursively\nupdateable agent state and chooses actions based on the agent state. We\nhighlight the different classes of agent-state based policies and the various\napproaches that have been proposed in the literature to find good policies\nwithin each class. These include the designer's approach to find optimal\nnon-stationary agent-state based policies, policy search approaches to find a\nlocally optimal stationary agent-state based policies, and the approximate\ninformation state to find approximately optimal stationary agent-state based\npolicies. We then present how ideas from the approximate information state\napproach have been used to improve Q-learning and actor-critic algorithms for\nlearning in POMDPs.\n","authors":["Amit Sinha","Aditya Mahajan"],"pdf_url":"https://arxiv.org/pdf/2409.15703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14165v2","updated":"2024-09-24T03:12:12Z","published":"2024-09-21T15:07:37Z","title":"Will Large Language Models be a Panacea to Autonomous Driving?","summary":" Artificial intelligence (AI) plays a crucial role in autonomous driving (AD)\nresearch, propelling its development towards intelligence and efficiency.\nCurrently, the development of AD technology follows two main technical paths:\nmodularization and end-to-end. Modularization decompose the driving task into\nmodules such as perception, prediction, planning, and control, and train them\nseparately. Due to the inconsistency of training objectives between modules,\nthe integrated effect suffers from bias. End-to-end attempts to address this\nissue by utilizing a single model that directly maps from sensor data to\ncontrol signals. This path has limited learning capabilities in a comprehensive\nset of features and struggles to handle unpredictable long-tail events and\ncomplex urban traffic scenarios. In the face of challenges encountered in both\npaths, many researchers believe that large language models (LLMs) with powerful\nreasoning capabilities and extensive knowledge understanding may be the\nsolution, expecting LLMs to provide AD systems with deeper levels of\nunderstanding and decision-making capabilities. In light of the challenges\nfaced by both paths, many researchers believe that LLMs, with their powerful\nreasoning abilities and extensive knowledge, could offer a solution. To\nunderstand if LLMs could enhance AD, this paper conducts a thorough analysis of\nthe potential applications of LLMs in AD systems, including exploring their\noptimization strategies in both modular and end-to-end approaches, with a\nparticular focus on how LLMs can tackle the problems and challenges present in\ncurrent solutions. Furthermore, we discuss an important question: Can LLM-based\nartificial general intelligence (AGI) be a key to achieve high-level AD? We\nfurther analyze the potential limitations and challenges that LLMs may\nencounter in promoting the development of AD technology.\n","authors":["Yuxuan Zhu","Shiyi Wang","Wenqing Zhong","Nianchen Shen","Yunqi Li","Siqi Wang","Zhiheng Li","Cathy Wu","Zhengbing He","Li Li"],"pdf_url":"https://arxiv.org/pdf/2409.14165v2.pdf","comment":null}]},"2024-09-23T00:00:00Z":{"Systems and Control":[{"id":"http://arxiv.org/abs/2401.00212v2","updated":"2024-09-23T23:07:31Z","published":"2023-12-30T12:12:35Z","title":"Physics-Informed Multi-Agent Reinforcement Learning for Distributed\n Multi-Robot Problems","summary":" The networked nature of multi-robot systems presents challenges in the\ncontext of multi-agent reinforcement learning. Centralized control policies do\nnot scale with increasing numbers of robots, whereas independent control\npolicies do not exploit the information provided by other robots, exhibiting\npoor performance in cooperative-competitive tasks. In this work we propose a\nphysics-informed reinforcement learning approach able to learn distributed\nmulti-robot control policies that are both scalable and make use of all the\navailable information to each robot. Our approach has three key\ncharacteristics. First, it imposes a port-Hamiltonian structure on the policy\nrepresentation, respecting energy conservation properties of physical robot\nsystems and the networked nature of robot team interactions. Second, it uses\nself-attention to ensure a sparse policy representation able to handle\ntime-varying information at each robot from the interaction graph. Third, we\npresent a soft actor-critic reinforcement learning algorithm parameterized by\nour self-attention port-Hamiltonian control policy, which accounts for the\ncorrelation among robots during training while overcoming the need of value\nfunction factorization. Extensive simulations in different multi-robot\nscenarios demonstrate the success of the proposed approach, surpassing previous\nmulti-robot reinforcement learning solutions in scalability, while achieving\nsimilar or superior performance (with averaged cumulative reward up to x2\ngreater than the state-of-the-art with robot teams x6 larger than the number of\nrobots at training time).\n","authors":["Eduardo Sebastian","Thai Duong","Nikolay Atanasov","Eduardo Montijano","Carlos Sagues"],"pdf_url":"https://arxiv.org/pdf/2401.00212v2.pdf","comment":"This paper is under review at IEEE T-RO"},{"id":"http://arxiv.org/abs/2409.15559v1","updated":"2024-09-23T21:31:44Z","published":"2024-09-23T21:31:44Z","title":"LDPC Codes in Cooperative Communication","summary":" In fact, the broadcast nature of every transmitter makes it possible for\nother transceivers in the channel to overhear the broadcasted signal. The\nproposed idea in cooperative communication is to use these intermediate\ntransceivers as relay for the transmitted signal, therefore we will have\nspatial diversity which can improve throughput and received data reliability in\nthe system. In this dissertation we consider some important aspects of\ncooperative communication in a network composed of three nodes. First, we\nverify the increase of reliability for the received signal by comparing the\nreliability of the received bits in a cooperative network with a\nnon-cooperative one. Then we step forward and use LDPC error correction\ntechnique to improve the reliability of the received bits even more and compare\nit with a network without LDPC codes (encoder and decoder) to measure the level\nof improvement for different SNRs. The overall aim of this dissertation is to\ndeploy cooperative communication idea to consider and test its claimed aspects\nand also enhance its performance by using LDPC error correction technique.\n","authors":["Ali Mehrban"],"pdf_url":"https://arxiv.org/pdf/2409.15559v1.pdf","comment":"41 pages"},{"id":"http://arxiv.org/abs/2409.15506v1","updated":"2024-09-23T19:51:24Z","published":"2024-09-23T19:51:24Z","title":"Spectral Graph Theoretic Methods for Enhancing Network Robustness in\n Robot Localization","summary":" This paper addresses the optimization of edge-weighted networks by maximizing\nalgebraic connectivity to enhance network robustness. Motivated by the need for\nprecise robot position estimation in cooperative localization and pose-graph\nsparsification in Simultaneous Localization and Mapping (SLAM), the algebraic\nconnectivity maximization problem is formulated as a Mixed Integer\nSemi-Definite Program (MISDP), which is NP-hard. Leveraging spectral graph\ntheoretic methods, specifically Cheeger's inequality, this work introduces\nnovel \"Cheeger cuts\" to strengthen and efficiently solve medium-scale MISDPs.\nFurther, a new Mixed Integer Linear Program (MILP) is developed for efficiently\ncomputing Cheeger cuts, implemented within an outer-approximation algorithm for\nsolving the MISDP. A greedy k-opt heuristic is also presented, producing\nhigh-quality solutions that serve as valid lower bounds for Cheeger cuts.\nComprehensive numerical analyses demonstrate the efficacy of strengthened cuts\nvia substantial improvements in run times on synthetic and realistic robot\nlocalization datasets.\n","authors":["Neelkamal Somisetty","Harsha Nagarajan","Swaroop Darbha"],"pdf_url":"https://arxiv.org/pdf/2409.15506v1.pdf","comment":"63rd IEEE Conference on Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2409.15448v1","updated":"2024-09-23T18:23:14Z","published":"2024-09-23T18:23:14Z","title":"Optimization-based Verification of Discrete-time Control Barrier\n Functions: A Branch-and-Bound Approach","summary":" Discrete-time Control Barrier Functions (DTCBFs) form a powerful control\ntheoretic tool to guarantee safety and synthesize safe controllers for\ndiscrete-time dynamical systems. In this paper, we provide an\noptimization-based algorithm, inspired by the $\\alpha$BB algorithm, for the\nverification of a candidate DTCBF, i.e., either verifying a given candidate\nfunction as a valid DTCBF or falsifying it by providing a counterexample for a\ngeneral nonlinear discrete-time system with input constraints. This method is\napplicable whether a corresponding control policy is known or unknown. We apply\nour method to a numerical case study to illustrate its efficacy.\n","authors":["Erfan Shakhesi","W. P. M. H. Heemels","Alexander Katriniok"],"pdf_url":"https://arxiv.org/pdf/2409.15448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15267v1","updated":"2024-09-23T17:57:58Z","published":"2024-09-23T17:57:58Z","title":"Peer-to-Peer Learning Dynamics of Wide Neural Networks","summary":" Peer-to-peer learning is an increasingly popular framework that enables\nbeyond-5G distributed edge devices to collaboratively train deep neural\nnetworks in a privacy-preserving manner without the aid of a central server.\nNeural network training algorithms for emerging environments, e.g., smart\ncities, have many design considerations that are difficult to tune in\ndeployment settings -- such as neural network architectures and\nhyperparameters. This presents a critical need for characterizing the training\ndynamics of distributed optimization algorithms used to train highly nonconvex\nneural networks in peer-to-peer learning environments. In this work, we provide\nan explicit, non-asymptotic characterization of the learning dynamics of wide\nneural networks trained using popular distributed gradient descent (DGD)\nalgorithms. Our results leverage both recent advancements in neural tangent\nkernel (NTK) theory and extensive previous work on distributed learning and\nconsensus. We validate our analytical results by accurately predicting the\nparameter and error dynamics of wide neural networks trained for classification\ntasks.\n","authors":["Shreyas Chaudhari","Srinivasa Pranav","Emile Anand","José M. F. Moura"],"pdf_url":"https://arxiv.org/pdf/2409.15267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.08892v4","updated":"2024-09-23T17:07:38Z","published":"2022-07-18T19:06:18Z","title":"Distributed Differentiable Dynamic Game for Multi-robot Coordination","summary":" This paper develops a Distributed Differentiable Dynamic Game (D3G)\nframework, which can efficiently solve the forward and inverse problems in\nmulti-robot coordination. We formulate multi-robot coordination as a dynamic\ngame, where the behavior of a robot is dictated by its own dynamics and\nobjective that also depends on others' behavior. In the forward problem, D3G\nenables all robots collaboratively to seek the Nash equilibrium of the game in\na distributed manner, by developing a distributed shooting-based Nash solver.\nIn the inverse problem, where each robot aims to find (learn) its objective\n(and dynamics) parameters to mimic given coordination demonstrations, D3G\nproposes a differentiation solver based on Differential Pontryagin's Maximum\nPrinciple, which allows each robot to update its parameters in a distributed\nand coordinated manner. We test the D3G in simulation with two types of robots\ngiven different task configurations. The results demonstrate the effectiveness\nof D3G for solving both forward and inverse problems in comparison with\nexisting methods.\n","authors":["Yizhi Zhou","Wanxin Jin","Xuan Wang"],"pdf_url":"https://arxiv.org/pdf/2207.08892v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15105v1","updated":"2024-09-23T15:16:35Z","published":"2024-09-23T15:16:35Z","title":"SPformer: A Transformer Based DRL Decision Making Method for Connected\n Automated Vehicles","summary":" In mixed autonomy traffic environment, every decision made by an\nautonomous-driving car may have a great impact on the transportation system.\nBecause of the complex interaction between vehicles, it is challenging to make\ndecisions that can ensure both high traffic efficiency and safety now and\nfuther. Connected automated vehicles (CAVs) have great potential to improve the\nquality of decision-making in this continuous, highly dynamic and interactive\nenvironment because of their stronger sensing and communicating ability. For\nmulti-vehicle collaborative decision-making algorithms based on deep\nreinforcement learning (DRL), we need to represent the interactions between\nvehicles to obtain interactive features. The representation in this aspect\ndirectly affects the learning efficiency and the quality of the learned policy.\nTo this end, we propose a CAV decision-making architecture based on transformer\nand reinforcement learning algorithms. A learnable policy token is used as the\nlearning medium of the multi-vehicle joint policy, the states of all vehicles\nin the area of interest can be adaptively noticed in order to extract\ninteractive features among agents. We also design an intuitive physical\npositional encodings, the redundant location information of which optimizes the\nperformance of the network. Simulations show that our model can make good use\nof all the state information of vehicles in traffic scenario, so as to obtain\nhigh-quality driving decisions that meet efficiency and safety objectives. The\ncomparison shows that our method significantly improves existing DRL-based\nmulti-vehicle cooperative decision-making algorithms.\n","authors":["Ye Han","Lijun Zhang","Dejian Meng","Xingyu Hu","Yixia Lu"],"pdf_url":"https://arxiv.org/pdf/2409.15105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15066v1","updated":"2024-09-23T14:36:40Z","published":"2024-09-23T14:36:40Z","title":"A 3.5 GS/s 1-1 MASH VCO ADC With Second-Order Noise Shaping","summary":" In this work, a 3.5 GS/s voltage-controlled oscillator (VCO)\nanalog-to-digital converter (ADC) using multi-stage noise shaping (MASH) is\npresented. This 28nm CMOS ADC achieves second-order noise shaping in an\neasily-scalable, open-loop configuration. A key enabler of the high-bandwidth\nMASH VCO ADC is the use of a multi-bit estimated error signal. With an OSR of\n16, an SNDR of 67 dB and DR of 68 dB are achieved in 109.375 MHz bandwidth. The\nfull-custom pseudo-analog circuits consume 9 mW, while the automatically\ngenerated digital circuits consume another 24 mW. A $\\mathbf{FoM_{DR} = 163}$\ndB and core area of $\\mathbf{0.017\\,\\mathbf{mm}^2}$ are obtained.\n","authors":["Brendan Saux","Jonas Borgmans","Johan Raman","Pieter Rombouts"],"pdf_url":"https://arxiv.org/pdf/2409.15066v1.pdf","comment":"14 pages, 29 figures. Author's version. IEEE Transactions on Circuits\n and Systems I: Regular Papers"},{"id":"http://arxiv.org/abs/2409.15061v1","updated":"2024-09-23T14:35:23Z","published":"2024-09-23T14:35:23Z","title":"Cloud Deployment of Large-Scale Electromagnetic Transient Simulation --\n Discovery and Experiences","summary":" Electromagnetic Transient (EMT) simulation starts to play a critical role in\nmodern power system planning and operations due to large penetration of\ninverter based resources (IBRs). The EMT studies are computationally intensive\ndue to very small simulation time step and complex modeling of the protection\nand control of IBRs. It has been challenging for the traditional on-premises\ncomputing infrastructure to meet the ever-increasing computing needs of\nlarge-scale EMT studies. This paper shares experience of ISO New England\n(ISO-NE) on a pilot deployment of EMT simulation in a public cloud using Amazon\nWeb Services. The platform can successfully meet the large-scale EMT simulation\ncomputation needs in a cost-effective way while meeting cyber security and data\nprivacy requirements.\n","authors":["Xiaochuan Luo","Jason Ploof","Xinghao Fang","Qiang Zhang","Song Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.15061v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.15047v1","updated":"2024-09-23T14:21:16Z","published":"2024-09-23T14:21:16Z","title":"Towards a General Market for Cloud-Edge-IoT Continuum","summary":" Recent years have witnessed the proposals aiming at enabling Vertical,\ntwo-sided markets with a Single Marketplace (or exchange) (VSMs) for computing\nand data resources/services (products) offerings in a multi-cloud and\ncrowdsourced IoT-edge sensing environment. A VSM is designed vertically from\nbottom up with a broker being a built-in component of the marketplace. While\npreventing seller lock-in and improving efficiency and availability, a VSM\nsuffers from a key weakness from a buyer's perspective, i.e., the broker and\nthe corresponding marketplace lock-in, which may lead to suboptimal shopping\nexperience for buyers, due to marketplace monopoly by the broker and limited\nchoice of products in the marketplace. In this position paper, we argue that a\nHorizontal two-sided market with Multiple Marketplaces (HMM), resembling the\nglobal stock market, should be developed. In an HMM, different marketplaces may\nbe operated by different parties and sell similar and/or different types of\nproducts, e.g., computing and/or sensory data products. A broker is no longer a\nbuilt-in component of any given marketplace. Instead, it may cover multiple\nmarketplaces at the same time and there can be more than one broker in the HMM.\nBoth the number and types of marketplaces and brokers may grow independently or\nscale horizontally to meet the growing demand. A buyer shops for a broker\nthrough whom the buyer gains access to the needed products sold in the\nmarketplace(s) the broker covers and from whom the buyer receives various\npossible services, e.g., discount, value-added, or full services. An HMM not\nonly overcomes the key weakness of a VSM but also allows the market to grow\nincrementally and organically. Finally, two example use cases are given to\nillustrate the benefits of HMM.\n","authors":["Hao Che","Hong Jiang","Zhijun Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15047v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.14902v1","updated":"2024-09-23T10:55:49Z","published":"2024-09-23T10:55:49Z","title":"A Contract Theory for Layered Control Architectures","summary":" Autonomous systems typically leverage layered control architectures with a\ncombination of discrete and continuous models operating at different\ntimescales. As a result, layered systems form a new class of hybrid systems\ncomposed of systems operating on a diverse set of continuous and discrete\nsignals. This paper formalizes the notion of a layered (hierarchical) control\narchitecture through a theory of relations between its layers. This theory\nenables us to formulate contracts within layered control systems -- these\ndefine interfaces between layers and isolate the design of each layer,\nguaranteeing that composition of contracts at each layer results in a contract\ncapturing the desired system-wide specification. Thus, the proposed theory\nyields the ability to analyze layered control architectures via a compositional\napproach.\n","authors":["Manuel Mazo Jr.","Will Compton","Max H. Cohen","Aaron D. Ames"],"pdf_url":"https://arxiv.org/pdf/2409.14902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14873v1","updated":"2024-09-23T10:11:42Z","published":"2024-09-23T10:11:42Z","title":"Optimal state estimation: Turnpike analysis and performance results","summary":" In this paper, we introduce turnpike arguments in the context of optimal\nstate estimation. In particular, we show that the optimal solution of the state\nestimation problem involving all available past data serves as turnpike for the\nsolutions of truncated problems involving only a subset of the data. We\nconsider two different mathematical characterizations of this phenomenon and\nprovide corresponding sufficient conditions that rely on strict dissipativity\nand decaying sensitivity. As second contribution, we show how a specific\nturnpike property can be used to establish performance guarantees when\napproximating the optimal solution of the full problem by a sequence of\ntruncated problems, and we show that the resulting performance (both averaged\nand non-averaged) is approximately optimal with error terms that can be made\narbitrarily small by an appropriate choice of the horizon length. In addition,\nwe discuss interesting implications of these results for the practically\nrelevant case of moving horizon estimation and illustrate our results with a\nnumerical example.\n","authors":["Julian D. Schiller","Lars Grüne","Matthias A. Müller"],"pdf_url":"https://arxiv.org/pdf/2409.14873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14821v1","updated":"2024-09-23T08:54:05Z","published":"2024-09-23T08:54:05Z","title":"Towards Real-world Deployment of NILM Systems: Challenges and Practices","summary":" Non-intrusive load monitoring (NILM), as a key load monitoring technology,\ncan much reduce the deployment cost of traditional power sensors. Previous\nresearch has largely focused on developing cloud-exclusive NILM algorithms,\nwhich often result in high computation costs and significant service delays. To\naddress these issues, we propose a three-tier framework to enhance the\nreal-world applicability of NILM systems through edge-cloud collaboration.\nConsidering the computational resources available at both the edge and cloud,\nwe implement a lightweight NILM model at the edge and a deep learning based\nmodel at the cloud, respectively. In addition to the differential model\nimplementations, we also design a NILM-specific deployment scheme that\nintegrates Gunicorn and NGINX to bridge the gap between theoretical algorithms\nand practical applications. To verify the effectiveness of the proposed\nframework, we apply real-world NILM scenario settings and implement the entire\nprocess of data acquisition, model training, and system deployment. The results\ndemonstrate that our framework can achieve high decomposition accuracy while\nsignificantly reducing the cloud workload and communication overhead under\npractical considerations.\n","authors":["Junyu Xue","Yu Zhang","Xudong Wang","Yi Wang","Guoming Tang"],"pdf_url":"https://arxiv.org/pdf/2409.14821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14739v1","updated":"2024-09-23T06:35:19Z","published":"2024-09-23T06:35:19Z","title":"AmpAgent: An LLM-based Multi-Agent System for Multi-stage Amplifier\n Schematic Design from Literature for Process and Performance Porting","summary":" Multi-stage amplifiers are widely applied in analog circuits. However, their\nlarge number of components, complex transfer functions, and intricate pole-zero\ndistributions necessitate extensive manpower for derivation and param sizing to\nensure their stability. In order to achieve efficient derivation of the\ntransfer function and simplify the difficulty of circuit design, we propose\nAmpAgent: a multi-agent system based on large language models (LLMs) for\nefficiently designing such complex amplifiers from literature with process and\nperformance porting. AmpAgent is composed of three agents: Literature Analysis\nAgent, Mathematics Reasoning Agent and Device Sizing Agent. They are separately\nresponsible for retrieving key information (e.g. formulas and transfer\nfunctions) from the literature, decompose the whole circuit's design problem by\nderiving the key formulas, and address the decomposed problem iteratively.\n AmpAgent was employed in the schematic design of seven types of multi-stage\namplifiers with different compensation techniques. In terms of design\nefficiency, AmpAgent has reduced the number of iterations by 1.32$ \\sim\n$4${\\times}$ and execution time by 1.19$ \\sim $2.99${\\times}$ compared to\nconventional optimization algorithms, with a success rate increased by 1.03$\n\\sim $6.79${\\times}$. In terms of circuit performance, it has improved by 1.63$\n\\sim $27.25${\\times}$ compared to the original literature. The findings suggest\nthat LLMs could play a crucial role in the field of complex analog circuit\nschematic design, as well as process and performance porting.\n","authors":["Chengjie Liu","Weiyu Chen","Anlan Peng","Yuan Du","Li Du","Jun Yang"],"pdf_url":"https://arxiv.org/pdf/2409.14739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12521v2","updated":"2024-09-23T06:04:36Z","published":"2024-09-19T07:24:12Z","title":"GraspSAM: When Segment Anything Model Meets Grasp Detection","summary":" Grasp detection requires flexibility to handle objects of various shapes\nwithout relying on prior knowledge of the object, while also offering\nintuitive, user-guided control. This paper introduces GraspSAM, an innovative\nextension of the Segment Anything Model (SAM), designed for prompt-driven and\ncategory-agnostic grasp detection. Unlike previous methods, which are often\nlimited by small-scale training data, GraspSAM leverages the large-scale\ntraining and prompt-based segmentation capabilities of SAM to efficiently\nsupport both target-object and category-agnostic grasping. By utilizing\nadapters, learnable token embeddings, and a lightweight modified decoder,\nGraspSAM requires minimal fine-tuning to integrate object segmentation and\ngrasp prediction into a unified framework. The model achieves state-of-the-art\n(SOTA) performance across multiple datasets, including Jacquard,\nGrasp-Anything, and Grasp-Anything++. Extensive experiments demonstrate the\nflexibility of GraspSAM in handling different types of prompts (such as points,\nboxes, and language), highlighting its robustness and effectiveness in\nreal-world robotic applications.\n","authors":["Sangjun Noh","Jongwon Kim","Dongwoo Nam","Seunghyeok Back","Raeyoung Kang","Kyoobin Lee"],"pdf_url":"https://arxiv.org/pdf/2409.12521v2.pdf","comment":"6 pages (main), 1 page (references)"},{"id":"http://arxiv.org/abs/2409.09978v2","updated":"2024-09-23T04:50:12Z","published":"2024-09-16T04:15:36Z","title":"Context-Conditioned Spatio-Temporal Predictive Learning for Reliable V2V\n Channel Prediction","summary":" Achieving reliable multidimensional Vehicle-to-Vehicle (V2V) channel state\ninformation (CSI) prediction is both challenging and crucial for optimizing\ndownstream tasks that depend on instantaneous CSI. This work extends\ntraditional prediction approaches by focusing on four-dimensional (4D) CSI,\nwhich includes predictions over time, bandwidth, and antenna (TX and RX) space.\nSuch a comprehensive framework is essential for addressing the dynamic nature\nof mobility environments within intelligent transportation systems,\nnecessitating the capture of both temporal and spatial dependencies across\ndiverse domains. To address this complexity, we propose a novel\ncontext-conditioned spatiotemporal predictive learning method. This method\nleverages causal convolutional long short-term memory (CA-ConvLSTM) to\neffectively capture dependencies within 4D CSI data, and incorporates\ncontext-conditioned attention mechanisms to enhance the efficiency of\nspatiotemporal memory updates. Additionally, we introduce an adaptive\nmeta-learning scheme tailored for recurrent networks to mitigate the issue of\naccumulative prediction errors. We validate the proposed method through\nempirical studies conducted across three different geometric configurations and\nmobility scenarios. Our results demonstrate that the proposed approach\noutperforms existing state-of-the-art predictive models, achieving superior\nperformance across various geometries. Moreover, we show that the meta-learning\nframework significantly enhances the performance of recurrent-based predictive\nmodels in highly challenging cross-geometry settings, thus highlighting its\nrobustness and adaptability.\n","authors":["Lei Chu","Daoud Burghal","Rui Wang","Michael Neuman","Andreas F. Molisch"],"pdf_url":"https://arxiv.org/pdf/2409.09978v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14688v1","updated":"2024-09-23T03:21:44Z","published":"2024-09-23T03:21:44Z","title":"A Generalized Control Revision Method for Autonomous Driving Safety","summary":" Safety is one of the most crucial challenges of autonomous driving vehicles,\nand one solution to guarantee safety is to employ an additional control\nrevision module after the planning backbone. Control Barrier Function (CBF) has\nbeen widely used because of its strong mathematical foundation on safety.\nHowever, the incompatibility with heterogeneous perception data and incomplete\nconsideration of traffic scene elements make existing systems hard to be\napplied in dynamic and complex real-world scenarios. In this study, we\nintroduce a generalized control revision method for autonomous driving safety,\nwhich adopts both vectorized perception and occupancy grid map as inputs and\ncomprehensively models multiple types of traffic scene constraints based on a\nnew proposed barrier function. Traffic elements are integrated into one unified\nframework, decoupled from specific scenario settings or rules. Experiments on\nCARLA, SUMO, and OnSite simulator prove that the proposed algorithm could\nrealize safe control revision under complicated scenes, adapting to various\nplanning backbones, road topologies, and risk types. Physical platform\nvalidation also verifies the real-world application feasibility.\n","authors":["Zehang Zhu","Yuning Wang","Tianqi Ke","Zeyu Han","Shaobing Xu","Qing Xu","John M. Dolan","Jianqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.14688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14675v1","updated":"2024-09-23T02:50:51Z","published":"2024-09-23T02:50:51Z","title":"Maintaining Strong $r$-Robustness in Reconfigurable Multi-Robot Networks\n using Control Barrier Functions","summary":" In leader-follower consensus, strong $r$-robustness of the communication\ngraph provides a sufficient condition for followers to achieve consensus in the\npresence of misbehaving agents. Previous studies have assumed that robots can\nform and/or switch between predetermined network topologies with known\nrobustness properties. However, robots with distance-based communication models\nmay not be able to achieve these topologies while moving through spatially\nconstrained environments, such as narrow corridors, to complete their\nobjectives. This paper introduces a Control Barrier Function (CBF) that ensures\nrobots maintain strong $r$-robustness of their communication graph above a\ncertain threshold without maintaining any fixed topologies. Our CBF directly\naddresses robustness, allowing robots to have flexible reconfigurable network\nstructure while navigating to achieve their objectives. The efficacy of our\nmethod is tested through various simulation and hardware experiments.\n","authors":["Haejoon Lee","Dimitra Panagou"],"pdf_url":"https://arxiv.org/pdf/2409.14675v1.pdf","comment":"Submitted to IEEE International Conference on Robotics and Automation\n (ICRA) 2025"},{"id":"http://arxiv.org/abs/2409.14639v1","updated":"2024-09-23T00:49:57Z","published":"2024-09-23T00:49:57Z","title":"Impedance Control for Manipulators Handling Heavy Payloads","summary":" Attaching a heavy payload to the wrist force/moment (F/M) sensor of a\nmanipulator can cause conventional impedance controllers to fail in\nestablishing the desired impedance due to the presence of non-contact forces;\nnamely, the inertial and gravitational forces of the payload. This paper\npresents an impedance control scheme designed to accurately shape the\nforce-response of such a manipulator without requiring acceleration\nmeasurements. As a result, neither wrist accelerometers nor dynamic estimators\nfor compensating inertial load forces are necessary. The proposed controller\nemploys an inner-outer loop feedback structure, which not only addresses\nuncertainties in the robot's dynamics but also enables the specification of a\ngeneral target impedance model, including nonlinear models. Stability and\nconvergence of the controller are analytically proven, with results showing\nthat the control input remains bounded as long as the desired inertia differs\nfrom the payload inertia. Experimental results confirm that the proposed\nimpedance controller effectively shapes the impedance of a manipulator carrying\na heavy load according to the desired impedance model.\n","authors":["Farhad Aghili"],"pdf_url":"https://arxiv.org/pdf/2409.14639v1.pdf","comment":null}]},"2024-09-22T00:00:00Z":{"Systems and Control":[{"id":"http://arxiv.org/abs/2409.14616v1","updated":"2024-09-22T22:50:06Z","published":"2024-09-22T22:50:06Z","title":"Learning to Refine Input Constrained Control Barrier Functions via\n Uncertainty-Aware Online Parameter Adaptation","summary":" Control Barrier Functions (CBFs) have become powerful tools for ensuring\nsafety in nonlinear systems. However, finding valid CBFs that guarantee\npersistent safety and feasibility remains an open challenge, especially in\nsystems with input constraints. Traditional approaches often rely on manually\ntuning the parameters of the class K functions of the CBF conditions a priori.\nThe performance of CBF-based controllers is highly sensitive to these fixed\nparameters, potentially leading to overly conservative behavior or safety\nviolations. To overcome these issues, this paper introduces a learning-based\noptimal control framework for online adaptation of Input Constrained CBF\n(ICCBF) parameters in discrete-time nonlinear systems. Our method employs a\nprobabilistic ensemble neural network to predict the performance and risk\nmetrics, as defined in this work, for candidate parameters, accounting for both\nepistemic and aleatoric uncertainties. We propose a two-step verification\nprocess using Jensen-Renyi Divergence and distributionally-robust Conditional\nValue at Risk to identify valid parameters. This enables dynamic refinement of\nICCBF parameters based on current state and nearby environments, optimizing\nperformance while ensuring safety within the verified parameter set.\nExperimental results demonstrate that our method outperforms both\nfixed-parameter and existing adaptive methods in robot navigation scenarios\nacross safety and performance metrics.\n","authors":["Taekyung Kim","Robin Inho Kee","Dimitra Panagou"],"pdf_url":"https://arxiv.org/pdf/2409.14616v1.pdf","comment":"Project page: https://www.taekyung.me/online-adaptive-cbf"},{"id":"http://arxiv.org/abs/2103.10952v2","updated":"2024-09-22T22:44:48Z","published":"2021-03-19T17:59:37Z","title":"Asymmetry underlies stability in power grids","summary":" Behavioral homogeneity is often critical for the functioning of network\nsystems of interacting entities. In power grids, whose stable operation\nrequires generator frequencies to be synchronized--and thus homogeneous--across\nthe network, previous work suggests that the stability of synchronous states\ncan be improved by making the generators homogeneous. Here, we show that a\nsubstantial additional improvement is possible by instead making the generators\nsuitably heterogeneous. We develop a general method for attributing this\ncounterintuitive effect to converse symmetry breaking, a recently established\nphenomenon in which the system must be asymmetric to maintain a stable\nsymmetric state. These findings constitute the first demonstration of converse\nsymmetry breaking in real-world systems, and our method promises to enable\nidentification of this phenomenon in other networks whose functions rely on\nbehavioral homogeneity.\n","authors":["Ferenc Molnar","Takashi Nishikawa","Adilson E. Motter"],"pdf_url":"https://arxiv.org/pdf/2103.10952v2.pdf","comment":"Updated to correct the damping parameters in Fig. 1 and its caption,\n which were inadvertently over-rounded in the original version. The published\n version of the Article has also been updated with this correction"},{"id":"http://arxiv.org/abs/2409.09161v2","updated":"2024-09-22T20:09:00Z","published":"2024-09-13T19:43:16Z","title":"Train-On-Request: An On-Device Continual Learning Workflow for Adaptive\n Real-World Brain Machine Interfaces","summary":" Brain-machine interfaces (BMIs) are expanding beyond clinical settings thanks\nto advances in hardware and algorithms. However, they still face challenges in\nuser-friendliness and signal variability. Classification models need periodic\nadaptation for real-life use, making an optimal re-training strategy essential\nto maximize user acceptance and maintain high performance. We propose TOR, a\ntrain-on-request workflow that enables user-specific model adaptation to novel\nconditions, addressing signal variability over time. Using continual learning,\nTOR preserves knowledge across sessions and mitigates inter-session\nvariability. With TOR, users can refine, on demand, the model through on-device\nlearning (ODL) to enhance accuracy adapting to changing conditions. We evaluate\nthe proposed methodology on a motor-movement dataset recorded with a\nnon-stigmatizing wearable BMI headband, achieving up to 92% accuracy and a\nre-calibration time as low as 1.6 minutes, a 46% reduction compared to a naive\ntransfer learning workflow. We additionally demonstrate that TOR is suitable\nfor ODL in extreme edge settings by deploying the training procedure on a\nRISC-V ultra-low-power SoC (GAP9), resulting in 21.6 ms of latency and 1 mJ of\nenergy consumption per training step. To the best of our knowledge, this work\nis the first demonstration of an online, energy-efficient, dynamic adaptation\nof a BMI model to the intrinsic variability of EEG signals in real-time\nsettings.\n","authors":["Lan Mei","Cristian Cioflan","Thorir Mar Ingolfsson","Victor Kartsch","Andrea Cossettini","Xiaying Wang","Luca Benini"],"pdf_url":"https://arxiv.org/pdf/2409.09161v2.pdf","comment":"5 pages, 6 figures, to be published in 2024 IEEE Biomedical Circuits\n and Systems Conference (BioCAS)"},{"id":"http://arxiv.org/abs/2409.14575v1","updated":"2024-09-22T19:39:53Z","published":"2024-09-22T19:39:53Z","title":"Domain knowledge-guided machine learning framework for state of health\n estimation in Lithium-ion batteries","summary":" Accurate estimation of battery state of health is crucial for effective\nelectric vehicle battery management. Here, we propose five health indicators\nthat can be extracted online from real-world electric vehicle operation and\ndevelop a machine learning-based method to estimate the battery state of\nhealth. The proposed indicators provide physical insights into the energy and\npower fade of the battery and enable accurate capacity estimation even with\npartially missing data. Moreover, they can be computed for portions of the\ncharging profile and real-world driving discharging conditions, facilitating\nreal-time battery degradation estimation. The indicators are computed using\nexperimental data from five cells aged under electric vehicle conditions, and a\nlinear regression model is used to estimate the state of health. The results\nshow that models trained with power autocorrelation and energy-based features\nachieve capacity estimation with maximum absolute percentage error within 1.5%\nto 2.5% .\n","authors":["Andrea Lanubile","Pietro Bosoni","Gabriele Pozzato","Anirudh Allam","Matteo Acquarone","Simona Onori"],"pdf_url":"https://arxiv.org/pdf/2409.14575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12317v2","updated":"2024-09-22T19:29:42Z","published":"2024-09-18T21:11:40Z","title":"Excitable Nonlinear Opinion Dynamics (E-NOD) for Agile Decision-Making","summary":" We present Excitable Nonlinear Opinion Dynamics (E-NOD), which describe\nopinion-forming and decision-making behavior with superior \"agility\" in\nresponding and adapting to fast and unpredictable changes in context,\nenvironment, or information about available options. E-NOD is derived by\nintroducing a single extra term to the previously presented Nonlinear Opinion\nDynamics (NOD), which have been shown to enable fast and flexible multi-agent\nbehavior. This extra term is inspired by the fast-positive, slow-negative\nmixed-feedback structure of excitable systems. The agile behaviors resulting\nfrom the excitable nature of decision-making driven by E-NOD are analyzed in a\ngeneral setting and illustrated through an application to robot navigation\naround human movers.\n","authors":["Charlotte Cathcart","Ian Xul Belaustegui","Alessio Franci","Naomi Ehrich Leonard"],"pdf_url":"https://arxiv.org/pdf/2409.12317v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2304.12458v2","updated":"2024-09-22T17:10:12Z","published":"2023-04-24T21:29:41Z","title":"Model-Free Learning and Optimal Policy Design in Multi-Agent MDPs Under\n Probabilistic Agent Dropout","summary":" This work studies a multi-agent Markov decision process (MDP) that can\nundergo agent dropout and the computation of policies for the post-dropout\nsystem based on control and sampling of the pre-dropout system. The central\nplanner's objective is to find an optimal policy that maximizes the value of\nthe expected system given a priori knowledge of the agents' dropout\nprobabilities. For MDPs with a certain transition independence and reward\nseparability structure, we assume that removing agents from the system forms a\nnew MDP comprised of the remaining agents with new state and action spaces,\ntransition dynamics that marginalize the removed agents, and rewards that are\nindependent of the removed agents. We first show that under these assumptions,\nthe value of the expected post-dropout system can be represented by a single\nMDP; this \"robust MDP\" eliminates the need to evaluate all $2^N$ realizations\nof the system, where N denotes the number of agents. More significantly, in a\nmodel-free context, it is shown that the robust MDP value can be estimated with\nsamples generated by the pre-dropout system, meaning that robust policies can\nbe found before dropout occurs. This fact is used to propose a policy\nimportance sampling (IS) routine that performs policy evaluation for dropout\nscenarios while controlling the existing system with good pre-dropout policies.\nThe policy IS routine produces value estimates for both the robust MDP and\nspecific post-dropout system realizations and is justified with exponential\nconfidence bounds. Finally, the utility of this approach is verified in\nsimulation, showing how structural properties of agent dropout can help a\ncontroller find good post-dropout policies before dropout occurs.\n","authors":["Carmel Fiscko","Soummya Kar","Bruno Sinopoli"],"pdf_url":"https://arxiv.org/pdf/2304.12458v2.pdf","comment":"22 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.14534v1","updated":"2024-09-22T17:07:59Z","published":"2024-09-22T17:07:59Z","title":"Goal-Oriented Communications for Interplanetary and Non-Terrestrial\n Networks","summary":" The accelerated pace of space exploration and satellite connectivity calls\nfor scalable communication network architectures that can effectively cater for\nincreasing numbers of bursty flows, such as those occurring in remote\nmonitoring and actuation. Communications in Space face unique challenges\nincluding highly variable delays and disruptions that sometimes preclude\nreal-time signaling and end-to-end acknowledgements. In this paper we provide a\nvision for tackling these fundamental challenges by exploiting recent progress\nin goal-oriented communication. Our vision for Goal-Oriented Networking in\nSpace is built on three pillars: (1) principles and decision metrics for\ngoal-oriented sampling and multi-user scheduling, that can handle highly\nvariable delay processes that contain memory, (2) grant-free access policies\nfor massive machine-type communications that replace exogenous arrivals with\ngoal-oriented traffic shaping, and (3) flow control mechanisms that exploit the\ncross-layer operability at application and link layers of Delay/Disruption\nTolerant Networking (DTN) protocols.\n","authors":["Elif Uysal"],"pdf_url":"https://arxiv.org/pdf/2409.14534v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.14532v1","updated":"2024-09-22T16:58:06Z","published":"2024-09-22T16:58:06Z","title":"Distributed Primal-Dual Interior Point Framework for Analyzing\n Infeasible Combined Transmission and Distribution Grid Networks","summary":" The proliferation of distributed energy resources has heightened the\ninteractions between transmission and distribution (T&D) systems, necessitating\nnovel analyses for the reliable operation and planning of interconnected T&D\nnetworks. A critical gap is an analysis approach that identifies and localizes\nthe weak spots in the combined T\\&D networks, providing valuable information to\nsystem planners and operators. The research goal is to efficiently model and\nsimulate infeasible (i.e. unsolvable in general settings) combined positive\nsequence transmission and three-phase distribution networks with a unified\nsolution algorithm. We model the combined T&D network with the equivalent\ncircuit formulation. To solve the overall T&D network, we build a\nGauss-Jacobi-Newton (GJN) based distributed primal dual interior point\noptimization algorithm capable of isolating weak nodes. We validate the\napproach on large combined T&D networks with 70k+ T and 15k+ D nodes and\ndemonstrate performance improvement over the alternating direction method of\nmultipliers (ADMM) method.\n","authors":["Muhammad Hamza Ali","Amritanshu Pandey"],"pdf_url":"https://arxiv.org/pdf/2409.14532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14499v1","updated":"2024-09-22T15:52:47Z","published":"2024-09-22T15:52:47Z","title":"A Review of Scalable and Privacy-Preserving Multi-Agent Frameworks for\n Distributed Energy Resource Control","summary":" Distributed energy resources (DERs) are gaining prominence due to their\nadvantages in improving energy efficiency, reducing carbon emissions, and\nenhancing grid resilience. Despite the increasing deployment, the potential of\nDERs has yet to be fully explored and exploited. A fundamental question\nrestrains the management of numerous DERs in large-scale power systems, \"How\nshould DER data be securely processed and DER operations be efficiently\noptimized?\" To address this question, this paper considers two critical issues,\nnamely privacy for processing DER data and scalability in optimizing DER\noperations, then surveys existing and emerging solutions from a multi-agent\nframework perspective. In the context of scalability, this paper reviews\nstate-of-the-art research that relies on parallel control, optimization, and\nlearning within distributed and/or decentralized information exchange\nstructures, while in the context of privacy, it identifies privacy preservation\nmeasures that can be synthesized into the aforementioned scalable structures.\nDespite research advances in these areas, challenges remain because these\nhighly interdisciplinary studies blend a wide variety of scalable computing\narchitectures and privacy preservation techniques from different fields, making\nthem difficult to adapt in practice. To mitigate this issue, this paper\nprovides a holistic review of trending strategies that orchestrate privacy and\nscalability for large-scale power system operations from a multi-agent\nperspective, particularly for DER control problems. Furthermore, this review\nextrapolates new approaches for future scalable, privacy-aware, and cybersecure\npathways to unlock the full potential of DERs through controlling, optimizing,\nand learning generic multi-agent-based cyber-physical systems.\n","authors":["Xiang Huo","Hao Huang","Katherine R. Davis","H. Vincent Poor","Mingxi Liu"],"pdf_url":"https://arxiv.org/pdf/2409.14499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14399v3","updated":"2024-09-22T15:31:03Z","published":"2022-09-28T19:49:39Z","title":"FIRE: A Failure-Adaptive Reinforcement Learning Framework for Edge\n Computing Migrations","summary":" In edge computing, users' service profiles are migrated due to user mobility.\nReinforcement learning (RL) frameworks have been proposed to do so, often\ntrained on simulated data. However, existing RL frameworks overlook occasional\nserver failures, which although rare, impact latency-sensitive applications\nlike autonomous driving and real-time obstacle detection. Nevertheless, these\nfailures (rare events), being not adequately represented in historical training\ndata, pose a challenge for data-driven RL algorithms. As it is impractical to\nadjust failure frequency in real-world applications for training, we introduce\nFIRE, a framework that adapts to rare events by training a RL policy in an edge\ncomputing digital twin environment. We propose ImRE, an importance\nsampling-based Q-learning algorithm, which samples rare events proportionally\nto their impact on the value function. FIRE considers delay, migration,\nfailure, and backup placement costs across individual and shared service\nprofiles. We prove ImRE's boundedness and convergence to optimality. Next, we\nintroduce novel deep Q-learning (ImDQL) and actor critic (ImACRE) versions of\nour algorithm to enhance scalability. We extend our framework to accommodate\nusers with varying risk tolerances. Through trace driven experiments, we show\nthat FIRE reduces costs compared to vanilla RL and the greedy baseline in the\nevent of failures.\n","authors":["Marie Siew","Shikhar Sharma","Zekai Li","Kun Guo","Chao Xu","Tania Lorido-Botran","Tony Q. S. Quek","Carlee Joe-Wong"],"pdf_url":"https://arxiv.org/pdf/2209.14399v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14454v1","updated":"2024-09-22T14:07:10Z","published":"2024-09-22T14:07:10Z","title":"A Unified Approach for Learning the Dynamics of Power System Generators\n and Inverter-based Resources","summary":" The growing prevalence of inverter-based resources (IBRs) for renewable\nenergy integration and electrification greatly challenges power system dynamic\nanalysis. To account for both synchronous generators (SGs) and IBRs, this work\npresents an approach for learning the model of an individual dynamic component.\nThe recurrent neural network (RNN) model is used to match the recursive\nstructure in predicting the key dynamical states of a component from its\nterminal bus voltage and set-point input. To deal with the fast transients\nespecially due to IBRs, we develop a Stable Integral (SI-)RNN to mimic\nhigh-order integral methods that can enhance the stability and accuracy for the\ndynamic learning task. We demonstrate that the proposed SI-RNN model not only\ncan successfully predict the component's dynamic behaviors, but also offers the\npossibility of efficiently computing the dynamic sensitivity relative to a\nset-point change. These capabilities have been numerically validated based on\nfull-order Electromagnetic Transient (EMT) simulations on a small test system\nwith both SGs and IBRs, particularly for predicting the dynamics of\ngrid-forming inverters.\n","authors":["Shaohui Liu","Weiqian Cai","Hao Zhu","Brian Johnson"],"pdf_url":"https://arxiv.org/pdf/2409.14454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14369v1","updated":"2024-09-22T09:04:35Z","published":"2024-09-22T09:04:35Z","title":"Few-Shot Testing of Autonomous Vehicles with Scenario Similarity\n Learning","summary":" Testing and evaluation are critical to the development and deployment of\nautonomous vehicles (AVs). Given the rarity of safety-critical events such as\ncrashes, millions of tests are typically needed to accurately assess AV safety\nperformance. Although techniques like importance sampling can accelerate this\nprocess, it usually still requires too many numbers of tests for field testing.\nThis severely hinders the testing and evaluation process, especially for\nthird-party testers and governmental bodies with very limited testing budgets.\nThe rapid development cycles of AV technology further exacerbate this\nchallenge. To fill this research gap, this paper introduces the few-shot\ntesting (FST) problem and proposes a methodological framework to tackle it. As\nthe testing budget is very limited, usually smaller than 100, the FST method\ntransforms the testing scenario generation problem from probabilistic sampling\nto deterministic optimization, reducing the uncertainty of testing results. To\noptimize the selection of testing scenarios, a cross-attention similarity\nmechanism is proposed to learn to extract the information of AV's testing\nscenario space. This allows iterative searches for scenarios with the smallest\nevaluation error, ensuring precise testing within budget constraints.\nExperimental results in cut-in scenarios demonstrate the effectiveness of the\nFST method, significantly enhancing accuracy and enabling efficient, precise AV\ntesting.\n","authors":["Shu Li","Honglin He","Jingxuan Yang","Jianming Hu","Yi Zhang","Shuo Feng"],"pdf_url":"https://arxiv.org/pdf/2409.14369v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.14366v1","updated":"2024-09-22T08:54:56Z","published":"2024-09-22T08:54:56Z","title":"Robust Data-Driven Tube-Based Zonotopic Predictive Control with\n Closed-Loop Guarantees","summary":" This work proposes a robust data-driven tube-based zonotopic predictive\ncontrol (TZPC) approach for discrete-time linear systems, designed to ensure\nstability and recursive feasibility in the presence of bounded noise. The\nproposed approach consists of two phases. In an initial learning phase, we\nprovide an over-approximation of all models consistent with past input and\nnoisy state data using zonotope properties. Subsequently, in a control phase,\nwe formulate an optimization problem, which by integrating terminal ingredients\nis proven to be recursively feasible. Moreover, we prove that implementing this\ndata-driven predictive control approach guarantees robust exponential stability\nof the closed-loop system. The effectiveness and competitive performance of the\nproposed control strategy, compared to recent data-driven predictive control\nmethods, are illustrated through numerical simulations.\n","authors":["Mahsa Farjadnia","Angela Fontan","Amr Alanwar","Marco Molinari","Karl Henrik Johansson"],"pdf_url":"https://arxiv.org/pdf/2409.14366v1.pdf","comment":"Accepted for presentation and publication at the 63rd IEEE Conference\n on Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2409.14342v1","updated":"2024-09-22T07:01:18Z","published":"2024-09-22T07:01:18Z","title":"Adapting Gait Frequency for Posture-regulating Humanoid Push-recovery\n via Hierarchical Model Predictive Control","summary":" Current humanoid push-recovery strategies often use whole-body motion, yet\nposture regulation is often overlooked. For instance, during manipulation\ntasks, the upper body may need to stay upright and have minimal recovery\ndisplacement. This paper introduces a novel approach to enhancing humanoid\npush-recovery performance under unknown disturbances and regulating body\nposture by tailoring the recovery stepping strategy. We propose a\nhierarchical-MPC-based scheme that analyzes and detects instability in the\nprediction window and quickly recovers through adapting gait frequency. Our\napproach integrates a high-level nonlinear MPC, a posture-aware gait frequency\nadaptation planner, and a low-level convex locomotion MPC. The planners predict\nthe center of mass (CoM) state trajectories that can be assessed for precursors\nof potential instability and posture deviation. In simulation, we demonstrate\nimproved maximum recoverable impulse by 131% on average compared with baseline\napproaches. In hardware experiments, a 125 ms advancement in recovery stepping\ntiming/reflex has been observed with the proposed approach, We also demonstrate\nimproved push-recovery performance and minimized attitude change under 0.2 rad.\n","authors":["Junheng Li","Zhanhao Le","Junchao Ma","Quan Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.14342v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.11807v2","updated":"2024-09-22T06:44:20Z","published":"2024-04-18T00:02:18Z","title":"Continuous Dynamic Bipedal Jumping via Real-time Variable-model\n Optimization","summary":" Dynamic and continuous jumping remains an open yet challenging problem in\nbipedal robot control. Real-time planning with full body dynamics over the\nentire jumping trajectory presents unsolved challenges in computation burden.\nIn this paper, we propose a novel variable-model optimization approach, a\nunified framework of variable-model trajectory optimization (TO) and\nvariable-frequency Model Predictive Control (MPC), to effectively realize\ncontinuous and robust jumping planning and control on HECTOR bipedal robot in\nreal-time. The proposed TO fuses variable-fidelity dynamics modeling of bipedal\njumping motion in different jumping phases to balance trajectory accuracy and\nreal-time computation efficiency. In addition, conventional fixed-frequency\ncontrol approaches suffer from unsynchronized sampling frequencies, leading to\nmismatched modeling resolutions. We address this by aligning the MPC sampling\nfrequency with the variable-model TO trajectory resolutions across different\nphases. In hardware experiments, we have demonstrated robust and dynamic jumps\ncovering a distance of up to 40 cm (57% of robot height). To verify the\nrepeatability of this experiment, we run 53 jumping experiments and achieve 90%\nsuccess rate. In continuous jumps, we demonstrate continuous bipedal jumping\nwith terrain height perturbations (up to 5 cm) and discontinuities (up to 20 cm\ngap).\n","authors":["Junheng Li","Omar Kolt","Quan Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.11807v2.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.08665v3","updated":"2024-09-22T05:51:54Z","published":"2024-09-13T09:27:34Z","title":"Agile Decision-Making and Safety-Critical Motion Planning for Emergency\n Autonomous Vehicles","summary":" Efficiency is critical for autonomous vehicles (AVs), especially for\nemergency AVs. However, most existing methods focus on regular vehicles,\noverlooking the distinct strategies required by emergency vehicles to address\nthe challenge of maximizing efficiency while ensuring safety. In this paper, we\npropose an Integrated Agile Decision-Making with Active and Safety-Critical\nMotion Planning System (IDEAM). IDEAM focuses on enabling emergency AVs, such\nas ambulances, to actively attain efficiency in dense traffic scenarios with\nsafety in mind. Firstly, the speed-centric decision-making algorithm named the\nlong short-term spatio-temporal graph-centric decision-making (LSGM) is given.\nLSGM comprises conditional depth-first search (C-DFS) for multiple paths\ngeneration as well as methods for speed gains and risk evaluation for path\nselection, which presents a robust algorithm for high efficiency and safety\nconsideration. Secondly, with an output path from LSGM, the motion planner\nreconsiders environmental conditions to decide constraints states for the final\nplanning stage, among which the lane-probing state is designed for actively\nattaining spatial and speed advantage. Thirdly, under the Frenet-based model\npredictive control (MPC) framework with final constraints state and selected\npath, the safety-critical motion planner employs decoupled discrete control\nbarrier functions (DCBFs) and linearized discrete-time high-order control\nbarrier functions (DHOCBFs) to model the constraints associated with different\ndriving behaviors, making the optimal optimization problem convex. Finally, we\nextensively validate our system using scenarios from a randomly synthetic\ndataset, demonstrating its capability to achieve speed benefits and assure\nsafety simultaneously.\n","authors":["Yiming Shu","Jingyuan Zhou","Fu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08665v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14323v1","updated":"2024-09-22T05:49:51Z","published":"2024-09-22T05:49:51Z","title":"Cluster-based Network Time Synchronization for Resilience with Energy\n Efficiency","summary":" Time synchronization of devices in Internet-of-Things (IoT) networks is one\nof the challenging problems and a pre-requisite for the design of low-latency\napplications. Although many existing solutions have tried to address this\nproblem, almost all solutions assume all the devices (nodes) in the network are\nfaultless. Furthermore, these solutions exchange a large number of messages to\nachieve synchronization, leading to significant communication and energy\noverhead. To address these shortcomings, we propose C-sync, a clustering-based\ndecentralized time synchronization protocol that provides resilience against\nseveral types of faults with energy-efficient communication. C-sync achieves\nscalability by introducing multiple reference nodes in the network that\nrestrict the maximum number of hops any node can have to its time source. The\nprotocol is designed with a modular structure on the Contiki platform to allow\napplication transitions. We evaluate C-sync on a real testbed that comprises\nover 40 Tmote Sky hardware nodes distributed across different levels in a\nbuilding and show through experiments the fault resilience, energy efficiency,\nand scalability of the protocol. C-sync detects and isolates faults to a\ncluster and recovers quickly. The evaluation makes a qualitative comparison\nwith state-of-the-art protocols and a quantitative comparison with a class of\ndecentralized protocols (derived from GTSP) that provide synchronization with\nno/limited fault-tolerance. Results also show a reduction of 56.12% and 75.75%\nin power consumption in the worst-case and best-case scenarios, respectively,\ncompared to GTSP, while achieving similar accuracy.\n","authors":["Nitin Shivaraman","Patrick Schuster","Saravanan Ramanathan","Arvind Easwaran","Sebastian Steinhorst"],"pdf_url":"https://arxiv.org/pdf/2409.14323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14320v1","updated":"2024-09-22T05:16:09Z","published":"2024-09-22T05:16:09Z","title":"Exploring the Use of Contingency for Nuclear Electrical Studies","summary":" This paper examines the use of contingency analysis for a nuclear power plant\nto determine its potential benefits for the nuclear industry. Various N-1\ncontingencies were analyzed for a model of an existing nuclear plant, primarily\ninspecting voltage violations resulting from a failure. Remedial Actions\nSchemes were suggested to support the reduction of voltage violations in the\nevent of a failure within the system. Many of the schemes presented were solved\nby existing redundancies and protection schemes that have been provided through\nthe use of industry standard bounding analysis in the design process. This\npaper proposes the future use of real-time contingency analysis for nuclear\npower plants, conducted using constantly updating voltage, current, and power\nmeasurements through the system. This will provide real-time information of the\nsystem and can serve as historical data to reduce the analysis needed for\npending design changes in the plant.\n","authors":["Cameron Khanpour","Jon T. Fontejon"],"pdf_url":"https://arxiv.org/pdf/2409.14320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14293v1","updated":"2024-09-22T01:58:37Z","published":"2024-09-22T01:58:37Z","title":"A novel load distribution strategy for aggregators using IoT-enabled\n mobile devices","summary":" The rapid proliferation of Internet-of-things (IoT) as well as mobile devices\nsuch as Electric Vehicles (EVs), has led to unpredictable load at the grid. The\ndemand to supply ratio is particularly exacerbated at a few grid aggregators\n(charging stations) with excessive demand due to the geographic location, peak\ntime, etc. Existing solutions on demand response cannot achieve significant\nimprovements based only on time-shifting the loads without considering the\ndevice properties such as charging modes and movement capabilities to enable\ngeographic migration. Additionally, the information on the spare capacity at a\nfew aggregators can aid in re-channeling the load from other aggregators facing\nexcess demand to allow migration of devices. In this paper, we model these\nflexible properties of the devices as a mixed-integer non-linear problem\n(MINLP) to minimize excess load and the improve the utility (benefit) across\nall devices. We propose an online distributed low-complexity heuristic that\nprioritizes devices based on demand and deadlines to minimize the cumulative\nloss in utility. The proposed heuristic is tested on an exhaustive set of\nsynthetic data and compared with solutions from a solver/optimization tool for\nthe same runtime to show the impracticality of using a solver. A real-world EV\ntestbed data is also tested with our proposed solution and other scheduling\nsolutions to show the practicality of generating a feasible schedule and a loss\nimprovement of at least 57.23%.\n","authors":["Nitin Shivaraman","Jakob Fittler","Saravanan Ramanathan","Arvind Easwaran","Sebastian Steinhorst"],"pdf_url":"https://arxiv.org/pdf/2409.14293v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000..7f5166c Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 0000000..9ded9d9 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..61800af --- /dev/null +++ b/index.html @@ -0,0 +1,61227 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Robotics 52 + +
+
+
+ + ☆ UniCal: Unified Neural Sensor Calibration ECCV 2024 + + +
+ Self-driving vehicles (SDVs) require accurate calibration of LiDARs and +cameras to fuse sensor data accurately for autonomy. Traditional calibration +methods typically leverage fiducials captured in a controlled and structured +scene and compute correspondences to optimize over. These approaches are costly +and require substantial infrastructure and operations, making it challenging to +scale for vehicle fleets. In this work, we propose UniCal, a unified framework +for effortlessly calibrating SDVs equipped with multiple LiDARs and cameras. +Our approach is built upon a differentiable scene representation capable of +rendering multi-view geometrically and photometrically consistent sensor +observations. We jointly learn the sensor calibration and the underlying scene +representation through differentiable volume rendering, utilizing outdoor +sensor data without the need for specific calibration fiducials. This +"drive-and-calibrate" approach significantly reduces costs and operational +overhead compared to existing calibration systems, enabling efficient +calibration for large SDV fleets at scale. To ensure geometric consistency +across observations from different sensors, we introduce a novel surface +alignment loss that combines feature-based registration with neural rendering. +Comprehensive evaluations on multiple datasets demonstrate that UniCal +outperforms or matches the accuracy of existing calibration approaches while +being more efficient, demonstrating the value of UniCal for scalable +calibration. + +
+
+ comment: ECCV 2024. Project page: https://waabi.ai/unical/ +
+
+
+
+
+ + ☆ Towards Super-Nominal Payload Handling: Inverse Dynamics Analysis for + Multi-Skill Robotic Manipulation ICRA + + +
+ Motion planning for articulated robots has traditionally been governed by +algorithms that operate within manufacturer-defined payload limits. Our +empirical analysis of the Franka Emika Panda robot demonstrates that this +approach unnecessarily restricts the robot's dynamically-reachable task space. +These results establish an expanded operational envelope for such robots, +showing that they can handle payloads of more than twice their rated capacity. +Additionally, our preliminary findings indicate that integrating non-prehensile +motion primitives with grasping-based manipulation has the potential to further +increase the success rates of manipulation tasks involving payloads exceeding +nominal limits. + +
+
+ comment: Accepted as an extended abstract to ICRA@40 +
+
+
+
+
+ + ☆ Safe Decentralized Multi-Agent Control using Black-Box Predictors, + Conformal Decision Policies, and Control Barrier Functions ICRA 2025 + + +
+ We address the challenge of safe control in decentralized multi-agent robotic +settings, where agents use uncertain black-box models to predict other agents' +trajectories. We use the recently proposed conformal decision theory to adapt +the restrictiveness of control barrier functions-based safety constraints based +on observed prediction errors. We use these constraints to synthesize +controllers that balance between the objectives of safety and task +accomplishment, despite the prediction errors. We provide an upper bound on the +average over time of the value of a monotonic function of the difference +between the safety constraint based on the predicted trajectories and the +constraint based on the ground truth ones. We validate our theory through +experimental results showing the performance of our controllers when navigating +a robot in the multi-agent scenes in the Stanford Drone Dataset. + +
+
+ comment: 6 pages, 1 figure, submitted for ICRA 2025 +
+
+
+
+
+ + ☆ Open-Nav: Exploring Zero-Shot Vision-and-Language Navigation in + Continuous Environment with Open-Source LLMs + + +
+ Vision-and-Language Navigation (VLN) tasks require an agent to follow textual +instructions to navigate through 3D environments. Traditional approaches use +supervised learning methods, relying heavily on domain-specific datasets to +train VLN models. Recent methods try to utilize closed-source large language +models (LLMs) like GPT-4 to solve VLN tasks in zero-shot manners, but face +challenges related to expensive token costs and potential data breaches in +real-world applications. In this work, we introduce Open-Nav, a novel study +that explores open-source LLMs for zero-shot VLN in the continuous environment. +Open-Nav employs a spatial-temporal chain-of-thought (CoT) reasoning approach +to break down tasks into instruction comprehension, progress estimation, and +decision-making. It enhances scene perceptions with fine-grained object and +spatial knowledge to improve LLM's reasoning in navigation. Our extensive +experiments in both simulated and real-world environments demonstrate that +Open-Nav achieves competitive performance compared to using closed-source LLMs. + +
+
+
+
+
+ + ☆ Excavating in the Wild: The GOOSE-Ex Dataset for Semantic Segmentation + + +
+ The successful deployment of deep learning-based techniques for autonomous +systems is highly dependent on the data availability for the respective system +in its deployment environment. Especially for unstructured outdoor +environments, very few datasets exist for even fewer robotic platforms and +scenarios. In an earlier work, we presented the German Outdoor and Offroad +Dataset (GOOSE) framework along with 10000 multimodal frames from an offroad +vehicle to enhance the perception capabilities in unstructured environments. In +this work, we address the generalizability of the GOOSE framework. To +accomplish this, we open-source the GOOSE-Ex dataset, which contains additional +5000 labeled multimodal frames from various completely different environments, +recorded on a robotic excavator and a quadruped platform. We perform a +comprehensive analysis of the semantic segmentation performance on different +platforms and sensor modalities in unseen environments. In addition, we +demonstrate how the combined datasets can be utilized for different downstream +applications or competitions such as offroad navigation, object manipulation or +scene completion. The dataset, its platform documentation and pre-trained +state-of-the-art models for offroad perception will be made available on +https://goose-dataset.de/. + \ + +
+
+ comment: Submitted to IEEE for review +
+
+
+
+
+ + ☆ A POMDP-based hierarchical planning framework for manipulation under + pose uncertainty + + +
+ Robots often face challenges in domestic environments where visual feedback +is ineffective, such as retrieving objects obstructed by occlusions or finding +a light switch in the dark. In these cases, utilizing contacts to localize the +target object can be effective. We propose an online planning framework using +binary contact signals for manipulation tasks with pose uncertainty, formulated +as a Partially Observable Markov Decision Process (POMDP). Naively representing +the belief as a particle set makes planning infeasible due to the large +uncertainties in domestic settings, as identifying the best sequence of actions +requires rolling out thousands of actions across millions of particles, taking +significant compute time. To address this, we propose a hierarchical belief +representation. Initially, we represent the uncertainty coarsely in a 3D +volumetric space. Policies that refine uncertainty in this space are computed +and executed, and once uncertainty is sufficiently reduced, the problem is +translated back into the particle space for further refinement before task +completion. We utilize a closed-loop planning and execution framework with a +heuristic-search-based anytime solver that computes partial policies within a +limited time budget. The performance of the framework is demonstrated both in +real world and in simulation on the high-precision task of inserting a plug +into a port using a UR10e manipulator, resolving positional uncertainties up to +50 centimeters and angular uncertainties close to $2\pi$. Experimental results +highlight the framework's effectiveness, achieving a 93\% success rate in the +real world and over 50\% improvement in solution quality compared to greedy +baselines, significantly accelerating planning and enabling real-time solutions +for complex problems. + +
+
+ comment: Under review (2025 IEEE International Conference on Robotics & + Automation) +
+
+
+
+
+ + ☆ Learning from Demonstration with Implicit Nonlinear Dynamics Models + + +
+ Learning from Demonstration (LfD) is a useful paradigm for training policies +that solve tasks involving complex motions. In practice, the successful +application of LfD requires overcoming error accumulation during policy +execution, i.e. the problem of drift due to errors compounding over time and +the consequent out-of-distribution behaviours. Existing works seek to address +this problem through scaling data collection, correcting policy errors with a +human-in-the-loop, temporally ensembling policy predictions or through learning +the parameters of a dynamical system model. In this work, we propose and +validate an alternative approach to overcoming this issue. Inspired by +reservoir computing, we develop a novel neural network layer that includes a +fixed nonlinear dynamical system with tunable dynamical properties. We validate +the efficacy of our neural network layer on the task of reproducing human +handwriting motions using the LASA Human Handwriting Dataset. Through empirical +experiments we demonstrate that incorporating our layer into existing neural +network architectures addresses the issue of compounding errors in LfD. +Furthermore, we perform a comparative evaluation against existing approaches +including a temporal ensemble of policy predictions and an Echo State Networks +(ESNs) implementation. We find that our approach yields greater policy +precision and robustness on the handwriting task while also generalising to +multiple dynamics regimes and maintaining competitive latency scores. + +
+
+ comment: 21 pages, 9 figures +
+
+
+
+
+ + ☆ Transparency evaluation for the Kinematic Design of the Harnesses + through Human-Exoskeleton Interaction Modeling + + +
+ Lower Limb Exoskeletons (LLEs) are wearable robots that provide mechanical +power to the user. Human-exoskeleton (HE) connections must preserve the user's +natural behavior during the interaction, avoiding undesired forces. Therefore, +numerous works focus on their minimization. Given the inherent complications of +repeatedly prototyping and experimentally testing a device, modeling the +exoskeleton and its physical interaction with the user emerges as a valuable +approach for assessing the design effects. This paper proposes a novel method +to compare different exoskeleton configurations with a flexible simulation +tool. This approach contemplates simulating the dynamics of the device, +including its interaction with the wearer, to evaluate multiple connection +mechanism designs along with the kinematics and actuation of the LLE. This +evaluation is based on the minimization of the interaction wrenches through an +optimization process that includes the impedance parameters at the interfaces +as optimization variables and the similarity of the LLE's joint variables +trajectories with the motion of the wearer's articulations. Exploratory tests +are conducted using the Wearable Walker LLE in different configurations and +measuring the interaction forces. Experimental data are then compared to the +optimization outcomes, proving that the proposed method provides contact wrench +estimations consistent with the collected measurements and previous outcomes +from the literature. Copyright 2024 IEEE. Personal use of this material is +permitted. Permission from IEEE must be obtained for all other uses, in any +current or future media, including reprinting/republishing this material for +advertising or promotional purposes, creating new collective works, for resale +or redistribution to servers or lists, or reuse of any copyrighted component of +this work in other works. + +
+
+
+
+
+ + ☆ Royal Reveals: LiDAR Mapping of Kronborg Castle, Echoes of Hamlet's + Halls + + +
+ This paper presents a large scale dataset from a meticulous 360-degree LiDAR +(Light Detection and Ranging) scan conducted on Kronborg Castle, a renowned +Renaissance fortress located in Elsinore (Helsing{\o}r), Denmark, famously +associated with Shakespeare's "Hamlet." Utilising a vertical mounted, gimbal +stabilised, 16 channel, 360-degree Velodyne VLP-16 LiDAR scanner, paired with +an Intel RealSense L515 depth camera. This research offers an unparalleled +digital representation of the castle's intricate architectural details and +structural nuances, enabling fellow researchers to conduct experiments +utilising the data for SLAM (Simultaneous Localisation and Mapping) as well as +floorplan generation. + +
+
+ comment: 4 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ A study on the effects of mixed explicit and implicit communications in + human-virtual-agent interactions + + +
+ Communication between humans and robots (or virtual agents) is essential for +interaction and often inspired by human communication, which uses gestures, +facial expressions, gaze direction, and other explicit and implicit means. This +work presents an interaction experiment where humans and virtual agents +interact through explicit (gestures, manual entries using mouse and keyboard, +voice, sound, and information on screen) and implicit (gaze direction, +location, facial expressions, and raise of eyebrows) communication to evaluate +the effect of mixed explicit-implicit communication against purely explicit +communication. Results obtained using Bayesian parameter estimation show that +the number of errors and task execution time did not significantly change when +mixed explicit and implicit communications were used, and neither the perceived +efficiency of the interaction. In contrast, acceptance, sociability, and +transparency of the virtual agent increased when using mixed communication +modalities (88.3%, 92%, and 92.9% of the effect size posterior distribution of +each variable, respectively, were above the upper limit of the region of +practical equivalence). This suggests that task-related measures, such as time, +number of errors, and perceived efficiency of the interaction, have not been +influenced by the communication type in our particular experiment. However, the +improvement of subjective measures related to the virtual agent, such as +acceptance, sociability, and transparency, suggests that humans are more +receptive to mixed explicit and implicit communications. + +
+
+ comment: 22 pages, 12 figures, 4 tables. Under review for International + Journal of Social Robotics +
+
+
+
+
+ + ☆ OpenObject-NAV: Open-Vocabulary Object-Oriented Navigation Based on + Dynamic Carrier-Relationship Scene Graph + + +
+ In everyday life, frequently used objects like cups often have unfixed +positions and multiple instances within the same category, and their carriers +frequently change as well. As a result, it becomes challenging for a robot to +efficiently navigate to a specific instance. To tackle this challenge, the +robot must capture and update scene changes and plans continuously. However, +current object navigation approaches primarily focus on semantic-level and lack +the ability to dynamically update scene representation. This paper captures the +relationships between frequently used objects and their static carriers. It +constructs an open-vocabulary Carrier-Relationship Scene Graph (CRSG) and +updates the carrying status during robot navigation to reflect the dynamic +changes of the scene. Based on the CRSG, we further propose an instance +navigation strategy that models the navigation process as a Markov Decision +Process. At each step, decisions are informed by Large Language Model's +commonsense knowledge and visual-language feature similarity. We designed a +series of long-sequence navigation tasks for frequently used everyday items in +the Habitat simulator. The results demonstrate that by updating the CRSG, the +robot can efficiently navigate to moved targets. Additionally, we deployed our +algorithm on a real robot and validated its practical effectiveness. + +
+
+ comment: Project website: https://openobject-nav.github.io/ +
+
+
+
+
+ + ☆ Optimum Configuration for Hovering n-Quadrotors carrying a Slung Payload + + +
+ This work proposes a strategy for organising quadrotors around a payload to +enable hovering without external stimuli, together with a MATLAB software for +modelling the dynamics of a quadrotor-payload system. Based on geometric +concepts, the proposed design keeps the payload and system centre of mass +aligned. Hovering tests that are successful confirm the method's efficiency. +Moreover, the algorithm is improved to take thrust capacities and propeller +distances into account, calculating the minimum number of quadrotors needed for +hovering. The algorithm's effectiveness is demonstrated by numerical examples, +which reveal that larger quadrotors may require fewer units while smaller ones +give greater flexibility. Our code can be found at: +\href{https://github.com/Hosnooo/Swarm-Slung-Payload}{https://github.com/Hosnooo/Swarm-Slung-Payload} + +
+
+ comment: accepted for publication at AIAA SCITECH 2025 +
+
+
+
+
+ + ☆ Discrete Policy: Learning Disentangled Action Space for Multi-Task + Robotic Manipulation + + +
+ Learning visuomotor policy for multi-task robotic manipulation has been a +long-standing challenge for the robotics community. The difficulty lies in the +diversity of action space: typically, a goal can be accomplished in multiple +ways, resulting in a multimodal action distribution for a single task. The +complexity of action distribution escalates as the number of tasks increases. +In this work, we propose \textbf{Discrete Policy}, a robot learning method for +training universal agents capable of multi-task manipulation skills. Discrete +Policy employs vector quantization to map action sequences into a discrete +latent space, facilitating the learning of task-specific codes. These codes are +then reconstructed into the action space conditioned on observations and +language instruction. We evaluate our method on both simulation and multiple +real-world embodiments, including both single-arm and bimanual robot settings. +We demonstrate that our proposed Discrete Policy outperforms a well-established +Diffusion Policy baseline and many state-of-the-art approaches, including ACT, +Octo, and OpenVLA. For example, in a real-world multi-task training setting +with five tasks, Discrete Policy achieves an average success rate that is 26\% +higher than Diffusion Policy and 15\% higher than OpenVLA. As the number of +tasks increases to 12, the performance gap between Discrete Policy and +Diffusion Policy widens to 32.5\%, further showcasing the advantages of our +approach. Our work empirically demonstrates that learning multi-task policies +within the latent space is a vital step toward achieving general-purpose +agents. + +
+
+
+
+
+ + ☆ Automatic Gain Tuning for Humanoid Robots Walking Architectures Using + Gradient-Free Optimization Techniques + + +
+ Developing sophisticated control architectures has endowed robots, +particularly humanoid robots, with numerous capabilities. However, tuning these +architectures remains a challenging and time-consuming task that requires +expert intervention. In this work, we propose a methodology to automatically +tune the gains of all layers of a hierarchical control architecture for walking +humanoids. We tested our methodology by employing different gradient-free +optimization methods: Genetic Algorithm (GA), Covariance Matrix Adaptation +Evolution Strategy (CMA-ES), Evolution Strategy (ES), and Differential +Evolution (DE). We validated the parameter found both in simulation and on the +real ergoCub humanoid robot. Our results show that GA achieves the fastest +convergence (10 x 10^3 function evaluations vs 25 x 10^3 needed by the other +algorithms) and 100% success rate in completing the task both in simulation and +when transferred on the real robotic platform. These findings highlight the +potential of our proposed method to automate the tuning process, reducing the +need for manual intervention. + +
+
+
+
+
+ + ☆ Pseudo-kinematic trajectory control of tracked vehicles + + +
+ Tracked vehicles are used in complex scenarios, where motion planning and +navigation can be very complex. They have complex dynamics, with many +parameters that are difficult to identify and that change significantly based +on the operating conditions. We propose a simple pseudo-kinematic model, where +the intricate dynamic effects underlying the vehicle's motion are captured in a +small set of velocity-dependent parameters. This choice enables the development +of a Lyapunov-based trajectory controller with guaranteed performance and small +computation time. We demonstrate the correctness of our approach with both +simulation and experimental data. + +
+
+
+
+
+ + ☆ From One to the Power of Many: Augmentations for Invariance to + Multi-LiDAR Perception from Single-Sensor Datasets + + +
+ Recently, LiDAR perception methods for autonomous vehicles, powered by deep +neural networks have experienced steep growth in performance on classic +benchmarks, such as nuScenes and SemanticKITTI. However, there are still large +gaps in performance when deploying models trained on such single-sensor setups +to modern multi-sensor vehicles. In this work, we investigate if a lack of +invariance may be responsible for these performance gaps, and propose some +initial solutions in the form of application-specific data augmentations, which +can facilitate better transfer to multi-sensor LiDAR setups. We provide +experimental evidence that our proposed augmentations improve generalization +across LiDAR sensor setups, and investigate how these augmentations affect the +models' invariance properties on simulations of different LiDAR sensor setups. + +
+
+
+
+
+ + ☆ Analysis of Truncated Singular Value Decomposition for Koopman + Operator-Based Lane Change Model + + +
+ Understanding and modeling complex dynamic systems is crucial for enhancing +vehicle performance and safety, especially in the context of autonomous +driving. Recently, popular methods such as Koopman operators and their +approximators, known as Extended Dynamic Mode Decomposition (EDMD), have +emerged for their effectiveness in transforming strongly nonlinear system +behavior into linear representations. This allows them to be integrated with +conventional linear controllers. To achieve this, Singular Value Decomposition +(SVD), specifically truncated SVD, is employed to approximate Koopman operators +from extensive datasets efficiently. This study evaluates different basis +functions used in EDMD and ranks for truncated SVD for representing lane change +behavior models, aiming to balance computational efficiency with information +loss. The findings, however, suggest that the technique of truncated SVD does +not necessarily achieve substantial reductions in computational training time +and results in significant information loss. + +
+
+ comment: Submitted to the 21st International Conference on Informatics in + Control, Automation and Robotics (ICINCO 2024) +
+
+
+
+
+ + ☆ Unscented Transform-based Pure Pursuit Path-Tracking Algorithm under + Uncertainty + + +
+ Automated driving has become more and more popular due to its potential to +eliminate road accidents by taking over driving tasks from humans. One of the +remaining challenges is to follow a planned path autonomously, especially when +uncertainties in self-localizing or understanding the surroundings can +influence the decisions made by autonomous vehicles, such as calculating how +much they need to steer to minimize tracking errors. In this paper, a modified +geometric pure pursuit path-tracking algorithm is proposed, taking into +consideration such uncertainties using the unscented transform. The algorithm +is tested through simulations for typical road geometries, such as straight and +circular lines. + +
+
+ comment: Submitted to the 21st International Conference on Informatics in + Control, Automation and Robotics (ICINCO 2024) +
+
+
+
+
+ + ☆ An Epistemic Human-Aware Task Planner which Anticipates Human Beliefs + and Decisions + + +
+ We present a substantial extension of our Human-Aware Task Planning +framework, tailored for scenarios with intermittent shared execution +experiences and significant belief divergence between humans and robots, +particularly due to the uncontrollable nature of humans. Our objective is to +build a robot policy that accounts for uncontrollable human behaviors, thus +enabling the anticipation of possible advancements achieved by the robot when +the execution is not shared, e.g. when humans are briefly absent from the +shared environment to complete a subtask. But, this anticipation is considered +from the perspective of humans who have access to an estimated model for the +robot. To this end, we propose a novel planning framework and build a solver +based on AND-OR search, which integrates knowledge reasoning, including +situation assessment by perspective taking. Our approach dynamically models and +manages the expansion and contraction of potential advances while precisely +keeping track of when (and when not) agents share the task execution +experience. The planner systematically assesses the situation and ignores +worlds that it has reason to think are impossible for humans. Overall, our new +solver can estimate the distinct beliefs of the human and the robot along +potential courses of action, enabling the synthesis of plans where the robot +selects the right moment for communication, i.e. informing, or replying to an +inquiry, or defers ontic actions until the execution experiences can be shared. +Preliminary experiments in two domains, one novel and one adapted, demonstrate +the effectiveness of the framework. + +
+
+ comment: 15 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ DynaWeightPnP: Toward global real-time 3D-2D solver in PnP without + correspondences + + +
+ This paper addresses a special Perspective-n-Point (PnP) problem: estimating +the optimal pose to align 3D and 2D shapes in real-time without +correspondences, termed as correspondence-free PnP. While several studies have +focused on 3D and 2D shape registration, achieving both real-time and accurate +performance remains challenging. This study specifically targets the 3D-2D +geometric shape registration tasks, applying the recently developed Reproducing +Kernel Hilbert Space (RKHS) to address the "big-to-small" issue. An iterative +reweighted least squares method is employed to solve the RKHS-based formulation +efficiently. Moreover, our work identifies a unique and interesting +observability issue in correspondence-free PnP: the numerical ambiguity between +rotation and translation. To address this, we proposed DynaWeightPnP, +introducing a dynamic weighting sub-problem and an alternative searching +algorithm designed to enhance pose estimation and alignment accuracy. +Experiments were conducted on a typical case, that is, a 3D-2D vascular +centerline registration task within Endovascular Image-Guided Interventions +(EIGIs). Results demonstrated that the proposed algorithm achieves registration +processing rates of 60 Hz (without post-refinement) and 31 Hz (with +post-refinement) on modern single-core CPUs, with competitive accuracy +comparable to existing methods. These results underscore the suitability of +DynaWeightPnP for future robot navigation tasks like EIGIs. + +
+
+
+
+
+ + ☆ Exploiting Physical Human-Robot Interaction to Provide a Unique Rolling + Experience with a Riding Ballbot + + +
+ This study introduces the development of hands-free control schemes for a +riding ballbot, designed to allow riders including manual wheelchair users to +control its movement through torso leaning and twisting. The hardware platform, +Personal Unique Rolling Experience (PURE), utilizes a ballbot drivetrain, a +dynamically stable mobile robot that uses a ball as its wheel to provide +omnidirectional maneuverability. To accommodate users with varying torso motion +functions, the hanads-free control scheme should be adjustable based on the +rider's torso function and personal preferences. Therefore, concepts of (a) +impedance control and (b) admittance control were integrated into the control +scheme. A duo-agent optimization framework was utilized to assess the +efficiency of this rider-ballbot system for a safety-critical task: braking +from 1.4 m/s. The candidate control schemes were further implemented in the +physical robot hardware and validated with two experienced users, demonstrating +the efficiency and robustness of the hands-free admittance control scheme +(HACS). This interface, which utilized physical human-robot interaction (pHRI) +as the input, resulted in lower braking effort and shorter braking distance and +time. Subsequently, 12 novice participants (six able-bodied users and six +manual wheelchair users) with different levels of torso motion capability were +then recruited to benchmark the braking performance with HACS. The indoor +navigation capability of PURE was further demonstrated with these participants +in courses simulating narrow hallways, tight turns, and navigation through +static and dynamic obstacles. By exploiting pHRI, the proposed admittance-style +control scheme provided effective control of the ballbot via torso motions. +This interface enables PURE to provide a personal unique rolling experience to +manual wheelchair users for safe and agile indoor navigation. + +
+
+
+
+
+ + ☆ Get It For Free: Radar Segmentation without Expert Labels and Its + Application in Odometry and Localization + + +
+ This paper presents a novel weakly supervised semantic segmentation method +for radar segmentation, where the existing LiDAR semantic segmentation models +are employed to generate semantic labels, which then serve as supervision +signals for training a radar semantic segmentation model. The obtained radar +semantic segmentation model outperforms LiDAR-based models, providing more +consistent and robust segmentation under all-weather conditions, particularly +in the snow, rain and fog. To mitigate potential errors in LiDAR semantic +labels, we design a dedicated refinement scheme that corrects erroneous labels +based on structural features and distribution patterns. The semantic +information generated by our radar segmentation model is used in two downstream +tasks, achieving significant performance improvements. In large-scale +radar-based localization using OpenStreetMap, it leads to localization error +reduction by 20.55\% over prior methods. For the odometry task, it improves +translation accuracy by 16.4\% compared to the second-best method, securing the +first place in the radar odometry competition at the Radar in Robotics workshop +of ICRA 2024, Japan + +
+
+
+
+
+ + ☆ BoT-Drive: Hierarchical Behavior and Trajectory Planning for Autonomous + Driving using POMDPs + + +
+ Uncertainties in dynamic road environments pose significant challenges for +behavior and trajectory planning in autonomous driving. This paper introduces +BoT-Drive, a planning algorithm that addresses uncertainties at both behavior +and trajectory levels within a Partially Observable Markov Decision Process +(POMDP) framework. BoT-Drive employs driver models to characterize unknown +behavioral intentions and utilizes their model parameters to infer hidden +driving styles. By also treating driver models as decision-making actions for +the autonomous vehicle, BoT-Drive effectively tackles the exponential +complexity inherent in POMDPs. To enhance safety and robustness, the planner +further applies importance sampling to refine the driving trajectory +conditioned on the planned high-level behavior. Evaluation on real-world data +shows that BoT-Drive consistently outperforms both existing planning methods +and learning-based methods in regular and complex urban driving scenes, +demonstrating significant improvements in driving safety and reliability. + +
+
+
+
+
+ + ☆ Word2Wave: Language Driven Mission Programming for Efficient Subsea + Deployments of Marine Robots + + +
+ This paper explores the design and development of a language-based interface +for dynamic mission programming of autonomous underwater vehicles (AUVs). The +proposed 'Word2Wave' (W2W) framework enables interactive programming and +parameter configuration of AUVs for remote subsea missions. The W2W framework +includes: (i) a set of novel language rules and command structures for +efficient language-to-mission mapping; (ii) a GPT-based prompt engineering +module for training data generation; (iii) a small language model (SLM)-based +sequence-to-sequence learning pipeline for mission command generation from +human speech or text; and (iv) a novel user interface for 2D mission map +visualization and human-machine interfacing. The proposed learning pipeline +adapts an SLM named T5-Small that can learn language-to-mission mapping from +processed language data effectively, providing robust and efficient +performance. In addition to a benchmark evaluation with state-of-the-art, we +conduct a user interaction study to demonstrate the effectiveness of W2W over +commercial AUV programming interfaces. Across participants, W2W-based +programming required less than 10% time for mission programming compared to +traditional interfaces; it is deemed to be a simpler and more natural paradigm +for subsea mission programming with a usability score of 76.25. W2W opens up +promising future research opportunities on hands-free AUV mission programming +for efficient subsea deployments. + +
+
+
+
+
+ + ☆ An Augmented Reality Interface for Teleoperating Robot Manipulators: + Reducing Demonstrator Task Load through Digital Twin Control + + +
+ Acquiring high-quality demonstration data is essential for the success of +data-driven methods, such as imitation learning. Existing platforms for +providing demonstrations for manipulation tasks often impose significant +physical and mental demands on the demonstrator, require additional hardware +systems, or necessitate specialized domain knowledge. In this work, we present +a novel augmented reality (AR) interface for teleoperating robotic +manipulators, emphasizing the demonstrator's experience, particularly in the +context of performing complex tasks that require precision and accuracy. This +interface, designed for the Microsoft HoloLens 2, leverages the adaptable +nature of mixed reality (MR), enabling users to control a physical robot +through digital twin surrogates. We assess the effectiveness of our approach +across three complex manipulation tasks and compare its performance against +OPEN TEACH, a recent virtual reality (VR) teleoperation system, as well as two +traditional control methods: kinesthetic teaching and a 3D SpaceMouse for +end-effector control. Our findings show that our method performs comparably to +the VR approach and demonstrates the potential for AR in data collection. +Additionally, we conduct a pilot study to evaluate the usability and task load +associated with each method. Results indicate that our AR-based system achieves +higher usability scores than the VR benchmark and significantly reduces mental +demand, physical effort, and frustration experienced by users. An accompanying +video can be found at https://youtu.be/w-M58ohPgrA. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Speech to Reality: On-Demand Production using Natural Language, 3D + Generative AI, and Discrete Robotic Assembly + + +
+ We present a system that transforms speech into physical objects by combining +3D generative Artificial Intelligence with robotic assembly. The system +leverages natural language input to make design and manufacturing more +accessible, enabling individuals without expertise in 3D modeling or robotic +programming to create physical objects. We propose utilizing discrete robotic +assembly of lattice-based voxel components to address the challenges of using +generative AI outputs in physical production, such as design variability, +fabrication speed, structural integrity, and material waste. The system +interprets speech to generate 3D objects, discretizes them into voxel +components, computes an optimized assembly sequence, and generates a robotic +toolpath. The results are demonstrated through the assembly of various objects, +ranging from chairs to shelves, which are prompted via speech and realized +within 5 minutes using a 6-axis robotic arm. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible. An updated version will replace this version +
+
+
+
+
+ + ☆ Robo-CSK-Organizer: Commonsense Knowledge to Organize Detected Objects + for Multipurpose Robots + + +
+ This paper presents a system called Robo-CSK-Organizer that infuses +commonsense knowledge from a classical knowledge based to enhance the context +recognition capabilities of robots so as to facilitate the organization of +detected objects by classifying them in a task-relevant manner. It is +particularly useful in multipurpose robotics. Unlike systems relying solely on +deep learning tools such as ChatGPT, the Robo-CSK-Organizer system stands out +in multiple avenues as follows. It resolves ambiguities well, and maintains +consistency in object placement. Moreover, it adapts to diverse task-based +classifications. Furthermore, it contributes to explainable AI, hence helping +to improve trust and human-robot collaboration. Controlled experiments +performed in our work, simulating domestic robotics settings, make +Robo-CSK-Organizer demonstrate superior performance while placing objects in +contextually relevant locations. This work highlights the capacity of an +AI-based system to conduct commonsense-guided decision-making in robotics +closer to the thresholds of human cognition. Hence, Robo-CSK-Organizer makes +positive impacts on AI and robotics. + +
+
+
+
+
+ + ☆ AquaMILR+: Design of an untethered limbless robot for complex aquatic + terrain navigation + + +
+ This paper presents AquaMILR+, an untethered limbless robot designed for +agile navigation in complex aquatic environments. The robot features a +bilateral actuation mechanism that models musculoskeletal actuation in many +anguilliform swimming organisms which propagates a moving wave from head to +tail allowing open fluid undulatory swimming. This actuation mechanism employs +mechanical intelligence, enhancing the robot's maneuverability when interacting +with obstacles. AquaMILR+ also includes a compact depth control system inspired +by the swim bladder and lung structures of eels and sea snakes. The mechanism, +driven by a syringe and telescoping leadscrew, enables depth and pitch +control-capabilities that are difficult for most anguilliform swimming robots +to achieve. Additional structures, such as fins and a tail, further improve +stability and propulsion efficiency. Our tests in both open water and indoor 2D +and 3D heterogeneous aquatic environments highlight AquaMILR+'s capabilities +and suggest a promising system for complex underwater tasks such as search and +rescue and deep-sea exploration. + +
+
+
+
+
+ + ☆ CurricuLLM: Automatic Task Curricula Design for Learning Complex Robot + Skills using Large Language Models ICRA 2025 + + +
+ Curriculum learning is a training mechanism in reinforcement learning (RL) +that facilitates the achievement of complex policies by progressively +increasing the task difficulty during training. However, designing effective +curricula for a specific task often requires extensive domain knowledge and +human intervention, which limits its applicability across various domains. Our +core idea is that large language models (LLMs), with their extensive training +on diverse language data and ability to encapsulate world knowledge, present +significant potential for efficiently breaking down tasks and decomposing +skills across various robotics environments. Additionally, the demonstrated +success of LLMs in translating natural language into executable code for RL +agents strengthens their role in generating task curricula. In this work, we +propose CurricuLLM, which leverages the high-level planning and programming +capabilities of LLMs for curriculum design, thereby enhancing the efficient +learning of complex target tasks. CurricuLLM consists of: (Step 1) Generating +sequence of subtasks that aid target task learning in natural language form, +(Step 2) Translating natural language description of subtasks in executable +task code, including the reward code and goal distribution code, and (Step 3) +Evaluating trained policies based on trajectory rollout and subtask +description. We evaluate CurricuLLM in various robotics simulation +environments, ranging from manipulation, navigation, and locomotion, to show +that CurricuLLM can aid learning complex robot control tasks. In addition, we +validate humanoid locomotion policy learned through CurricuLLM in real-world. +The code is provided in https://github.com/labicon/CurricuLLM + +
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ iWalker: Imperative Visual Planning for Walking Humanoid Robot + + +
+ Humanoid robots, with the potential to perform a broad range of tasks in +environments designed for humans, have been deemed crucial for the basis of +general AI agents. When talking about planning and controlling, although +traditional models and task-specific methods have been extensively studied over +the past few decades, they are inadequate for achieving the flexibility and +versatility needed for general autonomy. Learning approaches, especially +reinforcement learning, are powerful and popular nowadays, but they are +inherently "blind" during training, relying heavily on trials in simulation +without proper guidance from physical principles or underlying dynamics. In +response, we propose a novel end-to-end pipeline that seamlessly integrates +perception, planning, and model-based control for humanoid robot walking. We +refer to our method as iWalker, which is driven by imperative learning (IL), a +self-supervising neuro-symbolic learning framework. This enables the robot to +learn from arbitrary unlabeled data, significantly improving its adaptability +and generalization capabilities. In experiments, iWalker demonstrates +effectiveness in both simulated and real-world environments, representing a +significant advancement toward versatile and autonomous humanoid robots. + +
+
+
+
+
+ + ☆ A New 10-mg SMA-Based Fast Bimorph Actuator for Microrobotics IROS 2024 + + +
+ We present a new millimeter-scale bimorph actuator for microrobotic +applications, driven by feedforward controlled shape-memory alloy (SMA) wires. +The device weighs 10 mg, measures 14 mm in length, and occupies a volume of 4.8 +mm3, which makes it the lightest and smallest fully functional SMA-based +bimorph actuator for microrobotics developed to date. The experimentally +measured operational bandwidth is on the order of 20 Hz, and the unimorph and +bimorph maximum low-frequency displacement outputs are on the order of 3.5 and +7 mm, respectively. To test and demonstrate the functionality and suitability +of the actuator for microrobotics, we developed the Fish-&-Ribbon-Inspired +Small Swimming Harmonic roBot (FRISSHBot). Loosely inspired by carangiformes, +the FRISSHBot leverages fluid-structure interaction (FSI) phenomena to propel +itself forward, weighs 30 mg, measures 34 mm in length, operates at frequencies +of up to 4 Hz, and swims at speeds of up to 3.06 mm/s (0.09 Bl/s). This robot +is the lightest and smallest swimmer with onboard actuation developed to date. + +
+
+ comment: To be presented at the 2024 IEEE/RSJ International Conference on + Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems -- which account for almost all current +AI -- can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborate on a search task assigned by a human. + +
+
+
+
+
+ + ♻ Learning Occlusion-aware Decision-making from Agent Interaction via + Active Perception + + +
+ Occlusion-aware decision-making is essential in autonomous driving due to the +high uncertainty of various occlusions. Recent occlusion-aware decision-making +methods encounter issues such as high computational complexity, scenario +scalability challenges, or reliance on limited expert data. Benefiting from +automatically generating data by exploration randomization, we uncover that +reinforcement learning (RL) may show promise in occlusion-aware +decision-making. However, previous occlusion-aware RL faces challenges in +expanding to various dynamic and static occlusion scenarios, low learning +efficiency, and lack of predictive ability. To address these issues, we +introduce Pad-AI, a self-reinforcing framework to learn occlusion-aware +decision-making through active perception. Pad-AI utilizes vectorized +representation to represent occluded environments efficiently and learns over +the semantic motion primitives to focus on high-level active perception +exploration. Furthermore, Pad-AI integrates prediction and RL within a unified +framework to provide risk-aware learning and security guarantees. Our framework +was tested in challenging scenarios under both dynamic and static occlusions +and demonstrated efficient and general perception-aware exploration performance +to other strong baselines in closed-loop evaluations. + +
+
+
+
+
+ + ♻ ☆ Proprioception Is All You Need: Terrain Classification for Boreal + Forests IROS 2024 + + +
+ Recent works in field robotics highlighted the importance of resiliency +against different types of terrains. Boreal forests, in particular, are home to +many mobility-impeding terrains that should be considered for off-road +autonomous navigation. Also, being one of the largest land biomes on Earth, +boreal forests are an area where autonomous vehicles are expected to become +increasingly common. In this paper, we address this issue by introducing +BorealTC, a publicly available dataset for proprioceptive-based terrain +classification (TC). Recorded with a Husky A200, our dataset contains 116 min +of Inertial Measurement Unit (IMU), motor current, and wheel odometry data, +focusing on typical boreal forest terrains, notably snow, ice, and silty loam. +Combining our dataset with another dataset from the state-of-the-art, we +evaluate both a Convolutional Neural Network (CNN) and the novel state space +model (SSM)-based Mamba architecture on a TC task. Interestingly, we show that +while CNN outperforms Mamba on each separate dataset, Mamba achieves greater +accuracy when trained on a combination of both. In addition, we demonstrate +that Mamba's learning capacity is greater than a CNN for increasing amounts of +data. We show that the combination of two TC datasets yields a latent space +that can be interpreted with the properties of the terrains. We also discuss +the implications of merging datasets on classification. Our source code and +dataset are publicly available online: +https://github.com/norlab-ulaval/BorealTC. + +
+
+ comment: Accepted to the 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ Universal Trajectory Optimization Framework for Differential Drive Robot + Class + + +
+ Differential drive robots are widely used in various scenarios thanks to +their straightforward principle, from household service robots to disaster +response field robots. There are several types of driving mechanisms for +real-world applications, including two-wheeled, four-wheeled skid-steering, +tracked robots, and so on. The differences in the driving mechanisms usually +require specific kinematic modeling when precise control is desired. +Furthermore, the nonholonomic dynamics and possible lateral slip lead to +different degrees of difficulty in getting feasible and high-quality +trajectories. Therefore, a comprehensive trajectory optimization framework to +compute trajectories efficiently for various kinds of differential drive robots +is highly desirable. In this paper, we propose a universal trajectory +optimization framework that can be applied to differential drive robots, +enabling the generation of high-quality trajectories within a restricted +computational timeframe. We introduce a novel trajectory representation based +on polynomial parameterization of motion states or their integrals, such as +angular and linear velocities, which inherently matches the robots' motion to +the control principle. The trajectory optimization problem is formulated to +minimize complexity while prioritizing safety and operational efficiency. We +then build a full-stack autonomous planning and control system to demonstrate +its feasibility and robustness. We conduct extensive simulations and real-world +testing in crowded environments with three kinds of differential drive robots +to validate the effectiveness of our approach. + +
+
+ comment: 15 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ AnySkin: Plug-and-play Skin Sensing for Robotic Touch + + +
+ While tactile sensing is widely accepted as an important and useful sensing +modality, its use pales in comparison to other sensory modalities like vision +and proprioception. AnySkin addresses the critical challenges that impede the +use of tactile sensing -- versatility, replaceability, and data reusability. +Building on the simplistic design of ReSkin, and decoupling the sensing +electronics from the sensing interface, AnySkin simplifies integration making +it as straightforward as putting on a phone case and connecting a charger. +Furthermore, AnySkin is the first uncalibrated tactile-sensor with +cross-instance generalizability of learned manipulation policies. To summarize, +this work makes three key contributions: first, we introduce a streamlined +fabrication process and a design tool for creating an adhesive-free, durable +and easily replaceable magnetic tactile sensor; second, we characterize slip +detection and policy learning with the AnySkin sensor; and third, we +demonstrate zero-shot generalization of models trained on one instance of +AnySkin to new instances, and compare it with popular existing tactile +solutions like DIGIT and ReSkin. Videos of experiments, fabrication details and +design files can be found on https://any-skin.github.io/ + +
+
+
+
+
+ + ♻ ☆ Soft Acoustic Curvature Sensor: Design and Development + + +
+ This paper introduces a novel Soft Acoustic Curvature (SAC) sensor. SAC +incorporates integrated audio components and features an acoustic channel +within a flexible structure. A reference acoustic wave, generated by a speaker +at one end of the channel, propagates and is received by a microphone at the +other channel's end. Our previous study revealed that acoustic wave energy +dissipation varies with acoustic channel deformation, leading us to design a +novel channel capable of large deformation due to bending. We then use Machine +Learning (ML) models to establish a complex mapping between channel +deformations and sound modulation. Various sound frequencies and ML models were +evaluated to enhance curvature detection accuracy. The sensor, constructed +using soft material and 3D printing, was validated experimentally, with +curvature measurement errors remaining within 3.5 m-1 for a range of 0 to 60 +m-1 curvatures. These results demonstrate the effectiveness of the proposed +method for estimating curvatures. With its flexible structure, the SAC sensor +holds potential for applications in soft robotics, including shape measurement +for continuum manipulators, soft grippers, and wearable devices. + +
+
+ comment: To appear in Robotics and Automation Letter +
+
+
+
+
+ + ♻ ☆ Deep Bayesian Future Fusion for Self-Supervised, High-Resolution, + Off-Road Mapping + + +
+ High-speed off-road navigation requires long-range, high-resolution maps to +enable robots to safely navigate over different surfaces while avoiding +dangerous obstacles. However, due to limited computational power and sensing +noise, most approaches to off-road mapping focus on producing coarse (20-40cm) +maps of the environment. In this paper, we propose Future Fusion, a framework +capable of generating dense, high-resolution maps from sparse sensing data (30m +forward at 2cm). This is accomplished by - (1) the efficient realization of the +well-known Bayes filtering within the standard deep learning models that +explicitly accounts for the sparsity pattern in stereo and LiDAR depth data, +and (2) leveraging perceptual losses common in generative image completion. The +proposed methodology outperforms the conventional baselines. Moreover, the +learned features and the completed dense maps lead to improvements in the +downstream navigation task. + +
+
+
+
+
+ + ♻ ☆ Learning Adaptive Multi-Objective Robot Navigation Incorporating + Demonstrations + + +
+ Preference-aligned robot navigation in human environments is typically +achieved through learning-based approaches, utilizing user feedback or +demonstrations for personalization. However, personal preferences are subject +to change and might even be context-dependent. Yet traditional reinforcement +learning (RL) approaches with static reward functions often fall short in +adapting to these varying user preferences, inevitably reflecting +demonstrations once training is completed. This paper introduces a framework +that combines multi-objective reinforcement learning (MORL) with +demonstration-based learning. Our approach allows for dynamic adaptation to +changing user preferences without retraining. It fluently modulates between +reward-defined preference objectives and the amount of demonstration data +reflection. Through rigorous evaluations, including a sim-to-real transfer on +two robots, we demonstrate our framework's capability to reflect user +preferences accurately while achieving high navigational performance in terms +of collision avoidance and goal pursuance. + +
+
+
+
+
+ + ♻ ☆ TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for + Robotic Manipulation + + +
+ Vision-Language-Action (VLA) models have shown remarkable potential in +visuomotor control and instruction comprehension through end-to-end learning +processes. However, current VLA models face significant challenges: they are +slow during inference and require extensive pre-training on large amounts of +robotic data, making real-world deployment difficult. In this paper, we +introduce a new family of compact vision-language-action models, called +TinyVLA, which offers two key advantages over existing VLA models: (1) faster +inference speeds, and (2) improved data efficiency, eliminating the need for +pre-training stage. Our framework incorporates two essential components to +build TinyVLA: (1) initializing the policy backbone with robust, high-speed +multimodal models, and (2) integrating a diffusion policy decoder during +fine-tuning to enable precise robot actions. We conducted extensive evaluations +of TinyVLA in both simulation and on real robots, demonstrating that our +approach significantly outperforms the state-of-the-art VLA model, OpenVLA, in +terms of speed and data efficiency, while delivering comparable or superior +performance. Additionally, TinyVLA exhibits strong generalization capabilities +across various dimensions, including language instructions, novel objects, +unseen positions, changes in object appearance, background variations, and +environmental shifts, often matching or exceeding the performance of OpenVLA. +We believe that \methodname offers an interesting perspective on utilizing +pre-trained multimodal models for policy learning. Our project is at +https://tiny-vla.github.io. + +
+
+ comment: add more citations +
+
+
+
+
+ + ♻ ☆ FracGM: A Fast Fractional Programming Technique for Geman-McClure Robust + Estimator + + +
+ Robust estimation is essential in computer vision, robotics, and navigation, +aiming to minimize the impact of outlier measurements for improved accuracy. We +present a fast algorithm for Geman-McClure robust estimation, FracGM, +leveraging fractional programming techniques. This solver reformulates the +original non-convex fractional problem to a convex dual problem and a linear +equation system, iteratively solving them in an alternating optimization +pattern. Compared to graduated non-convexity approaches, this strategy exhibits +a faster convergence rate and better outlier rejection capability. In addition, +the global optimality of the proposed solver can be guaranteed under given +conditions. We demonstrate the proposed FracGM solver with Wahba's rotation +problem and 3-D point-cloud registration along with relaxation pre-processing +and projection post-processing. Compared to state-of-the-art algorithms, when +the outlier rates increase from 20% to 80%, FracGM shows 53% and 88% lower +rotation and translation increases. In real-world scenarios, FracGM achieves +better results in 13 out of 18 outcomes, while having a 19.43% improvement in +the computation time. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Comparing Apples to Oranges: LLM-powered Multimodal Intention Prediction + in an Object Categorization Task + + +
+ Human intention-based systems enable robots to perceive and interpret user +actions to interact with humans and adapt to their behavior proactively. +Therefore, intention prediction is pivotal in creating a natural interaction +with social robots in human-designed environments. In this paper, we examine +using Large Language Models (LLMs) to infer human intention in a collaborative +object categorization task with a physical robot. We propose a novel multimodal +approach that integrates user non-verbal cues, like hand gestures, body poses, +and facial expressions, with environment states and user verbal cues to predict +user intentions in a hierarchical architecture. Our evaluation of five LLMs +shows the potential for reasoning about verbal and non-verbal user cues, +leveraging their context-understanding and real-world knowledge to support +intention prediction while collaborating on a task with a social robot. + +
+
+ comment: Accepted at ICSR 2024,14 pages,5 figures,2 tables; work was co-funded + by Horizon Europe project TERAIS under Grant agreement number 101079338 +
+
+
+
+
+ + ♻ ☆ Commonsense Scene Graph-based Target Localization for Object Search + + +
+ Object search is a fundamental skill for household robots, yet the core +problem lies in the robot's ability to locate the target object accurately. The +dynamic nature of household environments, characterized by the arbitrary +placement of daily objects by users, makes it challenging to perform target +localization. To efficiently locate the target object, the robot needs to be +equipped with knowledge at both the object and room level. However, existing +approaches rely solely on one type of knowledge, leading to unsatisfactory +object localization performance and, consequently, inefficient object search +processes. To address this problem, we propose a commonsense scene graph-based +target localization, CSG-TL, to enhance target object search in the household +environment. Given the pre-built map with stationary items, the robot models +the room-level knowledge with object-level commonsense knowledge generated by a +large language model (LLM) to a commonsense scene graph (CSG), supporting both +types of knowledge for CSG-TL. To demonstrate the superiority of CSG-TL on +target localization, extensive experiments are performed on the real-world +ScanNet dataset and the AI2THOR simulator. Moreover, we have extended CSG-TL to +an object search framework, CSG-OS, validated in both simulated and real-world +environments. Code and videos are available at +https://sites.google.com/view/csg-os. + +
+
+
+
+
+ + ♻ ☆ Query-based Semantic Gaussian Field for Scene Representation in + Reinforcement Learning + + +
+ Latent scene representation plays a significant role in training +reinforcement learning (RL) agents. To obtain good latent vectors describing +the scenes, recent works incorporate the 3D-aware latent-conditioned NeRF +pipeline into scene representation learning. However, these NeRF-related +methods struggle to perceive 3D structural information due to the inefficient +dense sampling in volumetric rendering. Moreover, they lack fine-grained +semantic information included in their scene representation vectors because +they evenly consider free and occupied spaces. Both of them can destroy the +performance of downstream RL tasks. To address the above challenges, we propose +a novel framework that adopts the efficient 3D Gaussian Splatting (3DGS) to +learn 3D scene representation for the first time. In brief, we present the +Query-based Generalizable 3DGS to bridge the 3DGS technique and scene +representations with more geometrical awareness than those in NeRFs. Moreover, +we present the Hierarchical Semantics Encoding to ground the fine-grained +semantic features to 3D Gaussians and further distilled to the scene +representation vectors. We conduct extensive experiments on two RL platforms +including Maniskill2 and Robomimic across 10 different tasks. The results show +that our method outperforms the other 5 baselines by a large margin. We achieve +the best success rates on 8 tasks and the second-best on the other two tasks. + +
+
+
+
+
+ + ♻ ☆ TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and + Proprioception Estimation CoRL 2024 + + +
+ Legged navigation is typically examined within open-world, off-road, and +challenging environments. In these scenarios, estimating external disturbances +requires a complex synthesis of multi-modal information. This underlines a +major limitation in existing works that primarily focus on avoiding obstacles. +In this work, we propose TOP-Nav, a novel legged navigation framework that +integrates a comprehensive path planner with Terrain awareness, Obstacle +avoidance and close-loop Proprioception. TOP-Nav underscores the synergies +between vision and proprioception in both path and motion planning. Within the +path planner, we present and integrate a terrain estimator that enables the +robot to select waypoints on terrains with higher traversability while +effectively avoiding obstacles. In the motion planning level, we not only +implement a locomotion controller to track the navigation commands, but also +construct a proprioception advisor to provide motion evaluations for the path +planner. Based on the close-loop motion feedback, we make online corrections +for the vision-based terrain and obstacle estimations. Consequently, TOP-Nav +achieves open-world navigation that the robot can handle terrains or +disturbances beyond the distribution of prior knowledge and overcomes +constraints imposed by visual conditions. Building upon extensive experiments +conducted in both simulation and real-world environments, TOP-Nav demonstrates +superior performance in open-world navigation compared to existing methods. + +
+
+ comment: Published on CoRL 2024 +
+
+
+
+
+ + ♻ ☆ Enabling On-Chip High-Frequency Adaptive Linear Optimal Control via + Linearized Gaussian Process + + +
+ Unpredictable and complex aerodynamic effects pose significant challenges to +achieving precise flight control, such as the downwash effect from upper +vehicles to lower ones. Conventional methods often struggle to accurately model +these interactions, leading to controllers that require large safety margins +between vehicles. Moreover, the controller on real drones usually requires +high-frequency and has limited on-chip computation, making the adaptive control +design more difficult to implement. To address these challenges, we incorporate +Gaussian process (GP) to model the adaptive external aerodynamics with linear +model predictive control. The GP is linearized to enable real-time +high-frequency solutions. Moreover, to handle the error caused by +linearization, we integrate end-to-end Bayesian optimization during sample +collection stages to improve the control performance. Experimental results on +both simulations and real quadrotors show that we can achieve real-time +solvable computation speed with acceptable tracking errors. + +
+
+
+
+
+ + ♻ ☆ Compact 3D Gaussian Splatting For Dense Visual SLAM + + +
+ Recent work has shown that 3D Gaussian-based SLAM enables high-quality +reconstruction, accurate pose estimation, and real-time rendering of scenes. +However, these approaches are built on a tremendous number of redundant 3D +Gaussian ellipsoids, leading to high memory and storage costs, and slow +training speed. To address the limitation, we propose a compact 3D Gaussian +Splatting SLAM system that reduces the number and the parameter size of +Gaussian ellipsoids. A sliding window-based masking strategy is first proposed +to reduce the redundant ellipsoids. Then we observe that the covariance matrix +(geometry) of most 3D Gaussian ellipsoids are extremely similar, which +motivates a novel geometry codebook to compress 3D Gaussian geometric +attributes, i.e., the parameters. Robust and accurate pose estimation is +achieved by a global bundle adjustment method with reprojection loss. Extensive +experiments demonstrate that our method achieves faster training and rendering +speed while maintaining the state-of-the-art (SOTA) quality of the scene +representation. + +
+
+
+
+
+ + ♻ ☆ Constraint-Guided Online Data Selection for Scalable Data-Driven Safety + Filters in Uncertain Robotic Systems + + +
+ As the use of autonomous robots expands in tasks that are complex and +challenging to model, the demand for robust data-driven control methods that +can certify safety and stability in uncertain conditions is increasing. +However, the practical implementation of these methods often faces scalability +issues due to the growing amount of data points with system complexity, and a +significant reliance on high-quality training data. In response to these +challenges, this study presents a scalable data-driven controller that +efficiently identifies and infers from the most informative data points for +implementing data-driven safety filters. Our approach is grounded in the +integration of a model-based certificate function-based method and Gaussian +Process (GP) regression, reinforced by a novel online data selection algorithm +that reduces time complexity from quadratic to linear relative to dataset size. +Empirical evidence, gathered from successful real-world cart-pole swing-up +experiments and simulated locomotion of a five-link bipedal robot, demonstrates +the efficacy of our approach. Our findings reveal that our efficient online +data selection algorithm, which strategically selects key data points, enhances +the practicality and efficiency of data-driven certifying filters in complex +robotic systems, significantly mitigating scalability concerns inherent in +nonparametric learning-based control methods. + +
+
+ comment: The first three authors contributed equally to the work. This work + has been submitted to the IEEE for possible publication. Copyright may be + transferred without notice, after which this version may no longer be + accessible +
+
+
+
+
+ + ♻ ☆ AquaMILR: Mechanical intelligence simplifies control of undulatory + robots in cluttered fluid environments + + +
+ While undulatory swimming of elongate limbless robots has been extensively +studied in open hydrodynamic environments, less research has been focused on +limbless locomotion in complex, cluttered aquatic environments. Motivated by +the concept of mechanical intelligence, where controls for obstacle navigation +can be offloaded to passive body mechanics in terrestrial limbless locomotion, +we hypothesize that principles of mechanical intelligence can be extended to +cluttered hydrodynamic regimes. To test this, we developed an untethered +limbless robot capable of undulatory swimming on water surfaces, utilizing a +bilateral cable-driven mechanism inspired by organismal muscle actuation +morphology to achieve programmable anisotropic body compliance. We demonstrated +through robophysical experiments that, similar to terrestrial locomotion, an +appropriate level of body compliance can facilitate emergent swim through +complex hydrodynamic environments under pure open-loop control. Moreover, we +found that swimming performance depends on undulation frequency, with effective +locomotion achieved only within a specific frequency range. This contrasts with +highly damped terrestrial regimes, where inertial effects can often be +neglected. Further, to enhance performance and address the challenges posed by +nondeterministic obstacle distributions, we incorporated computational +intelligence by developing a real-time body compliance tuning controller based +on cable tension feedback. This controller improves the robot's robustness and +overall speed in heterogeneous hydrodynamic environments. + +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations RSS + + +
+ Imitation learning provides an efficient way to teach robots dexterous +skills; however, learning complex skills robustly and generalizablely usually +consumes large amounts of human demonstrations. To tackle this challenging +problem, we present 3D Diffusion Policy (DP3), a novel visual imitation +learning approach that incorporates the power of 3D visual representations into +diffusion policies, a class of conditional action generative models. The core +design of DP3 is the utilization of a compact 3D visual representation, +extracted from sparse point clouds with an efficient point encoder. In our +experiments involving 72 simulation tasks, DP3 successfully handles most tasks +with just 10 demonstrations and surpasses baselines with a 24.2% relative +improvement. In 4 real robot tasks, DP3 demonstrates precise control with a +high success rate of 85%, given only 40 demonstrations of each task, and shows +excellent generalization abilities in diverse aspects, including space, +viewpoint, appearance, and instance. Interestingly, in real robot experiments, +DP3 rarely violates safety requirements, in contrast to baseline methods which +frequently do, necessitating human intervention. Our extensive evaluation +highlights the critical importance of 3D representations in real-world robot +learning. Videos, code, and data are available on +https://3d-diffusion-policy.github.io . + +
+
+ comment: Published at Robotics: Science and Systems (RSS) 2024. Videos, code, + and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ AED: Adaptable Error Detection for Few-shot Imitation Policy NeurIPS2024 + + +
+ We introduce a new task called Adaptable Error Detection (AED), which aims to +identify behavior errors in few-shot imitation (FSI) policies based on visual +observations in novel environments. The potential to cause serious damage to +surrounding areas limits the application of FSI policies in real-world +scenarios. Thus, a robust system is necessary to notify operators when FSI +policies are inconsistent with the intent of demonstrations. This task +introduces three challenges: (1) detecting behavior errors in novel +environments, (2) identifying behavior errors that occur without revealing +notable changes, and (3) lacking complete temporal information of the rollout +due to the necessity of online detection. However, the existing benchmarks +cannot support the development of AED because their tasks do not present all +these challenges. To this end, we develop a cross-domain AED benchmark, +consisting of 322 base and 153 novel environments. Additionally, we propose +Pattern Observer (PrObe) to address these challenges. PrObe is equipped with a +powerful pattern extractor and guided by novel learning objectives to parse +discernible patterns in the policy feature representations of normal or error +states. Through our comprehensive evaluation, PrObe demonstrates superior +capability to detect errors arising from a wide range of FSI policies, +consistently surpassing strong baselines. Moreover, we conduct detailed +ablations and a pilot study on error correction to validate the effectiveness +of the proposed architecture design and the practicality of the AED task, +respectively. + +
+
+ comment: Accepted to NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ Speech-Guided Sequential Planning for Autonomous Navigation using Large + Language Model Meta AI 3 (Llama3) + + +
+ In social robotics, a pivotal focus is enabling robots to engage with humans +in a more natural and seamless manner. The emergence of advanced large language +models (LLMs) such as Generative Pre-trained Transformers (GPTs) and +autoregressive models like Large Language Model Meta AI (Llamas) has driven +significant advancements in integrating natural language understanding +capabilities into social robots. This paper presents a system for speech-guided +sequential planning in autonomous navigation, utilizing Llama3 and the Robot +Operating System~(ROS). The proposed system involves using Llama3 to interpret +voice commands, extracting essential details through parsing, and decoding +these commands into sequential actions for tasks. Such sequential planning is +essential in various domains, particularly in the pickup and delivery of an +object. Once a sequential navigation task is evaluated, we employ DRL-VO, a +learning-based control policy that allows a robot to autonomously navigate +through social spaces with static infrastructure and (crowds of) people. We +demonstrate the effectiveness of the system in simulation experiment using +Turtlebot 2 in ROS1 and Turtlebot 3 in ROS2. We conduct hardware trials using a +Clearpath Robotics Jackal UGV, highlighting its potential for real-world +deployment in scenarios requiring flexible and interactive robotic behaviors. + +
+
+ comment: Accepted at the 16th International Conference on Social Robotics + AI +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 126 + +
+
+
+ + ☆ PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation ECCV 2024 + + +
+ We present PhysGen, a novel image-to-video generation method that converts a +single image and an input condition (e.g., force and torque applied to an +object in the image) to produce a realistic, physically plausible, and +temporally consistent video. Our key insight is to integrate model-based +physical simulation with a data-driven video generation process, enabling +plausible image-space dynamics. At the heart of our system are three core +components: (i) an image understanding module that effectively captures the +geometry, materials, and physical parameters of the image; (ii) an image-space +dynamics simulation model that utilizes rigid-body physics and inferred +parameters to simulate realistic behaviors; and (iii) an image-based rendering +and refinement module that leverages generative video diffusion to produce +realistic video footage featuring the simulated motion. The resulting videos +are realistic in both physics and appearance and are even precisely +controllable, showcasing superior results over existing data-driven +image-to-video generation works through quantitative comparison and +comprehensive user study. PhysGen's resulting videos can be used for various +downstream applications, such as turning an image into a realistic animation or +allowing users to interact with the image and create various dynamics. Project +page: https://stevenlsw.github.io/physgen/ + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://stevenlsw.github.io/physgen/ +
+
+
+
+
+ + ☆ Exploring Token Pruning in Vision State Space Models NeurIPS'24 + + +
+ State Space Models (SSMs) have the advantage of keeping linear computational +complexity compared to attention modules in transformers, and have been applied +to vision tasks as a new type of powerful vision foundation model. Inspired by +the observations that the final prediction in vision transformers (ViTs) is +only based on a subset of most informative tokens, we take the novel step of +enhancing the efficiency of SSM-based vision models through token-based +pruning. However, direct applications of existing token pruning techniques +designed for ViTs fail to deliver good performance, even with extensive +fine-tuning. To address this issue, we revisit the unique computational +characteristics of SSMs and discover that naive application disrupts the +sequential token positions. This insight motivates us to design a novel and +general token pruning method specifically for SSM-based vision models. We first +introduce a pruning-aware hidden state alignment method to stabilize the +neighborhood of remaining tokens for performance enhancement. Besides, based on +our detailed analysis, we propose a token importance evaluation method adapted +for SSM models, to guide the token pruning. With efficient implementation and +practical acceleration methods, our method brings actual speedup. Extensive +experiments demonstrate that our approach can achieve significant computation +reduction with minimal impact on performance across different tasks. Notably, +we achieve 81.7\% accuracy on ImageNet with a 41.6\% reduction in the FLOPs for +pruned PlainMamba-L3. Furthermore, our work provides deeper insights into +understanding the behavior of SSM-based vision models for future research. + +
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ☆ ProMerge: Prompt and Merge for Unsupervised Instance Segmentation ECCV2024 + + +
+ Unsupervised instance segmentation aims to segment distinct object instances +in an image without relying on human-labeled data. This field has recently seen +significant advancements, partly due to the strong local correspondences +afforded by rich visual feature representations from self-supervised models +(e.g., DINO). Recent state-of-the-art approaches use self-supervised features +to represent images as graphs and solve a generalized eigenvalue system (i.e., +normalized-cut) to generate foreground masks. While effective, this strategy is +limited by its attendant computational demands, leading to slow inference +speeds. In this paper, we propose Prompt and Merge (ProMerge), which leverages +self-supervised visual features to obtain initial groupings of patches and +applies a strategic merging to these segments, aided by a sophisticated +background-based mask pruning technique. ProMerge not only yields competitive +results but also offers a significant reduction in inference time compared to +state-of-the-art normalized-cut-based approaches. Furthermore, when training an +object detector using our mask predictions as pseudo-labels, the resulting +detector surpasses the current leading unsupervised model on various +challenging instance segmentation benchmarks. + +
+
+ comment: ECCV2024 camera-ready +
+
+
+
+
+ + ☆ UniCal: Unified Neural Sensor Calibration ECCV 2024 + + +
+ Self-driving vehicles (SDVs) require accurate calibration of LiDARs and +cameras to fuse sensor data accurately for autonomy. Traditional calibration +methods typically leverage fiducials captured in a controlled and structured +scene and compute correspondences to optimize over. These approaches are costly +and require substantial infrastructure and operations, making it challenging to +scale for vehicle fleets. In this work, we propose UniCal, a unified framework +for effortlessly calibrating SDVs equipped with multiple LiDARs and cameras. +Our approach is built upon a differentiable scene representation capable of +rendering multi-view geometrically and photometrically consistent sensor +observations. We jointly learn the sensor calibration and the underlying scene +representation through differentiable volume rendering, utilizing outdoor +sensor data without the need for specific calibration fiducials. This +"drive-and-calibrate" approach significantly reduces costs and operational +overhead compared to existing calibration systems, enabling efficient +calibration for large SDV fleets at scale. To ensure geometric consistency +across observations from different sensors, we introduce a novel surface +alignment loss that combines feature-based registration with neural rendering. +Comprehensive evaluations on multiple datasets demonstrate that UniCal +outperforms or matches the accuracy of existing calibration approaches while +being more efficient, demonstrating the value of UniCal for scalable +calibration. + +
+
+ comment: ECCV 2024. Project page: https://waabi.ai/unical/ +
+
+
+
+
+ + ☆ Spectral Wavelet Dropout: Regularization in the Wavelet Domain ICML + + +
+ Regularization techniques help prevent overfitting and therefore improve the +ability of convolutional neural networks (CNNs) to generalize. One reason for +overfitting is the complex co-adaptations among different parts of the network, +which make the CNN dependent on their joint response rather than encouraging +each part to learn a useful feature representation independently. Frequency +domain manipulation is a powerful strategy for modifying data that has temporal +and spatial coherence by utilizing frequency decomposition. This work +introduces Spectral Wavelet Dropout (SWD), a novel regularization method that +includes two variants: 1D-SWD and 2D-SWD. These variants improve CNN +generalization by randomly dropping detailed frequency bands in the discrete +wavelet decomposition of feature maps. Our approach distinguishes itself from +the pre-existing Spectral "Fourier" Dropout (2D-SFD), which eliminates +coefficients in the Fourier domain. Notably, SWD requires only a single +hyperparameter, unlike the two required by SFD. We also extend the literature +by implementing a one-dimensional version of Spectral "Fourier" Dropout +(1D-SFD), setting the stage for a comprehensive comparison. Our evaluation +shows that both 1D and 2D SWD variants have competitive performance on +CIFAR-10/100 benchmarks relative to both 1D-SFD and 2D-SFD. Specifically, +1D-SWD has a significantly lower computational complexity compared to +1D/2D-SFD. In the Pascal VOC Object Detection benchmark, SWD variants surpass +1D-SFD and 2D-SFD in performance and demonstrate lower computational complexity +during training. + +
+
+ comment: Accepted by The International Conference on Machine Learning and + Applications (ICMLA) 2024 +
+
+
+
+
+ + ☆ From Seconds to Hours: Reviewing MultiModal Large Language Models on + Comprehensive Long Video Understanding + + +
+ The integration of Large Language Models (LLMs) with visual encoders has +recently shown promising performance in visual understanding tasks, leveraging +their inherent capability to comprehend and generate human-like text for visual +reasoning. Given the diverse nature of visual data, MultiModal Large Language +Models (MM-LLMs) exhibit variations in model designing and training for +understanding images, short videos, and long videos. Our paper focuses on the +substantial differences and unique challenges posed by long video understanding +compared to static image and short video understanding. Unlike static images, +short videos encompass sequential frames with both spatial and within-event +temporal information, while long videos consist of multiple events with +between-event and long-term temporal information. In this survey, we aim to +trace and summarize the advancements of MM-LLMs from image understanding to +long video understanding. We review the differences among various visual +understanding tasks and highlight the challenges in long video understanding, +including more fine-grained spatiotemporal details, dynamic events, and +long-term dependencies. We then provide a detailed summary of the advancements +in MM-LLMs in terms of model design and training methodologies for +understanding long videos. Finally, we compare the performance of existing +MM-LLMs on video understanding benchmarks of various lengths and discuss +potential future directions for MM-LLMs in long video understanding. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ ReviveDiff: A Universal Diffusion Model for Restoring Images in Adverse + Weather Conditions + + +
+ Images captured in challenging environments--such as nighttime, foggy, rainy +weather, and underwater--often suffer from significant degradation, resulting +in a substantial loss of visual quality. Effective restoration of these +degraded images is critical for the subsequent vision tasks. While many +existing approaches have successfully incorporated specific priors for +individual tasks, these tailored solutions limit their applicability to other +degradations. In this work, we propose a universal network architecture, dubbed +"ReviveDiff", which can address a wide range of degradations and bring images +back to life by enhancing and restoring their quality. Our approach is inspired +by the observation that, unlike degradation caused by movement or electronic +issues, quality degradation under adverse conditions primarily stems from +natural media (such as fog, water, and low luminance), which generally +preserves the original structures of objects. To restore the quality of such +images, we leveraged the latest advancements in diffusion models and developed +ReviveDiff to restore image quality from both macro and micro levels across +some key factors determining image quality, such as sharpness, distortion, +noise level, dynamic range, and color accuracy. We rigorously evaluated +ReviveDiff on seven benchmark datasets covering five types of degrading +conditions: Rainy, Underwater, Low-light, Smoke, and Nighttime Hazy. Our +experimental results demonstrate that ReviveDiff outperforms the +state-of-the-art methods both quantitatively and visually. + +
+
+
+
+
+ + ☆ SurfaceAI: Automated creation of cohesive road surface quality datasets + based on open street-level imagery + + +
+ This paper introduces SurfaceAI, a pipeline designed to generate +comprehensive georeferenced datasets on road surface type and quality from +openly available street-level imagery. The motivation stems from the +significant impact of road unevenness on the safety and comfort of traffic +participants, especially vulnerable road users, emphasizing the need for +detailed road surface data in infrastructure modeling and analysis. SurfaceAI +addresses this gap by leveraging crowdsourced Mapillary data to train models +that predict the type and quality of road surfaces visible in street-level +images, which are then aggregated to provide cohesive information on entire +road segment conditions. + +
+
+ comment: 4 pages, 2 figures; accepted at 2nd ACM SIGSPATIAL International + Workshop on Advances in Urban-AI +
+
+
+
+
+ + ☆ Improving Visual Object Tracking through Visual Prompting + + +
+ Learning a discriminative model to distinguish a target from its surrounding +distractors is essential to generic visual object tracking. Dynamic target +representation adaptation against distractors is challenging due to the limited +discriminative capabilities of prevailing trackers. We present a new visual +Prompting mechanism for generic Visual Object Tracking (PiVOT) to address this +issue. PiVOT proposes a prompt generation network with the pre-trained +foundation model CLIP to automatically generate and refine visual prompts, +enabling the transfer of foundation model knowledge for tracking. While CLIP +offers broad category-level knowledge, the tracker, trained on +instance-specific data, excels at recognizing unique object instances. Thus, +PiVOT first compiles a visual prompt highlighting potential target locations. +To transfer the knowledge of CLIP to the tracker, PiVOT leverages CLIP to +refine the visual prompt based on the similarities between candidate objects +and the reference templates across potential targets. Once the visual prompt is +refined, it can better highlight potential target locations, thereby reducing +irrelevant prompt information. With the proposed prompting mechanism, the +tracker can generate improved instance-aware feature maps through the guidance +of the visual prompt, thus effectively reducing distractors. The proposed +method does not involve CLIP during training, thereby keeping the same training +complexity and preserving the generalization capability of the pretrained +foundation model. Extensive experiments across multiple benchmarks indicate +that PiVOT, using the proposed prompting method can suppress distracting +objects and enhance the tracker. + +
+
+ comment: Accepted and to appear in IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ Unsupervised Low-light Image Enhancement with Lookup Tables and + Diffusion Priors + + +
+ Low-light image enhancement (LIE) aims at precisely and efficiently +recovering an image degraded in poor illumination environments. Recent advanced +LIE techniques are using deep neural networks, which require lots of low-normal +light image pairs, network parameters, and computational resources. As a +result, their practicality is limited. In this work, we devise a novel +unsupervised LIE framework based on diffusion priors and lookup tables (DPLUT) +to achieve efficient low-light image recovery. The proposed approach comprises +two critical components: a light adjustment lookup table (LLUT) and a noise +suppression lookup table (NLUT). LLUT is optimized with a set of unsupervised +losses. It aims at predicting pixel-wise curve parameters for the dynamic range +adjustment of a specific image. NLUT is designed to remove the amplified noise +after the light brightens. As diffusion models are sensitive to noise, +diffusion priors are introduced to achieve high-performance noise suppression. +Extensive experiments demonstrate that our approach outperforms +state-of-the-art methods in terms of visual quality and efficiency. + +
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ Detecting Dataset Abuse in Fine-Tuning Stable Diffusion Models for + Text-to-Image Synthesis + + +
+ Text-to-image synthesis has become highly popular for generating realistic +and stylized images, often requiring fine-tuning generative models with +domain-specific datasets for specialized tasks. However, these valuable +datasets face risks of unauthorized usage and unapproved sharing, compromising +the rights of the owners. In this paper, we address the issue of dataset abuse +during the fine-tuning of Stable Diffusion models for text-to-image synthesis. +We present a dataset watermarking framework designed to detect unauthorized +usage and trace data leaks. The framework employs two key strategies across +multiple watermarking schemes and is effective for large-scale dataset +authorization. Extensive experiments demonstrate the framework's effectiveness, +minimal impact on the dataset (only 2% of the data required to be modified for +high detection accuracy), and ability to trace data leaks. Our results also +highlight the robustness and transferability of the framework, proving its +practical applicability in detecting dataset abuse. + +
+
+
+
+
+ + ☆ S2O: Static to Openable Enhancement for Articulated 3D Objects + + +
+ Despite much progress in large 3D datasets there are currently few +interactive 3D object datasets, and their scale is limited due to the manual +effort required in their construction. We introduce the static to openable +(S2O) task which creates interactive articulated 3D objects from static +counterparts through openable part detection, motion prediction, and interior +geometry completion. We formulate a unified framework to tackle this task, and +curate a challenging dataset of openable 3D objects that serves as a test bed +for systematic evaluation. Our experiments benchmark methods from prior work +and simple yet effective heuristics for the S2O task. We find that turning +static 3D objects into interactively openable counterparts is possible but that +all methods struggle to generalize to realistic settings of the task, and we +highlight promising future work directions. + +
+
+
+
+
+ + ☆ Explainable Artifacts for Synthetic Western Blot Source Attribution + + +
+ Recent advancements in artificial intelligence have enabled generative models +to produce synthetic scientific images that are indistinguishable from pristine +ones, posing a challenge even for expert scientists habituated to working with +such content. When exploited by organizations known as paper mills, which +systematically generate fraudulent articles, these technologies can +significantly contribute to the spread of misinformation about ungrounded +science, potentially undermining trust in scientific research. While previous +studies have explored black-box solutions, such as Convolutional Neural +Networks, for identifying synthetic content, only some have addressed the +challenge of generalizing across different models and providing insight into +the artifacts in synthetic images that inform the detection process. This study +aims to identify explainable artifacts generated by state-of-the-art generative +models (e.g., Generative Adversarial Networks and Diffusion Models) and +leverage them for open-set identification and source attribution (i.e., +pointing to the model that created the image). + +
+
+ comment: Accepted in IEEE International Workshop on Information Forensics and + Security - WIFS 2024, Rome, Italy +
+
+
+
+
+ + ☆ UniEmoX: Cross-modal Semantic-Guided Large-Scale Pretraining for + Universal Scene Emotion Perception + + +
+ Visual emotion analysis holds significant research value in both computer +vision and psychology. However, existing methods for visual emotion analysis +suffer from limited generalizability due to the ambiguity of emotion perception +and the diversity of data scenarios. To tackle this issue, we introduce +UniEmoX, a cross-modal semantic-guided large-scale pretraining framework. +Inspired by psychological research emphasizing the inseparability of the +emotional exploration process from the interaction between individuals and +their environment, UniEmoX integrates scene-centric and person-centric +low-level image spatial structural information, aiming to derive more nuanced +and discriminative emotional representations. By exploiting the similarity +between paired and unpaired image-text samples, UniEmoX distills rich semantic +knowledge from the CLIP model to enhance emotional embedding representations +more effectively. To the best of our knowledge, this is the first large-scale +pretraining framework that integrates psychological theories with contemporary +contrastive learning and masked image modeling techniques for emotion analysis +across diverse scenarios. Additionally, we develop a visual emotional dataset +titled Emo8. Emo8 samples cover a range of domains, including cartoon, natural, +realistic, science fiction and advertising cover styles, covering nearly all +common emotional scenes. Comprehensive experiments conducted on six benchmark +datasets across two downstream tasks validate the effectiveness of UniEmoX. The +source code is available at https://github.com/chincharles/u-emo. + +
+
+ comment: Submitted to TIP +
+
+
+
+
+ + ☆ CemiFace: Center-based Semi-hard Synthetic Face Generation for Face + Recognition NeurIPS 2024 + + +
+ Privacy issue is a main concern in developing face recognition techniques. +Although synthetic face images can partially mitigate potential legal risks +while maintaining effective face recognition (FR) performance, FR models +trained by face images synthesized by existing generative approaches frequently +suffer from performance degradation problems due to the insufficient +discriminative quality of these synthesized samples. In this paper, we +systematically investigate what contributes to solid face recognition model +training, and reveal that face images with certain degree of similarities to +their identity centers show great effectiveness in the performance of trained +FR models. Inspired by this, we propose a novel diffusion-based approach +(namely Center-based Semi-hard Synthetic Face Generation (CemiFace)) which +produces facial samples with various levels of similarity to the subject +center, thus allowing to generate face datasets containing effective +discriminative samples for training face recognition. Experimental results show +that with a modest degree of similarity, training on the generated dataset can +produce competitive performance compared to previous generation methods. + +
+
+ comment: accepted to NeurIPS 2024. We are preparing the camera-ready version + according to the reviews +
+
+
+
+
+ + ☆ Simulating Dynamic Tumor Contrast Enhancement in Breast MRI using + Conditional Generative Adversarial Networks + + +
+ This paper presents a method for virtual contrast enhancement in breast MRI, +offering a promising non-invasive alternative to traditional contrast +agent-based DCE-MRI acquisition. Using a conditional generative adversarial +network, we predict DCE-MRI images, including jointly-generated sequences of +multiple corresponding DCE-MRI timepoints, from non-contrast-enhanced MRIs, +enabling tumor localization and characterization without the associated health +risks. Furthermore, we qualitatively and quantitatively evaluate the synthetic +DCE-MRI images, proposing a multi-metric Scaled Aggregate Measure (SAMe), +assessing their utility in a tumor segmentation downstream task, and conclude +with an analysis of the temporal patterns in multi-sequence DCE-MRI generation. +Our approach demonstrates promising results in generating realistic and useful +DCE-MRI sequences, highlighting the potential of virtual contrast enhancement +for improving breast cancer diagnosis and treatment, particularly for patients +where contrast agent administration is contraindicated. + +
+
+
+
+
+ + ☆ Emu3: Next-Token Prediction is All You Need + + +
+ While next-token prediction is considered a promising path towards artificial +general intelligence, it has struggled to excel in multimodal tasks, which are +still dominated by diffusion models (e.g., Stable Diffusion) and compositional +approaches (e.g., CLIP combined with LLMs). In this paper, we introduce Emu3, a +new suite of state-of-the-art multimodal models trained solely with next-token +prediction. By tokenizing images, text, and videos into a discrete space, we +train a single transformer from scratch on a mixture of multimodal sequences. +Emu3 outperforms several well-established task-specific models in both +generation and perception tasks, surpassing flagship models such as SDXL and +LLaVA-1.6, while eliminating the need for diffusion or compositional +architectures. Emu3 is also capable of generating high-fidelity video via +predicting the next token in a video sequence. We simplify complex multimodal +model designs by converging on a singular focus: tokens, unlocking great +potential for scaling both during training and inference. Our results +demonstrate that next-token prediction is a promising path towards building +general multimodal intelligence beyond language. We open-source key techniques +and models to support further research in this direction. + +
+
+ comment: Project Page: https://emu.baai.ac.cn +
+
+
+
+
+ + ☆ MCUBench: A Benchmark of Tiny Object Detectors on MCUs + + +
+ We introduce MCUBench, a benchmark featuring over 100 YOLO-based object +detection models evaluated on the VOC dataset across seven different MCUs. This +benchmark provides detailed data on average precision, latency, RAM, and Flash +usage for various input resolutions and YOLO-based one-stage detectors. By +conducting a controlled comparison with a fixed training pipeline, we collect +comprehensive performance metrics. Our Pareto-optimal analysis shows that +integrating modern detection heads and training techniques allows various YOLO +architectures, including legacy models like YOLOv3, to achieve a highly +efficient tradeoff between mean Average Precision (mAP) and latency. MCUBench +serves as a valuable tool for benchmarking the MCU performance of contemporary +object detectors and aids in model selection based on specific constraints. + +
+
+ comment: Code and data are available at + https://github.com/Deeplite/deeplite-torch-zoo +
+
+
+
+
+ + ☆ Positional Encoder Graph Quantile Neural Networks for Geographic Data + + +
+ Positional Encoder Graph Neural Networks (PE-GNNs) are a leading approach for +modeling continuous spatial data. However, they often fail to produce +calibrated predictive distributions, limiting their effectiveness for +uncertainty quantification. We introduce the Positional Encoder Graph Quantile +Neural Network (PE-GQNN), a novel method that integrates PE-GNNs, Quantile +Neural Networks, and recalibration techniques in a fully nonparametric +framework, requiring minimal assumptions about the predictive distributions. We +propose a new network architecture that, when combined with a quantile-based +loss function, yields accurate and reliable probabilistic models without +increasing computational complexity. Our approach provides a flexible, robust +framework for conditional density estimation, applicable beyond spatial data +contexts. We further introduce a structured method for incorporating a KNN +predictor into the model while avoiding data leakage through the GNN layer +operation. Experiments on benchmark datasets demonstrate that PE-GQNN +significantly outperforms existing state-of-the-art methods in both predictive +accuracy and uncertainty quantification. + +
+
+ comment: 17 main text pages, 4 figures +
+
+
+
+
+ + ☆ LW2G: Learning Whether to Grow for Prompt-based Continual Learning + + +
+ Continual Learning (CL) aims to learn in non-stationary scenarios, +progressively acquiring and maintaining knowledge from sequential tasks. Recent +Prompt-based Continual Learning (PCL) has achieved remarkable performance with +Pre-Trained Models (PTMs). These approaches grow a prompt sets pool by adding a +new set of prompts when learning each new task (\emph{prompt learning}) and +adopt a matching mechanism to select the correct set for each testing sample +(\emph{prompt retrieval}). Previous studies focus on the latter stage by +improving the matching mechanism to enhance Prompt Retrieval Accuracy (PRA). To +promote cross-task knowledge facilitation and form an effective and efficient +prompt sets pool, we propose a plug-in module in the former stage to +\textbf{Learn Whether to Grow (LW2G)} based on the disparities between tasks. +Specifically, a shared set of prompts is utilized when several tasks share +certain commonalities, and a new set is added when there are significant +differences between the new task and previous tasks. Inspired by Gradient +Projection Continual Learning, our LW2G develops a metric called Hinder Forward +Capability (HFC) to measure the hindrance imposed on learning new tasks by +surgically modifying the original gradient onto the orthogonal complement of +the old feature space. With HFC, an automated scheme Dynamic Growing Approach +adaptively learns whether to grow with a dynamic threshold. Furthermore, we +design a gradient-based constraint to ensure the consistency between the +updating prompts and pre-trained knowledge, and a prompts weights reusing +strategy to enhance forward transfer. Extensive experiments show the +effectiveness of our method. The source codes are available at +\url{https://github.com/RAIAN08/LW2G}. + +
+
+ comment: submit to neurips2024 +
+
+
+
+
+ + ☆ Space-time 2D Gaussian Splatting for Accurate Surface Reconstruction + under Complex Dynamic Scenes + + +
+ Previous surface reconstruction methods either suffer from low geometric +accuracy or lengthy training times when dealing with real-world complex dynamic +scenes involving multi-person activities, and human-object interactions. To +tackle the dynamic contents and the occlusions in complex scenes, we present a +space-time 2D Gaussian Splatting approach. Specifically, to improve geometric +quality in dynamic scenes, we learn canonical 2D Gaussian splats and deform +these 2D Gaussian splats while enforcing the disks of the Gaussian located on +the surface of the objects by introducing depth and normal regularizers. +Further, to tackle the occlusion issues in complex scenes, we introduce a +compositional opacity deformation strategy, which further reduces the surface +recovery of those occluded areas. Experiments on real-world sparse-view video +datasets and monocular dynamic datasets demonstrate that our reconstructions +outperform state-of-the-art methods, especially for the surface of the details. +The project page and more visualizations can be found at: +https://tb2-sy.github.io/st-2dgs/. + +
+
+ comment: Project page: https://tb2-sy.github.io/st-2dgs/ +
+
+
+
+
+ + ☆ MinerU: An Open-Source Solution for Precise Document Content Extraction + + +
+ Document content analysis has been a crucial research area in computer +vision. Despite significant advancements in methods such as OCR, layout +detection, and formula recognition, existing open-source solutions struggle to +consistently deliver high-quality content extraction due to the diversity in +document types and content. To address these challenges, we present MinerU, an +open-source solution for high-precision document content extraction. MinerU +leverages the sophisticated PDF-Extract-Kit models to extract content from +diverse documents effectively and employs finely-tuned preprocessing and +postprocessing rules to ensure the accuracy of the final results. Experimental +results demonstrate that MinerU consistently achieves high performance across +various document types, significantly enhancing the quality and consistency of +content extraction. The MinerU open-source project is available at +https://github.com/opendatalab/MinerU. + +
+
+ comment: MinerU Technical Report +
+
+
+
+
+ + ☆ Classification and regression of trajectories rendered as images via 2D + Convolutional Neural Networks + + +
+ Trajectories can be regarded as time-series of coordinates, typically arising +from motile objects. Methods for trajectory classification are particularly +important to detect different movement patterns, while methods for regression +to compute motility metrics and forecasting. Recent advances in computer vision +have facilitated the processing of trajectories rendered as images via +artificial neural networks with 2d convolutional layers (CNNs). This approach +leverages the capability of CNNs to learn spatial hierarchies of features from +images, necessary to recognize complex shapes. Moreover, it overcomes the +limitation of other machine learning methods that require input trajectories +with a fixed number of points. However, rendering trajectories as images can +introduce poorly investigated artifacts such as information loss due to the +plotting of coordinates on a discrete grid, and spectral changes due to line +thickness and aliasing. In this study, we investigate the effectiveness of CNNs +for solving classification and regression problems from synthetic trajectories +that have been rendered as images using different modalities. The parameters +considered in this study include line thickness, image resolution, usage of +motion history (color-coding of the temporal component) and anti-aliasing. +Results highlight the importance of choosing an appropriate image resolution +according to model depth and motion history in applications where movement +direction is critical. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ YOLOv8-ResCBAM: YOLOv8 Based on An Effective Attention Module for + Pediatric Wrist Fracture Detection ICONIP 2024 + + +
+ Wrist trauma and even fractures occur frequently in daily life, particularly +among children who account for a significant proportion of fracture cases. +Before performing surgery, surgeons often request patients to undergo X-ray +imaging first, and prepare for the surgery based on the analysis of the X-ray +images. With the development of neural networks, You Only Look Once (YOLO) +series models have been widely used in fracture detection for Computer-Assisted +Diagnosis, where the YOLOv8 model has obtained the satisfactory results. +Applying the attention modules to neural networks is one of the effective +methods to improve the model performance. This paper proposes YOLOv8-ResCBAM, +which incorporates Convolutional Block Attention Module integrated with +resblock (ResCBAM) into the original YOLOv8 network architecture. The +experimental results on the GRAZPEDWRI-DX dataset demonstrate that the mean +Average Precision calculated at Intersection over Union threshold of 0.5 (mAP +50) of the proposed model increased from 63.6% of the original YOLOv8 model to +65.8%, which achieves the state-of-the-art performance. The implementation code +is available at +https://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8. + +
+
+ comment: Accepted by ICONIP 2024. arXiv admin note: substantial text overlap + with arXiv:2402.09329 +
+
+
+
+
+ + ☆ Early diagnosis of Alzheimer's disease from MRI images with deep + learning model + + +
+ It is acknowledged that the most common cause of dementia worldwide is +Alzheimer's disease (AD). This condition progresses in severity from mild to +severe and interferes with people's everyday routines. Early diagnosis plays a +critical role in patient care and clinical trials. Convolutional neural +networks (CNN) are used to create a framework for identifying specific disease +features from MRI scans Classification of dementia involves approaches such as +medical history review, neuropsychological tests, and magnetic resonance +imaging (MRI). However, the image dataset obtained from Kaggle faces a +significant issue of class imbalance, which requires equal distribution of +samples from each class to address. In this article, to address this imbalance, +the Synthetic Minority Oversampling Technique (SMOTE) is utilized. Furthermore, +a pre-trained convolutional neural network has been applied to the DEMNET +dementia network to extract key features from AD images. The proposed model +achieved an impressive accuracy of 98.67%. + +
+
+ comment: 7 pages, 3 figures, Presented at the 20-th CSI International + Symposium on Artificial Intelligence and Signal Processing (AISP) 21-22 + February, 2024, Mazandaran University of Science and Technology, Babol, Iran +
+
+
+
+
+ + ☆ EyeTrAES: Fine-grained, Low-Latency Eye Tracking via Adaptive Event + Slicing + + +
+ Eye-tracking technology has gained significant attention in recent years due +to its wide range of applications in human-computer interaction, virtual and +augmented reality, and wearable health. Traditional RGB camera-based +eye-tracking systems often struggle with poor temporal resolution and +computational constraints, limiting their effectiveness in capturing rapid eye +movements. To address these limitations, we propose EyeTrAES, a novel approach +using neuromorphic event cameras for high-fidelity tracking of natural +pupillary movement that shows significant kinematic variance. One of EyeTrAES's +highlights is the use of a novel adaptive windowing/slicing algorithm that +ensures just the right amount of descriptive asynchronous event data +accumulation within an event frame, across a wide range of eye movement +patterns. EyeTrAES then applies lightweight image processing functions over +accumulated event frames from just a single eye to perform pupil segmentation +and tracking. We show that these methods boost pupil tracking fidelity by 6+%, +achieving IoU~=92%, while incurring at least 3x lower latency than competing +pure event-based eye tracking alternatives [38]. We additionally demonstrate +that the microscopic pupillary motion captured by EyeTrAES exhibits distinctive +variations across individuals and can thus serve as a biometric fingerprint. +For robust user authentication, we train a lightweight per-user Random Forest +classifier using a novel feature vector of short-term pupillary kinematics, +comprising a sliding window of pupil (location, velocity, acceleration) +triples. Experimental studies with two different datasets demonstrate that the +EyeTrAES-based authentication technique can simultaneously achieve high +authentication accuracy (~=0.82) and low processing latency (~=12ms), and +significantly outperform multiple state-of-the-art competitive baselines. + +
+
+ comment: 32 pages,15 figures, +
+
+
+
+
+ + ☆ MiniVLN: Efficient Vision-and-Language Navigation by Progressive + Knowledge Distillation + + +
+ In recent years, Embodied Artificial Intelligence (Embodied AI) has advanced +rapidly, yet the increasing size of models conflicts with the limited +computational capabilities of Embodied AI platforms. To address this challenge, +we aim to achieve both high model performance and practical deployability. +Specifically, we focus on Vision-and-Language Navigation (VLN), a core task in +Embodied AI. This paper introduces a two-stage knowledge distillation +framework, producing a student model, MiniVLN, and showcasing the significant +potential of distillation techniques in developing lightweight models. The +proposed method aims to capture fine-grained knowledge during the pretraining +phase and navigation-specific knowledge during the fine-tuning phase. Our +findings indicate that the two-stage distillation approach is more effective in +narrowing the performance gap between the teacher model and the student model +compared to single-stage distillation. On the public R2R and REVERIE +benchmarks, MiniVLN achieves performance on par with the teacher model while +having only about 12% of the teacher model's parameter count. + +
+
+
+
+
+ + ☆ Open-Nav: Exploring Zero-Shot Vision-and-Language Navigation in + Continuous Environment with Open-Source LLMs + + +
+ Vision-and-Language Navigation (VLN) tasks require an agent to follow textual +instructions to navigate through 3D environments. Traditional approaches use +supervised learning methods, relying heavily on domain-specific datasets to +train VLN models. Recent methods try to utilize closed-source large language +models (LLMs) like GPT-4 to solve VLN tasks in zero-shot manners, but face +challenges related to expensive token costs and potential data breaches in +real-world applications. In this work, we introduce Open-Nav, a novel study +that explores open-source LLMs for zero-shot VLN in the continuous environment. +Open-Nav employs a spatial-temporal chain-of-thought (CoT) reasoning approach +to break down tasks into instruction comprehension, progress estimation, and +decision-making. It enhances scene perceptions with fine-grained object and +spatial knowledge to improve LLM's reasoning in navigation. Our extensive +experiments in both simulated and real-world environments demonstrate that +Open-Nav achieves competitive performance compared to using closed-source LLMs. + +
+
+
+
+
+ + ☆ Excavating in the Wild: The GOOSE-Ex Dataset for Semantic Segmentation + + +
+ The successful deployment of deep learning-based techniques for autonomous +systems is highly dependent on the data availability for the respective system +in its deployment environment. Especially for unstructured outdoor +environments, very few datasets exist for even fewer robotic platforms and +scenarios. In an earlier work, we presented the German Outdoor and Offroad +Dataset (GOOSE) framework along with 10000 multimodal frames from an offroad +vehicle to enhance the perception capabilities in unstructured environments. In +this work, we address the generalizability of the GOOSE framework. To +accomplish this, we open-source the GOOSE-Ex dataset, which contains additional +5000 labeled multimodal frames from various completely different environments, +recorded on a robotic excavator and a quadruped platform. We perform a +comprehensive analysis of the semantic segmentation performance on different +platforms and sensor modalities in unseen environments. In addition, we +demonstrate how the combined datasets can be utilized for different downstream +applications or competitions such as offroad navigation, object manipulation or +scene completion. The dataset, its platform documentation and pre-trained +state-of-the-art models for offroad perception will be made available on +https://goose-dataset.de/. + \ + +
+
+ comment: Submitted to IEEE for review +
+
+
+
+
+ + ☆ Student-Oriented Teacher Knowledge Refinement for Knowledge Distillation + + +
+ Knowledge distillation has become widely recognized for its ability to +transfer knowledge from a large teacher network to a compact and more +streamlined student network. Traditional knowledge distillation methods +primarily follow a teacher-oriented paradigm that imposes the task of learning +the teacher's complex knowledge onto the student network. However, significant +disparities in model capacity and architectural design hinder the student's +comprehension of the complex knowledge imparted by the teacher, resulting in +sub-optimal performance. This paper introduces a novel perspective emphasizing +student-oriented and refining the teacher's knowledge to better align with the +student's needs, thereby improving knowledge transfer effectiveness. +Specifically, we present the Student-Oriented Knowledge Distillation (SoKD), +which incorporates a learnable feature augmentation strategy during training to +refine the teacher's knowledge of the student dynamically. Furthermore, we +deploy the Distinctive Area Detection Module (DAM) to identify areas of mutual +interest between the teacher and student, concentrating knowledge transfer +within these critical areas to avoid transferring irrelevant information. This +customized module ensures a more focused and effective knowledge distillation +process. Our approach, functioning as a plug-in, could be integrated with +various knowledge distillation methods. Extensive experimental results +demonstrate the efficacy and generalizability of our method. + +
+
+
+
+
+ + ☆ DualDn: Dual-domain Denoising via Differentiable ISP ECCV 2024 + + +
+ Image denoising is a critical component in a camera's Image Signal Processing +(ISP) pipeline. There are two typical ways to inject a denoiser into the ISP +pipeline: applying a denoiser directly to captured raw frames (raw domain) or +to the ISP's output sRGB images (sRGB domain). However, both approaches have +their limitations. Residual noise from raw-domain denoising can be amplified by +the subsequent ISP processing, and the sRGB domain struggles to handle +spatially varying noise since it only sees noise distorted by the ISP. +Consequently, most raw or sRGB domain denoising works only for specific noise +distributions and ISP configurations. To address these challenges, we propose +DualDn, a novel learning-based dual-domain denoising. Unlike previous +single-domain denoising, DualDn consists of two denoising networks: one in the +raw domain and one in the sRGB domain. The raw domain denoising adapts to +sensor-specific noise as well as spatially varying noise levels, while the sRGB +domain denoising adapts to ISP variations and removes residual noise amplified +by the ISP. Both denoising networks are connected with a differentiable ISP, +which is trained end-to-end and discarded during the inference stage. With this +design, DualDn achieves greater generalizability compared to most +learning-based denoising methods, as it can adapt to different unseen noises, +ISP parameters, and even novel ISP pipelines. Experiments show that DualDn +achieves state-of-the-art performance and can adapt to different denoising +architectures. Moreover, DualDn can be used as a plug-and-play denoising module +with real cameras without retraining, and still demonstrate better performance +than commercial on-camera denoising. The project website is available at: +https://openimaginglab.github.io/DualDn/ + +
+
+ comment: Accepted at ECCV 2024, Project page: + https://openimaginglab.github.io/DualDn/ +
+
+
+
+
+ + ☆ Relighting from a Single Image: Datasets and Deep Intrinsic-based + Architecture + + +
+ Single image scene relighting aims to generate a realistic new version of an +input image so that it appears to be illuminated by a new target light +condition. Although existing works have explored this problem from various +perspectives, generating relit images under arbitrary light conditions remains +highly challenging, and related datasets are scarce. Our work addresses this +problem from both the dataset and methodological perspectives. We propose two +new datasets: a synthetic dataset with the ground truth of intrinsic components +and a real dataset collected under laboratory conditions. These datasets +alleviate the scarcity of existing datasets. To incorporate physical +consistency in the relighting pipeline, we establish a two-stage network based +on intrinsic decomposition, giving outputs at intermediate steps, thereby +introducing physical constraints. When the training set lacks ground truth for +intrinsic decomposition, we introduce an unsupervised module to ensure that the +intrinsic outputs are satisfactory. Our method outperforms the state-of-the-art +methods in performance, as tested on both existing datasets and our newly +developed datasets. Furthermore, pretraining our method or other prior methods +using our synthetic dataset can enhance their performance on other datasets. +Since our method can accommodate any light conditions, it is capable of +producing animated results. The dataset, method, and videos are publicly +available. + +
+
+ comment: Accepted for publication as a Regular paper in the IEEE Transactions + on Multimedia +
+
+
+
+
+ + ☆ State-of-the-Art Periorbital Distance Prediction and Disease + Classification Using Periorbital Features + + +
+ Periorbital distances and features around the eyes and lids hold valuable +information for disease quantification and monitoring of surgical and medical +intervention. These distances are commonly measured manually, a process that is +both subjective and highly time-consuming. Here, we set out to developed three +deep-learning methods for segmentation and periorbital distance prediction, and +also evaluate the utility of periorbital distances for disease classification. +The MAE of our deep learning predicted distances was less than or very close to +the error observed between trained human annotators. We compared our models to +the current state-of-the-art (SOTA) method for periorbital distance prediction +and found that our methods outperformed SOTA on all of our datasets on all but +one periorbital measurement. We also show that robust segmentation can be +achieved on diseased eyes using models trained on open-source, healthy eyes, +and that periorbital distances have can be used as high-quality features in +downstream classification models. Leveraging segmentation networks as +intermediary steps in classification has broad implications for increasing the +generalizability of classification models in ophthalmic plastic and +craniofacial surgery by avoiding the out-of-distribution problem observed in +traditional convolutional neural networks. + +
+
+ comment: 16 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Charting the Future: Using Chart Question-Answering for Scalable + Evaluation of LLM-Driven Data Visualizations + + +
+ We propose a novel framework that leverages Visual Question Answering (VQA) +models to automate the evaluation of LLM-generated data visualizations. +Traditional evaluation methods often rely on human judgment, which is costly +and unscalable, or focus solely on data accuracy, neglecting the effectiveness +of visual communication. By employing VQA models, we assess data representation +quality and the general communicative clarity of charts. Experiments were +conducted using two leading VQA benchmark datasets, ChartQA and PlotQA, with +visualizations generated by OpenAI's GPT-3.5 Turbo and Meta's Llama 3.1 +70B-Instruct models. Our results indicate that LLM-generated charts do not +match the accuracy of the original non-LLM-generated charts based on VQA +performance measures. Moreover, while our results demonstrate that few-shot +prompting significantly boosts the accuracy of chart generation, considerable +progress remains to be made before LLMs can fully match the precision of +human-generated graphs. This underscores the importance of our work, which +expedites the research process by enabling rapid iteration without the need for +human annotation, thus accelerating advancements in this field. + +
+
+
+
+
+ + ☆ Enhancing Explainability in Multimodal Large Language Models Using + Ontological Context + + +
+ Recently, there has been a growing interest in Multimodal Large Language +Models (MLLMs) due to their remarkable potential in various tasks integrating +different modalities, such as image and text, as well as applications such as +image captioning and visual question answering. However, such models still face +challenges in accurately captioning and interpreting specific visual concepts +and classes, particularly in domain-specific applications. We argue that +integrating domain knowledge in the form of an ontology can significantly +address these issues. In this work, as a proof of concept, we propose a new +framework that combines ontology with MLLMs to classify images of plant +diseases. Our method uses concepts about plant diseases from an existing +disease ontology to query MLLMs and extract relevant visual concepts from +images. Then, we use the reasoning capabilities of the ontology to classify the +disease according to the identified concepts. Ensuring that the model +accurately uses the concepts describing the disease is crucial in +domain-specific applications. By employing an ontology, we can assist in +verifying this alignment. Additionally, using the ontology's inference +capabilities increases transparency, explainability, and trust in the +decision-making process while serving as a judge by checking if the annotations +of the concepts by MLLMs are aligned with those in the ontology and displaying +the rationales behind their errors. Our framework offers a new direction for +synergizing ontologies and MLLMs, supported by an empirical study using +different well-known MLLMs. + +
+
+
+
+
+ + ☆ Effectiveness of learning-based image codecs on fingerprint storage + + +
+ The success of learning-based coding techniques and the development of +learning-based image coding standards, such as JPEG-AI, point towards the +adoption of such solutions in different fields, including the storage of +biometric data, like fingerprints. However, the peculiar nature of +learning-based compression artifacts poses several issues concerning their +impact and effectiveness on extracting biometric features and landmarks, e.g., +minutiae. This problem is utterly stressed by the fact that most models are +trained on natural color images, whose characteristics are very different from +usual biometric images, e.g, fingerprint or iris pictures. As a matter of fact, +these issues are deemed to be accurately questioned and investigated, being +such analysis still largely unexplored. + This study represents the first investigation about the adaptability of +learning-based image codecs in the storage of fingerprint images by measuring +its impact on the extraction and characterization of minutiae. Experimental +results show that at a fixed rate point, learned solutions considerably +outperform previous fingerprint coding standards, like JPEG2000, both in terms +of distortion and minutiae preservation. Indeed, experimental results prove +that the peculiarities of learned compression artifacts do not prevent +automatic fingerprint identification (since minutiae types and locations are +not significantly altered), nor do compromise image quality for human visual +inspection (as they gain in terms of BD rate and PSNR of 47.8% and +3.97dB +respectively). + +
+
+ comment: Accepted ad Wifs 2024 +
+
+
+
+
+ + ☆ A Generalized Tensor Formulation for Hyperspectral Image + Super-Resolution Under General Spatial Blurring + + +
+ Hyperspectral super-resolution is commonly accomplished by the fusing of a +hyperspectral imaging of low spatial resolution with a multispectral image of +high spatial resolution, and many tensor-based approaches to this task have +been recently proposed. Yet, it is assumed in such tensor-based methods that +the spatial-blurring operation that creates the observed hyperspectral image +from the desired super-resolved image is separable into independent horizontal +and vertical blurring. Recent work has argued that such separable spatial +degradation is ill-equipped to model the operation of real sensors which may +exhibit, for example, anisotropic blurring. To accommodate this fact, a +generalized tensor formulation based on a Kronecker decomposition is proposed +to handle any general spatial-degradation matrix, including those that are not +separable as previously assumed. Analysis of the generalized formulation +reveals conditions under which exact recovery of the desired super-resolved +image is guaranteed, and a practical algorithm for such recovery, driven by a +blockwise-group-sparsity regularization, is proposed. Extensive experimental +results demonstrate that the proposed generalized tensor approach outperforms +not only traditional matrix-based techniques but also state-of-the-art +tensor-based methods; the gains with respect to the latter are especially +significant in cases of anisotropic spatial blurring. + +
+
+
+
+
+ + ☆ Multi-modal Medical Image Fusion For Non-Small Cell Lung Cancer + Classification + + +
+ The early detection and nuanced subtype classification of non-small cell lung +cancer (NSCLC), a predominant cause of cancer mortality worldwide, is a +critical and complex issue. In this paper, we introduce an innovative +integration of multi-modal data, synthesizing fused medical imaging (CT and PET +scans) with clinical health records and genomic data. This unique fusion +methodology leverages advanced machine learning models, notably MedClip and +BEiT, for sophisticated image feature extraction, setting a new standard in +computational oncology. Our research surpasses existing approaches, as +evidenced by a substantial enhancement in NSCLC detection and classification +precision. The results showcase notable improvements across key performance +metrics, including accuracy, precision, recall, and F1-score. Specifically, our +leading multi-modal classifier model records an impressive accuracy of 94.04%. +We believe that our approach has the potential to transform NSCLC diagnostics, +facilitating earlier detection and more effective treatment planning and, +ultimately, leading to superior patient outcomes in lung cancer care. + +
+
+
+
+
+ + ☆ 3DPX: Single Panoramic X-ray Analysis Guided by 3D Oral Structure + Reconstruction + + +
+ Panoramic X-ray (PX) is a prevalent modality in dentistry practice owing to +its wide availability and low cost. However, as a 2D projection of a 3D +structure, PX suffers from anatomical information loss and PX diagnosis is +limited compared to that with 3D imaging modalities. 2D-to-3D reconstruction +methods have been explored for the ability to synthesize the absent 3D +anatomical information from 2D PX for use in PX image analysis. However, there +are challenges in leveraging such 3D synthesized reconstructions. First, +inferring 3D depth from 2D images remains a challenging task with limited +accuracy. The second challenge is the joint analysis of 2D PX with its 3D +synthesized counterpart, with the aim to maximize the 2D-3D synergy while +minimizing the errors arising from the synthesized image. In this study, we +propose a new method termed 3DPX - PX image analysis guided by 2D-to-3D +reconstruction, to overcome these challenges. 3DPX consists of (i) a novel +progressive reconstruction network to improve 2D-to-3D reconstruction and, (ii) +a contrastive-guided bidirectional multimodality alignment module for 3D-guided +2D PX classification and segmentation tasks. The reconstruction network +progressively reconstructs 3D images with knowledge imposed on the intermediate +reconstructions at multiple pyramid levels and incorporates Multilayer +Perceptrons to improve semantic understanding. The downstream networks leverage +the reconstructed images as 3D anatomical guidance to the PX analysis through +feature alignment, which increases the 2D-3D synergy with bidirectional feature +projection and decease the impact of potential errors with contrastive +guidance. Extensive experiments on two oral datasets involving 464 studies +demonstrate that 3DPX outperforms the state-of-the-art methods in various tasks +including 2D-to-3D reconstruction, PX classification and lesion segmentation. + +
+
+
+
+
+ + ☆ Learning from Pattern Completion: Self-supervised Controllable + Generation + + +
+ The human brain exhibits a strong ability to spontaneously associate +different visual attributes of the same or similar visual scene, such as +associating sketches and graffiti with real-world visual objects, usually +without supervising information. In contrast, in the field of artificial +intelligence, controllable generation methods like ControlNet heavily rely on +annotated training datasets such as depth maps, semantic segmentation maps, and +poses, which limits the method's scalability. Inspired by the neural mechanisms +that may contribute to the brain's associative power, specifically the cortical +modularization and hippocampal pattern completion, here we propose a +self-supervised controllable generation (SCG) framework. Firstly, we introduce +an equivariant constraint to promote inter-module independence and intra-module +correlation in a modular autoencoder network, thereby achieving functional +specialization. Subsequently, based on these specialized modules, we employ a +self-supervised pattern completion approach for controllable generation +training. Experimental results demonstrate that the proposed modular +autoencoder effectively achieves functional specialization, including the +modular processing of color, brightness, and edge detection, and exhibits +brain-like features including orientation selectivity, color antagonism, and +center-surround receptive fields. Through self-supervised training, associative +generation capabilities spontaneously emerge in SCG, demonstrating excellent +generalization ability to various tasks such as associative generation on +painting, sketches, and ancient graffiti. Compared to the previous +representative method ControlNet, our proposed approach not only demonstrates +superior robustness in more challenging high-noise scenarios but also possesses +more promising scalability potential due to its self-supervised manner. + +
+
+
+
+
+ + ☆ A Novel Unified Architecture for Low-Shot Counting by Detection and + Segmentation NeurIPS2024 + + +
+ Low-shot object counters estimate the number of objects in an image using few +or no annotated exemplars. Objects are localized by matching them to +prototypes, which are constructed by unsupervised image-wide object appearance +aggregation. Due to potentially diverse object appearances, the existing +approaches often lead to overgeneralization and false positive detections. +Furthermore, the best-performing methods train object localization by a +surrogate loss, that predicts a unit Gaussian at each object center. This loss +is sensitive to annotation error, hyperparameters and does not directly +optimize the detection task, leading to suboptimal counts. We introduce GeCo, a +novel low-shot counter that achieves accurate object detection, segmentation, +and count estimation in a unified architecture. GeCo robustly generalizes the +prototypes across objects appearances through a novel dense object query +formulation. In addition, a novel counting loss is proposed, that directly +optimizes the detection task and avoids the issues of the standard surrogate +loss. GeCo surpasses the leading few-shot detection-based counters by +$\sim$25\% in the total count MAE, achieves superior detection accuracy and +sets a new solid state-of-the-art result across all low-shot counting setups. + +
+
+ comment: Accepted to NeurIPS2024 +
+
+
+
+
+ + ☆ Image-guided topic modeling for interpretable privacy classification ECCV 2024 + + +
+ Predicting and explaining the private information contained in an image in +human-understandable terms is a complex and contextual task. This task is +challenging even for large language models. To facilitate the understanding of +privacy decisions, we propose to predict image privacy based on a set of +natural language content descriptors. These content descriptors are associated +with privacy scores that reflect how people perceive image content. We generate +descriptors with our novel Image-guided Topic Modeling (ITM) approach. ITM +leverages, via multimodality alignment, both vision information and image +textual descriptions from a vision language model. We use the ITM-generated +descriptors to learn a privacy predictor, Priv$\times$ITM, whose decisions are +interpretable by design. Our Priv$\times$ITM classifier outperforms the +reference interpretable method by 5 percentage points in accuracy and performs +comparably to the current non-interpretable state-of-the-art model. + +
+
+ comment: Paper accepted at the eXCV Workshop at ECCV 2024. Supplementary + material included. Code available at https://github.com/idiap/itm +
+
+
+
+
+ + ☆ Exploiting Motion Prior for Accurate Pose Estimation of Dashboard + Cameras + + +
+ Dashboard cameras (dashcams) record millions of driving videos daily, +offering a valuable potential data source for various applications, including +driving map production and updates. A necessary step for utilizing these +dashcam data involves the estimation of camera poses. However, the low-quality +images captured by dashcams, characterized by motion blurs and dynamic objects, +pose challenges for existing image-matching methods in accurately estimating +camera poses. In this study, we propose a precise pose estimation method for +dashcam images, leveraging the inherent camera motion prior. Typically, image +sequences captured by dash cameras exhibit pronounced motion prior, such as +forward movement or lateral turns, which serve as essential cues for +correspondence estimation. Building upon this observation, we devise a pose +regression module aimed at learning camera motion prior, subsequently +integrating these prior into both correspondences and pose estimation +processes. The experiment shows that, in real dashcams dataset, our method is +22% better than the baseline for pose estimation in AUC5\textdegree, and it can +estimate poses for 19% more images with less reprojection error in Structure +from Motion (SfM). + +
+
+
+
+
+ + ☆ When SAM2 Meets Video Camouflaged Object Segmentation: A Comprehensive + Evaluation and Adaptation + + +
+ This study investigates the application and performance of the Segment +Anything Model 2 (SAM2) in the challenging task of video camouflaged object +segmentation (VCOS). VCOS involves detecting objects that blend seamlessly in +the surroundings for videos, due to similar colors and textures, poor light +conditions, etc. Compared to the objects in normal scenes, camouflaged objects +are much more difficult to detect. SAM2, a video foundation model, has shown +potential in various tasks. But its effectiveness in dynamic camouflaged +scenarios remains under-explored. This study presents a comprehensive study on +SAM2's ability in VCOS. First, we assess SAM2's performance on camouflaged +video datasets using different models and prompts (click, box, and mask). +Second, we explore the integration of SAM2 with existing multimodal large +language models (MLLMs) and VCOS methods. Third, we specifically adapt SAM2 by +fine-tuning it on the video camouflaged dataset. Our comprehensive experiments +demonstrate that SAM2 has excellent zero-shot ability of detecting camouflaged +objects in videos. We also show that this ability could be further improved by +specifically adjusting SAM2's parameters for VCOS. The code will be available +at https://github.com/zhoustan/SAM2-VCOS + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Enhanced Convolution Neural Network with Optimized Pooling and + Hyperparameter Tuning for Network Intrusion Detection + + +
+ Network Intrusion Detection Systems (NIDS) are essential for protecting +computer networks from malicious activities, including Denial of Service (DoS), +Probing, User-to-Root (U2R), and Remote-to-Local (R2L) attacks. Without +effective NIDS, networks are vulnerable to significant security breaches and +data loss. Machine learning techniques provide a promising approach to enhance +NIDS by automating threat detection and improving accuracy. In this research, +we propose an Enhanced Convolutional Neural Network (EnCNN) for NIDS and +evaluate its performance using the KDDCUP'99 dataset. Our methodology includes +comprehensive data preprocessing, exploratory data analysis (EDA), and feature +engineering. We compare EnCNN with various machine learning algorithms, +including Logistic Regression, Decision Trees, Support Vector Machines (SVM), +and ensemble methods like Random Forest, AdaBoost, and Voting Ensemble. The +results show that EnCNN significantly improves detection accuracy, with a +notable 10% increase over state-of-art approaches. This demonstrates the +effectiveness of EnCNN in real-time network intrusion detection, offering a +robust solution for identifying and mitigating security threats, and enhancing +overall network resilience. + +
+
+ comment: 7 Pages , 2 figures , 4 Tables , Conference paper +
+
+
+
+
+ + ☆ Unsupervised Fingerphoto Presentation Attack Detection With Diffusion + Models + + +
+ Smartphone-based contactless fingerphoto authentication has become a reliable +alternative to traditional contact-based fingerprint biometric systems owing to +rapid advances in smartphone camera technology. Despite its convenience, +fingerprint authentication through fingerphotos is more vulnerable to +presentation attacks, which has motivated recent research efforts towards +developing fingerphoto Presentation Attack Detection (PAD) techniques. However, +prior PAD approaches utilized supervised learning methods that require labeled +training data for both bona fide and attack samples. This can suffer from two +key issues, namely (i) generalization:the detection of novel presentation +attack instruments (PAIs) unseen in the training data, and (ii) scalability:the +collection of a large dataset of attack samples using different PAIs. To +address these challenges, we propose a novel unsupervised approach based on a +state-of-the-art deep-learning-based diffusion model, the Denoising Diffusion +Probabilistic Model (DDPM), which is trained solely on bona fide samples. The +proposed approach detects Presentation Attacks (PA) by calculating the +reconstruction similarity between the input and output pairs of the DDPM. We +present extensive experiments across three PAI datasets to test the accuracy +and generalization capability of our approach. The results show that the +proposed DDPM-based PAD method achieves significantly better detection error +rates on several PAI classes compared to other baseline unsupervised +approaches. + +
+
+ comment: Accepted by IJCB 2024 +
+
+
+
+
+ + ☆ Towards Integrating Epistemic Uncertainty Estimation into the + Radiotherapy Workflow + + +
+ The precision of contouring target structures and organs-at-risk (OAR) in +radiotherapy planning is crucial for ensuring treatment efficacy and patient +safety. Recent advancements in deep learning (DL) have significantly improved +OAR contouring performance, yet the reliability of these models, especially in +the presence of out-of-distribution (OOD) scenarios, remains a concern in +clinical settings. This application study explores the integration of epistemic +uncertainty estimation within the OAR contouring workflow to enable OOD +detection in clinically relevant scenarios, using specifically compiled data. +Furthermore, we introduce an advanced statistical method for OOD detection to +enhance the methodological framework of uncertainty estimation. Our empirical +evaluation demonstrates that epistemic uncertainty estimation is effective in +identifying instances where model predictions are unreliable and may require an +expert review. Notably, our approach achieves an AUC-ROC of 0.95 for OOD +detection, with a specificity of 0.95 and a sensitivity of 0.92 for implant +cases, underscoring its efficacy. This study addresses significant gaps in the +current research landscape, such as the lack of ground truth for uncertainty +estimation and limited empirical evaluations. Additionally, it provides a +clinically relevant application of epistemic uncertainty estimation in an +FDA-approved and widely used clinical solution for OAR segmentation from +Varian, a Siemens Healthineers company, highlighting its practical benefits. + +
+
+ comment: Keywords: Epistemic Uncertainty - Out-of-Distribution Detection - CT + Segmentation - OAR contouring - Radiotherapy +
+
+
+
+
+ + ☆ Metasurface-generated large and arbitrary analog convolution kernels for + accelerated machine vision + + +
+ In the rapidly evolving field of artificial intelligence, convolutional +neural networks are essential for tackling complex challenges such as machine +vision and medical diagnosis. Recently, to address the challenges in processing +speed and power consumption of conventional digital convolution operations, +many optical components have been suggested to replace the digital convolution +layer in the neural network, accelerating various machine vision tasks. +Nonetheless, the analog nature of the optical convolution kernel has not been +fully explored. Here, we develop a spatial frequency domain training method to +create arbitrarily shaped analog convolution kernels using an optical +metasurface as the convolution layer, with its receptive field largely +surpassing digital convolution kernels. By employing spatial multiplexing, the +multiple parallel convolution kernels with both positive and negative weights +are generated under the incoherent illumination condition. We experimentally +demonstrate a 98.59% classification accuracy on the MNIST dataset, with +simulations showing 92.63% and 68.67% accuracy on the Fashion-MNIST and +CIFAR-10 datasets with additional digital layers. This work underscores the +unique advantage of analog optical convolution, offering a promising avenue to +accelerate machine vision tasks, especially in edge devices. + +
+
+
+
+
+ + ☆ From One to the Power of Many: Augmentations for Invariance to + Multi-LiDAR Perception from Single-Sensor Datasets + + +
+ Recently, LiDAR perception methods for autonomous vehicles, powered by deep +neural networks have experienced steep growth in performance on classic +benchmarks, such as nuScenes and SemanticKITTI. However, there are still large +gaps in performance when deploying models trained on such single-sensor setups +to modern multi-sensor vehicles. In this work, we investigate if a lack of +invariance may be responsible for these performance gaps, and propose some +initial solutions in the form of application-specific data augmentations, which +can facilitate better transfer to multi-sensor LiDAR setups. We provide +experimental evidence that our proposed augmentations improve generalization +across LiDAR sensor setups, and investigate how these augmentations affect the +models' invariance properties on simulations of different LiDAR sensor setups. + +
+
+
+
+
+ + ☆ Off to new Shores: A Dataset & Benchmark for (near-)coastal Flood + Inundation Forecasting NeurIPS 2024 + + +
+ Floods are among the most common and devastating natural hazards, imposing +immense costs on our society and economy due to their disastrous consequences. +Recent progress in weather prediction and spaceborne flood mapping demonstrated +the feasibility of anticipating extreme events and reliably detecting their +catastrophic effects afterwards. However, these efforts are rarely linked to +one another and there is a critical lack of datasets and benchmarks to enable +the direct forecasting of flood extent. To resolve this issue, we curate a +novel dataset enabling a timely prediction of flood extent. Furthermore, we +provide a representative evaluation of state-of-the-art methods, structured +into two benchmark tracks for forecasting flood inundation maps i) in general +and ii) focused on coastal regions. Altogether, our dataset and benchmark +provide a comprehensive platform for evaluating flood forecasts, enabling +future solutions for this critical challenge. Data, code & models are shared at +https://github.com/Multihuntr/GFF under a CC0 license. + +
+
+ comment: Accepted at NeurIPS 2024 Datasets & Benchmarks +
+
+
+
+
+ + ☆ Cross-video Identity Correlating for Person Re-identification + Pre-training NeurIPS 2024 + + +
+ Recent researches have proven that pre-training on large-scale person images +extracted from internet videos is an effective way in learning better +representations for person re-identification. However, these researches are +mostly confined to pre-training at the instance-level or single-video +tracklet-level. They ignore the identity-invariance in images of the same +person across different videos, which is a key focus in person +re-identification. To address this issue, we propose a Cross-video +Identity-cOrrelating pre-traiNing (CION) framework. Defining a noise concept +that comprehensively considers both intra-identity consistency and +inter-identity discrimination, CION seeks the identity correlation from +cross-video images by modeling it as a progressive multi-level denoising +problem. Furthermore, an identity-guided self-distillation loss is proposed to +implement better large-scale pre-training by mining the identity-invariance +within person images. We conduct extensive experiments to verify the +superiority of our CION in terms of efficiency and performance. CION achieves +significantly leading performance with even fewer training samples. For +example, compared with the previous state-of-the-art~\cite{ISR}, CION with the +same ResNet50-IBN achieves higher mAP of 93.3\% and 74.3\% on Market1501 and +MSMT17, while only utilizing 8\% training samples. Finally, with CION +demonstrating superior model-agnostic ability, we contribute a model zoo named +ReIDZoo to meet diverse research and application needs in this field. It +contains a series of CION pre-trained models with spanning structures and +parameters, totaling 32 models with 10 different structures, including +GhostNet, ConvNext, RepViT, FastViT and so on. The code and models will be made +publicly available at https://github.com/Zplusdragon/CION_ReIDZoo. + +
+
+ comment: NeurIPS 2024 Accepted Paper +
+
+
+
+
+ + ☆ Harmonizing knowledge Transfer in Neural Network with Unified + Distillation + + +
+ Knowledge distillation (KD), known for its ability to transfer knowledge from +a cumbersome network (teacher) to a lightweight one (student) without altering +the architecture, has been garnering increasing attention. Two primary +categories emerge within KD methods: feature-based, focusing on intermediate +layers' features, and logits-based, targeting the final layer's logits. This +paper introduces a novel perspective by leveraging diverse knowledge sources +within a unified KD framework. Specifically, we aggregate features from +intermediate layers into a comprehensive representation, effectively gathering +semantic information from different stages and scales. Subsequently, we predict +the distribution parameters from this representation. These steps transform +knowledge from the intermediate layers into corresponding distributive forms, +thereby allowing for knowledge distillation through a unified distribution +constraint at different stages of the network, ensuring the comprehensiveness +and coherence of knowledge transfer. Numerous experiments were conducted to +validate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ AL-GTD: Deep Active Learning for Gaze Target Detection + + +
+ Gaze target detection aims at determining the image location where a person +is looking. While existing studies have made significant progress in this area +by regressing accurate gaze heatmaps, these achievements have largely relied on +access to extensive labeled datasets, which demands substantial human labor. In +this paper, our goal is to reduce the reliance on the size of labeled training +data for gaze target detection. To achieve this, we propose AL-GTD, an +innovative approach that integrates supervised and self-supervised losses +within a novel sample acquisition function to perform active learning (AL). +Additionally, it utilizes pseudo-labeling to mitigate distribution shifts +during the training phase. AL-GTD achieves the best of all AUC results by +utilizing only 40-50% of the training data, in contrast to state-of-the-art +(SOTA) gaze target detectors requiring the entire training dataset to achieve +the same performance. Importantly, AL-GTD quickly reaches satisfactory +performance with 10-20% of the training data, showing the effectiveness of our +acquisition function, which is able to acquire the most informative samples. We +provide a comprehensive experimental analysis by adapting several AL methods +for the task. AL-GTD outperforms AL competitors, simultaneously exhibiting +superior performance compared to SOTA gaze target detectors when all are +trained within a low-data regime. Code is available at +https://github.com/francescotonini/al-gtd. + +
+
+ comment: Accepted to ACM Multimedia 2024 +
+
+
+
+
+ + ☆ CodeSCAN: ScreenCast ANalysis for Video Programming Tutorials + + +
+ Programming tutorials in the form of coding screencasts play a crucial role +in programming education, serving both novices and experienced developers. +However, the video format of these tutorials presents a challenge due to the +difficulty of searching for and within videos. Addressing the absence of +large-scale and diverse datasets for screencast analysis, we introduce the +CodeSCAN dataset. It comprises 12,000 screenshots captured from the Visual +Studio Code environment during development, featuring 24 programming languages, +25 fonts, and over 90 distinct themes, in addition to diverse layout changes +and realistic user interactions. Moreover, we conduct detailed quantitative and +qualitative evaluations to benchmark the performance of Integrated Development +Environment (IDE) element detection, color-to-black-and-white conversion, and +Optical Character Recognition (OCR). We hope that our contributions facilitate +more research in coding screencast analysis, and we make the source code for +creating the dataset and the benchmark publicly available on this website. + +
+
+
+
+
+ + ☆ Efficient Noise Mitigation for Enhancing Inference Accuracy in DNNs on + Mixed-Signal Accelerators + + +
+ In this paper, we propose a framework to enhance the robustness of the neural +models by mitigating the effects of process-induced and aging-related +variations of analog computing components on the accuracy of the analog neural +networks. We model these variations as the noise affecting the precision of the +activations and introduce a denoising block inserted between selected layers of +a pre-trained model. We demonstrate that training the denoising block +significantly increases the model's robustness against various noise levels. To +minimize the overhead associated with adding these blocks, we present an +exploration algorithm to identify optimal insertion points for the denoising +blocks. Additionally, we propose a specialized architecture to efficiently +execute the denoising blocks, which can be integrated into mixed-signal +accelerators. We evaluate the effectiveness of our approach using Deep Neural +Network (DNN) models trained on the ImageNet and CIFAR-10 datasets. The results +show that on average, by accepting 2.03% parameter count overhead, the accuracy +drop due to the variations reduces from 31.7% to 1.15%. + +
+
+
+
+
+ + ☆ Reducing Semantic Ambiguity In Domain Adaptive Semantic Segmentation Via + Probabilistic Prototypical Pixel Contrast + + +
+ Domain adaptation aims to reduce the model degradation on the target domain +caused by the domain shift between the source and target domains. Although +encouraging performance has been achieved by combining cognitive learning with +the self-training paradigm, they suffer from ambiguous scenarios caused by +scale, illumination, or overlapping when deploying deterministic embedding. To +address these issues, we propose probabilistic proto-typical pixel contrast +(PPPC), a universal adaptation framework that models each pixel embedding as a +probability via multivariate Gaussian distribution to fully exploit the +uncertainty within them, eventually improving the representation quality of the +model. In addition, we derive prototypes from probability estimation posterior +probability estimation which helps to push the decision boundary away from the +ambiguity points. Moreover, we employ an efficient method to compute similarity +between distributions, eliminating the need for sampling and +reparameterization, thereby significantly reducing computational overhead. +Further, we dynamically select the ambiguous crops at the image level to +enlarge the number of boundary points involved in contrastive learning, which +benefits the establishment of precise distributions for each category. +Extensive experimentation demonstrates that PPPC not only helps to address +ambiguity at the pixel level, yielding discriminative representations but also +achieves significant improvements in both synthetic-to-real and day-to-night +adaptation tasks. It surpasses the previous state-of-the-art (SOTA) by +5.2% +mIoU in the most challenging daytime-to-nighttime adaptation scenario, +exhibiting stronger generalization on other unseen datasets. The code and +models are available at +https://github.com/DarlingInTheSV/Probabilistic-Prototypical-Pixel-Contrast. + +
+
+ comment: revise +
+
+
+
+
+ + ☆ How Effective is Pre-training of Large Masked Autoencoders for + Downstream Earth Observation Tasks? + + +
+ Self-supervised pre-training has proven highly effective for many computer +vision tasks, particularly when labelled data are scarce. In the context of +Earth Observation (EO), foundation models and various other Vision Transformer +(ViT)-based approaches have been successfully applied for transfer learning to +downstream tasks. However, it remains unclear under which conditions +pre-trained models offer significant advantages over training from scratch. In +this study, we investigate the effectiveness of pre-training ViT-based Masked +Autoencoders (MAE) for downstream EO tasks, focusing on reconstruction, +segmentation, and classification. We consider two large ViT-based MAE +pre-trained models: a foundation model (Prithvi) and SatMAE. We evaluate +Prithvi on reconstruction and segmentation-based downstream tasks, and for +SatMAE we assess its performance on a classification downstream task. Our +findings suggest that pre-training is particularly beneficial when the +fine-tuning task closely resembles the pre-training task, e.g. reconstruction. +In contrast, for tasks such as segmentation or classification, training from +scratch with specific hyperparameter adjustments proved to be equally or more +effective. + +
+
+
+
+
+ + ☆ Prompt-Driven Temporal Domain Adaptation for Nighttime UAV Tracking IROS2024 + + +
+ Nighttime UAV tracking under low-illuminated scenarios has achieved great +progress by domain adaptation (DA). However, previous DA training-based works +are deficient in narrowing the discrepancy of temporal contexts for UAV +trackers. To address the issue, this work proposes a prompt-driven temporal +domain adaptation training framework to fully utilize temporal contexts for +challenging nighttime UAV tracking, i.e., TDA. Specifically, the proposed +framework aligns the distribution of temporal contexts from daytime and +nighttime domains by training the temporal feature generator against the +discriminator. The temporal-consistent discriminator progressively extracts +shared domain-specific features to generate coherent domain discrimination +results in the time series. Additionally, to obtain high-quality training +samples, a prompt-driven object miner is employed to precisely locate objects +in unannotated nighttime videos. Moreover, a new benchmark for long-term +nighttime UAV tracking is constructed. Exhaustive evaluations on both public +and self-constructed nighttime benchmarks demonstrate the remarkable +performance of the tracker trained in TDA framework, i.e., TDA-Track. +Real-world tests at nighttime also show its practicality. The code and demo +videos are available at https://github.com/vision4robotics/TDA-Track. + +
+
+ comment: Accepted by IROS2024 +
+
+
+
+
+ + ☆ Token Caching for Diffusion Transformer Acceleration + + +
+ Diffusion transformers have gained substantial interest in diffusion +generative modeling due to their outstanding performance. However, their high +computational cost, arising from the quadratic computational complexity of +attention mechanisms and multi-step inference, presents a significant +bottleneck. To address this challenge, we propose TokenCache, a novel +post-training acceleration method that leverages the token-based multi-block +architecture of transformers to reduce redundant computations among tokens +across inference steps. TokenCache specifically addresses three critical +questions in the context of diffusion transformers: (1) which tokens should be +pruned to eliminate redundancy, (2) which blocks should be targeted for +efficient pruning, and (3) at which time steps caching should be applied to +balance speed and quality. In response to these challenges, TokenCache +introduces a Cache Predictor that assigns importance scores to tokens, enabling +selective pruning without compromising model performance. Furthermore, we +propose an adaptive block selection strategy to focus on blocks with minimal +impact on the network's output, along with a Two-Phase Round-Robin (TPRR) +scheduling policy to optimize caching intervals throughout the denoising +process. Experimental results across various models demonstrate that TokenCache +achieves an effective trade-off between generation quality and inference speed +for diffusion transformers. Our code will be publicly available. + +
+
+
+
+
+ + ☆ Med-IC: Fusing a Single Layer Involution with Convolutions for Enhanced + Medical Image Classification and Segmentation + + +
+ The majority of medical images, especially those that resemble cells, have +similar characteristics. These images, which occur in a variety of shapes, +often show abnormalities in the organ or cell region. The convolution operation +possesses a restricted capability to extract visual patterns across several +spatial regions of an image. The involution process, which is the inverse +operation of convolution, complements this inherent lack of spatial information +extraction present in convolutions. In this study, we investigate how applying +a single layer of involution prior to a convolutional neural network (CNN) +architecture can significantly improve classification and segmentation +performance, with a comparatively negligible amount of weight parameters. The +study additionally shows how excessive use of involution layers might result in +inaccurate predictions in a particular type of medical image. According to our +findings from experiments, the strategy of adding only a single involution +layer before a CNN-based model outperforms most of the previous works. + +
+
+ comment: 13 pages, 5 figures, 4 tables, preprint submitted to an Elsevier + journal +
+
+
+
+
+ + ☆ Neural Video Representation for Redundancy Reduction and Consistency + Preservation + + +
+ Implicit neural representations (INRs) embed various signals into networks. +They have gained attention in recent years because of their versatility in +handling diverse signal types. For videos, INRs achieve video compression by +embedding video signals into networks and compressing them. Conventional +methods use an index that expresses the time of the frame or the features +extracted from the frame as inputs to the network. The latter method provides +greater expressive capability as the input is specific to each video. However, +the features extracted from frames often contain redundancy, which contradicts +the purpose of video compression. Moreover, since frame time information is not +explicitly provided to the network, learning the relationships between frames +is challenging. To address these issues, we aim to reduce feature redundancy by +extracting features based on the high-frequency components of the frames. In +addition, we use feature differences between adjacent frames in order for the +network to learn frame relationships smoothly. We propose a video +representation method that uses the high-frequency components of frames and the +differences in features between adjacent frames. The experimental results show +that our method outperforms the existing HNeRV method in 90 percent of the +videos. + +
+
+
+
+
+ + ☆ Temporal2Seq: A Unified Framework for Temporal Video Understanding Tasks + + +
+ With the development of video understanding, there is a proliferation of +tasks for clip-level temporal video analysis, including temporal action +detection (TAD), temporal action segmentation (TAS), and generic event boundary +detection (GEBD). While task-specific video understanding models have exhibited +outstanding performance in each task, there remains a dearth of a unified +framework capable of simultaneously addressing multiple tasks, which is a +promising direction for the next generation of AI. To this end, in this paper, +we propose a single unified framework, coined as Temporal2Seq, to formulate the +output of these temporal video understanding tasks as a sequence of discrete +tokens. With this unified token representation, Temporal2Seq can train a +generalist model within a single architecture on different video understanding +tasks. In the absence of multi-task learning (MTL) benchmarks, we compile a +comprehensive co-training dataset by borrowing the datasets from TAD, TAS, and +GEBD tasks. We evaluate our Temporal2Seq generalist model on the corresponding +test sets of three tasks, demonstrating that Temporal2Seq can produce +reasonable results on various tasks and achieve advantages compared with +single-task training on this framework. We also investigate the generalization +performance of our generalist model on new datasets from different tasks, which +yields superior performance to the specific model. + +
+
+
+
+
+ + ☆ Underwater Image Enhancement with Physical-based Denoising Diffusion + Implicit Models + + +
+ Underwater vision is crucial for autonomous underwater vehicles (AUVs), and +enhancing degraded underwater images in real-time on a resource-constrained AUV +is a key challenge due to factors like light absorption and scattering, or the +sufficient model computational complexity to resolve such factors. Traditional +image enhancement techniques lack adaptability to varying underwater +conditions, while learning-based methods, particularly those using +convolutional neural networks (CNNs) and generative adversarial networks +(GANs), offer more robust solutions but face limitations such as inadequate +enhancement, unstable training, or mode collapse. Denoising diffusion +probabilistic models (DDPMs) have emerged as a state-of-the-art approach in +image-to-image tasks but require intensive computational complexity to achieve +the desired underwater image enhancement (UIE) using the recent UW-DDPM +solution. To address these challenges, this paper introduces UW-DiffPhys, a +novel physical-based and diffusion-based UIE approach. UW-DiffPhys combines +light-computation physical-based UIE network components with a denoising U-Net +to replace the computationally intensive distribution transformation U-Net in +the existing UW-DDPM framework, reducing complexity while maintaining +performance. Additionally, the Denoising Diffusion Implicit Model (DDIM) is +employed to accelerate the inference process through non-Markovian sampling. +Experimental results demonstrate that UW-DiffPhys achieved a substantial +reduction in computational complexity and inference time compared to UW-DDPM, +with competitive performance in key metrics such as PSNR, SSIM, UCIQE, and an +improvement in the overall underwater image quality UIQM metric. The +implementation code can be found at the following repository: +https://github.com/bachzz/UW-DiffPhys + +
+
+
+
+
+ + ☆ Towards Diverse Device Heterogeneous Federated Learning via Task + Arithmetic Knowledge Integration NeurIPS 2024 + + +
+ Federated Learning has emerged as a promising paradigm for collaborative +machine learning, while preserving user data privacy. Despite its potential, +standard FL lacks support for diverse heterogeneous device prototypes, which +vary significantly in model and dataset sizes -- from small IoT devices to +large workstations. This limitation is only partially addressed by existing +knowledge distillation techniques, which often fail to transfer knowledge +effectively across a broad spectrum of device prototypes with varied +capabilities. This failure primarily stems from two issues: the dilution of +informative logits from more capable devices by those from less capable ones, +and the use of a single integrated logits as the distillation target across all +devices, which neglects their individual learning capacities and and the unique +contributions of each. To address these challenges, we introduce TAKFL, a novel +KD-based framework that treats the knowledge transfer from each device +prototype's ensemble as a separate task, independently distilling each to +preserve its unique contributions and avoid dilution. TAKFL also incorporates a +KD-based self-regularization technique to mitigate the issues related to the +noisy and unsupervised ensemble distillation process. To integrate the +separately distilled knowledge, we introduce an adaptive task arithmetic +knowledge integration process, allowing each student model to customize the +knowledge integration for optimal performance. Additionally, we present +theoretical results demonstrating the effectiveness of task arithmetic in +transferring knowledge across heterogeneous devices with varying capacities. +Comprehensive evaluations of our method across both CV and NLP tasks +demonstrate that TAKFL achieves SOTA results in a variety of datasets and +settings, significantly outperforming existing KD-based methods. Code is +released at https://github.com/MMorafah/TAKFL + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ FoodMLLM-JP: Leveraging Multimodal Large Language Models for Japanese + Recipe Generation + + +
+ Research on food image understanding using recipe data has been a +long-standing focus due to the diversity and complexity of the data. Moreover, +food is inextricably linked to people's lives, making it a vital research area +for practical applications such as dietary management. Recent advancements in +Multimodal Large Language Models (MLLMs) have demonstrated remarkable +capabilities, not only in their vast knowledge but also in their ability to +handle languages naturally. While English is predominantly used, they can also +support multiple languages including Japanese. This suggests that MLLMs are +expected to significantly improve performance in food image understanding +tasks. We fine-tuned open MLLMs LLaVA-1.5 and Phi-3 Vision on a Japanese recipe +dataset and benchmarked their performance against the closed model GPT-4o. We +then evaluated the content of generated recipes, including ingredients and +cooking procedures, using 5,000 evaluation samples that comprehensively cover +Japanese food culture. Our evaluation demonstrates that the open models trained +on recipe data outperform GPT-4o, the current state-of-the-art model, in +ingredient generation. Our model achieved F1 score of 0.531, surpassing +GPT-4o's F1 score of 0.481, indicating a higher level of accuracy. Furthermore, +our model exhibited comparable performance to GPT-4o in generating cooking +procedure text. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Enhancing Crime Scene Investigations through Virtual Reality and Deep + Learning Techniques + + +
+ The analysis of a crime scene is a pivotal activity in forensic +investigations. Crime Scene Investigators and forensic science practitioners +rely on best practices, standard operating procedures, and critical thinking, +to produce rigorous scientific reports to document the scenes of interest and +meet the quality standards expected in the courts. However, crime scene +examination is a complex and multifaceted task often performed in environments +susceptible to deterioration, contamination, and alteration, despite the use of +contact-free and non-destructive methods of analysis. In this context, the +documentation of the sites, and the identification and isolation of traces of +evidential value remain challenging endeavours. In this paper, we propose a +photogrammetric reconstruction of the crime scene for inspection in virtual +reality (VR) and focus on fully automatic object recognition with deep learning +(DL) algorithms through a client-server architecture. A pre-trained Faster-RCNN +model was chosen as the best method that can best categorize relevant objects +at the scene, selected by experts in the VR environment. These operations can +considerably improve and accelerate crime scene analysis and help the forensic +expert in extracting measurements and analysing in detail the objects under +analysis. Experimental results on a simulated crime scene have shown that the +proposed method can be effective in finding and recognizing objects with +potential evidentiary value, enabling timely analyses of crime scenes, +particularly those with health and safety risks (e.g. fires, explosions, +chemicals, etc.), while minimizing subjective bias and contamination of the +scene. + +
+
+
+
+
+ + ☆ DynaWeightPnP: Toward global real-time 3D-2D solver in PnP without + correspondences + + +
+ This paper addresses a special Perspective-n-Point (PnP) problem: estimating +the optimal pose to align 3D and 2D shapes in real-time without +correspondences, termed as correspondence-free PnP. While several studies have +focused on 3D and 2D shape registration, achieving both real-time and accurate +performance remains challenging. This study specifically targets the 3D-2D +geometric shape registration tasks, applying the recently developed Reproducing +Kernel Hilbert Space (RKHS) to address the "big-to-small" issue. An iterative +reweighted least squares method is employed to solve the RKHS-based formulation +efficiently. Moreover, our work identifies a unique and interesting +observability issue in correspondence-free PnP: the numerical ambiguity between +rotation and translation. To address this, we proposed DynaWeightPnP, +introducing a dynamic weighting sub-problem and an alternative searching +algorithm designed to enhance pose estimation and alignment accuracy. +Experiments were conducted on a typical case, that is, a 3D-2D vascular +centerline registration task within Endovascular Image-Guided Interventions +(EIGIs). Results demonstrated that the proposed algorithm achieves registration +processing rates of 60 Hz (without post-refinement) and 31 Hz (with +post-refinement) on modern single-core CPUs, with competitive accuracy +comparable to existing methods. These results underscore the suitability of +DynaWeightPnP for future robot navigation tasks like EIGIs. + +
+
+
+
+
+ + ☆ Gradient-free Decoder Inversion in Latent Diffusion Models NeurIPS 2024 + + +
+ In latent diffusion models (LDMs), denoising diffusion process efficiently +takes place on latent space whose dimension is lower than that of pixel space. +Decoder is typically used to transform the representation in latent space to +that in pixel space. While a decoder is assumed to have an encoder as an +accurate inverse, exact encoder-decoder pair rarely exists in practice even +though applications often require precise inversion of decoder. Prior works for +decoder inversion in LDMs employed gradient descent inspired by inversions of +generative adversarial networks. However, gradient-based methods require larger +GPU memory and longer computation time for larger latent space. For example, +recent video LDMs can generate more than 16 frames, but GPUs with 24 GB memory +can only perform gradient-based decoder inversion for 4 frames. Here, we +propose an efficient gradient-free decoder inversion for LDMs, which can be +applied to diverse latent models. Theoretical convergence property of our +proposed inversion has been investigated not only for the forward step method, +but also for the inertial Krasnoselskii-Mann (KM) iterations under mild +assumption on cocoercivity that is satisfied by recent LDMs. Our proposed +gradient-free method with Adam optimizer and learning rate scheduling +significantly reduced computation time and memory usage over prior +gradient-based methods and enabled efficient computation in applications such +as noise-space watermarking while achieving comparable error levels. + +
+
+ comment: 19 pages, Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Search3D: Hierarchical Open-Vocabulary 3D Segmentation + + +
+ Open-vocabulary 3D segmentation enables the exploration of 3D spaces using +free-form text descriptions. Existing methods for open-vocabulary 3D instance +segmentation primarily focus on identifying object-level instances in a scene. +However, they face challenges when it comes to understanding more fine-grained +scene entities such as object parts, or regions described by generic +attributes. In this work, we introduce Search3D, an approach that builds a +hierarchical open-vocabulary 3D scene representation, enabling the search for +entities at varying levels of granularity: fine-grained object parts, entire +objects, or regions described by attributes like materials. Our method aims to +expand the capabilities of open vocabulary instance-level 3D segmentation by +shifting towards a more flexible open-vocabulary 3D search setting less +anchored to explicit object-centric queries, compared to prior work. To ensure +a systematic evaluation, we also contribute a scene-scale open-vocabulary 3D +part segmentation benchmark based on MultiScan, along with a set of +open-vocabulary fine-grained part annotations on ScanNet++. We verify the +effectiveness of Search3D across several tasks, demonstrating that our approach +outperforms baselines in scene-scale open-vocabulary 3D part segmentation, +while maintaining strong performance in segmenting 3D objects and materials. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Robust Network Learning via Inverse Scale Variational Sparsification + + +
+ While neural networks have made significant strides in many AI tasks, they +remain vulnerable to a range of noise types, including natural corruptions, +adversarial noise, and low-resolution artifacts. Many existing approaches focus +on enhancing robustness against specific noise types, limiting their +adaptability to others. Previous studies have addressed general robustness by +adopting a spectral perspective, which tends to blur crucial features like +texture and object contours. Our proposed solution, however, introduces an +inverse scale variational sparsification framework within a time-continuous +inverse scale space formulation. This framework progressively learns +finer-scale features by discerning variational differences between pixels, +ultimately preserving only large-scale features in the smoothed image. Unlike +frequency-based methods, our approach not only removes noise by smoothing +small-scale features where corruptions often occur but also retains +high-contrast details such as textures and object contours. Moreover, our +framework offers simplicity and efficiency in implementation. By integrating +this algorithm into neural network training, we guide the model to prioritize +learning large-scale features. We show the efficacy of our approach through +enhanced robustness against various noise types. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ A3: Active Adversarial Alignment for Source-Free Domain Adaptation ICML + + +
+ Unsupervised domain adaptation (UDA) aims to transfer knowledge from a +labeled source domain to an unlabeled target domain. Recent works have focused +on source-free UDA, where only target data is available. This is challenging as +models rely on noisy pseudo-labels and struggle with distribution shifts. We +propose Active Adversarial Alignment (A3), a novel framework combining +self-supervised learning, adversarial training, and active learning for robust +source-free UDA. A3 actively samples informative and diverse data using an +acquisition function for training. It adapts models via adversarial losses and +consistency regularization, aligning distributions without source data access. +A3 advances source-free UDA through its synergistic integration of active and +adversarial learning for effective domain alignment and noise reduction. + +
+
+ comment: Accepted at ICMLA 2024 +
+
+
+
+
+ + ☆ Query matching for spatio-temporal action detection with query-based + object detector + + +
+ In this paper, we propose a method that extends the query-based object +detection model, DETR, to spatio-temporal action detection, which requires +maintaining temporal consistency in videos. Our proposed method applies DETR to +each frame and uses feature shift to incorporate temporal information. However, +DETR's object queries in each frame may correspond to different objects, making +a simple feature shift ineffective. To overcome this issue, we propose query +matching across different frames, ensuring that queries for the same object are +matched and used for the feature shift. Experimental results show that +performance on the JHMDB21 dataset improves significantly when query features +are shifted using the proposed query matching. + +
+
+
+
+
+ + ☆ GenesisTex2: Stable, Consistent and High-Quality Text-to-Texture + Generation + + +
+ Large-scale text-guided image diffusion models have shown astonishing results +in text-to-image (T2I) generation. However, applying these models to synthesize +textures for 3D geometries remains challenging due to the domain gap between 2D +images and textures on a 3D surface. Early works that used a +projecting-and-inpainting approach managed to preserve generation diversity but +often resulted in noticeable artifacts and style inconsistencies. While recent +methods have attempted to address these inconsistencies, they often introduce +other issues, such as blurring, over-saturation, or over-smoothing. To overcome +these challenges, we propose a novel text-to-texture synthesis framework that +leverages pretrained diffusion models. We first introduce a local attention +reweighing mechanism in the self-attention layers to guide the model in +concentrating on spatial-correlated patches across different views, thereby +enhancing local details while preserving cross-view consistency. Additionally, +we propose a novel latent space merge pipeline, which further ensures +consistency across different viewpoints without sacrificing too much diversity. +Our method significantly outperforms existing state-of-the-art techniques +regarding texture consistency and visual quality, while delivering results much +faster than distillation-based methods. Importantly, our framework does not +require additional training or fine-tuning, making it highly adaptable to a +wide range of models available on public platforms. + +
+
+
+
+
+ + ☆ You Only Speak Once to See ICASSP 2025 + + +
+ Grounding objects in images using visual cues is a well-established approach +in computer vision, yet the potential of audio as a modality for object +recognition and grounding remains underexplored. We introduce YOSS, "You Only +Speak Once to See," to leverage audio for grounding objects in visual scenes, +termed Audio Grounding. By integrating pre-trained audio models with visual +models using contrastive learning and multi-modal alignment, our approach +captures speech commands or descriptions and maps them directly to +corresponding objects within images. Experimental results indicate that audio +guidance can be effectively applied to object grounding, suggesting that +incorporating audio guidance may enhance the precision and robustness of +current object grounding methods and improve the performance of robotic systems +and computer vision applications. This finding opens new possibilities for +advanced object recognition, scene understanding, and the development of more +intuitive and capable robotic systems. + +
+
+ comment: 7 pages, 4 figures, submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Multi-hypotheses Conditioned Point Cloud Diffusion for 3D Human + Reconstruction from Occluded Images NeurIPS 2024 + + +
+ 3D human shape reconstruction under severe occlusion due to human-object or +human-human interaction is a challenging problem. Parametric models i.e., +SMPL(-X), which are based on the statistics across human shapes, can represent +whole human body shapes but are limited to minimally-clothed human shapes. +Implicit-function-based methods extract features from the parametric models to +employ prior knowledge of human bodies and can capture geometric details such +as clothing and hair. However, they often struggle to handle misaligned +parametric models and inpaint occluded regions given a single RGB image. In +this work, we propose a novel pipeline, MHCDIFF, Multi-hypotheses Conditioned +Point Cloud Diffusion, composed of point cloud diffusion conditioned on +probabilistic distributions for pixel-aligned detailed 3D human reconstruction +under occlusion. Compared to previous implicit-function-based methods, the +point cloud diffusion model can capture the global consistent features to +generate the occluded regions, and the denoising process corrects the +misaligned SMPL meshes. The core of MHCDIFF is extracting local features from +multiple hypothesized SMPL(-X) meshes and aggregating the set of features to +condition the diffusion model. In the experiments on CAPE and MultiHuman +datasets, the proposed method outperforms various SOTA methods based on SMPL, +implicit functions, point cloud diffusion, and their combined, under synthetic +and real occlusions. + +
+
+ comment: 17 pages, 7 figures, accepted NeurIPS 2024 +
+
+
+
+
+ + ☆ SinoSynth: A Physics-based Domain Randomization Approach for + Generalizable CBCT Image Enhancement MICCAI 2024 + + +
+ Cone Beam Computed Tomography (CBCT) finds diverse applications in medicine. +Ensuring high image quality in CBCT scans is essential for accurate diagnosis +and treatment delivery. Yet, the susceptibility of CBCT images to noise and +artifacts undermines both their usefulness and reliability. Existing methods +typically address CBCT artifacts through image-to-image translation approaches. +These methods, however, are limited by the artifact types present in the +training data, which may not cover the complete spectrum of CBCT degradations +stemming from variations in imaging protocols. Gathering additional data to +encompass all possible scenarios can often pose a challenge. To address this, +we present SinoSynth, a physics-based degradation model that simulates various +CBCT-specific artifacts to generate a diverse set of synthetic CBCT images from +high-quality CT images without requiring pre-aligned data. Through extensive +experiments, we demonstrate that several different generative networks trained +on our synthesized data achieve remarkable results on heterogeneous +multi-institutional datasets, outperforming even the same networks trained on +actual data. We further show that our degradation model conveniently provides +an avenue to enforce anatomical constraints in conditional generative models, +yielding high-quality and structure-preserving synthetic CT images. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ InterNet: Unsupervised Cross-modal Homography Estimation Based on + Interleaved Modality Transfer and Self-supervised Homography Prediction + + +
+ We propose a novel unsupervised cross-modal homography estimation framework, +based on interleaved modality transfer and self-supervised homography +prediction, named InterNet. InterNet integrates modality transfer and +self-supervised homography estimation, introducing an innovative interleaved +optimization framework to alternately promote both components. The modality +transfer gradually narrows the modality gaps, facilitating the self-supervised +homography estimation to fully leverage the synthetic intra-modal data. The +self-supervised homography estimation progressively achieves reliable +predictions, thereby providing robust cross-modal supervision for the modality +transfer. To further boost the estimation accuracy, we also formulate a +fine-grained homography feature loss to improve the connection between two +components. Furthermore, we employ a simple yet effective distillation training +technique to reduce model parameters and improve cross-domain generalization +ability while maintaining comparable performance. Experiments reveal that +InterNet achieves the state-of-the-art (SOTA) performance among unsupervised +methods, and even outperforms many supervised methods such as MHN and +LocalTrans. + +
+
+
+
+
+ + ♻ ☆ A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts ECCV 2024 + + +
+ Monocular depth estimation is a critical task for autonomous driving and many +other computer vision applications. While significant progress has been made in +this field, the effects of viewpoint shifts on depth estimation models remain +largely underexplored. This paper introduces a novel dataset and evaluation +methodology to quantify the impact of different camera positions and +orientations on monocular depth estimation performance. We propose a ground +truth strategy based on homography estimation and object detection, eliminating +the need for expensive lidar sensors. We collect a diverse dataset of road +scenes from multiple viewpoints and use it to assess the robustness of a modern +depth estimation model to geometric shifts. After assessing the validity of our +strategy on a public dataset, we provide valuable insights into the limitations +of current models and highlight the importance of considering viewpoint +variations in real-world applications. + +
+
+ comment: 17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on + Vision-Centric Autonomous Driving (VCAD) +
+
+
+
+
+ + ♻ ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50% of papers do not +assess performance variability at all. Moreover, only one (0.5%) paper reported +confidence intervals (CIs) for model performance. (2) To address the reporting +bottleneck, we show that the unreported standard deviation (SD) in segmentation +papers can be approximated by a second-order polynomial function of the mean +Dice similarity coefficient (DSC). Based on external validation data from 56 +previous MICCAI challenges, we demonstrate that this approximation can +accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ Leveraging Anthropometric Measurements to Improve Human Mesh Estimation + and Ensure Consistent Body Shapes + + +
+ The basic body shape of a person does not change within a single video. +However, most SOTA human mesh estimation (HME) models output a slightly +different body shape for each video frame, which results in inconsistent body +shapes for the same person. In contrast, we leverage anthropometric +measurements like tailors are already obtaining from humans for centuries. We +create a model called A2B that converts such anthropometric measurements to +body shape parameters of human mesh models. Moreover, we find that finetuned +SOTA 3D human pose estimation (HPE) models outperform HME models regarding the +precision of the estimated keypoints. We show that applying inverse kinematics +(IK) to the results of such a 3D HPE model and combining the resulting body +pose with the A2B body shape leads to superior and consistent human meshes for +challenging datasets like ASPset or fit3D, where we can lower the MPJPE by over +30 mm compared to SOTA HME models. Further, replacing HME models estimates of +the body shape parameters with A2B model results not only increases the +performance of these HME models, but also leads to consistent body shapes. + +
+
+
+
+
+ + ♻ ☆ VideoPatchCore: An Effective Method to Memorize Normality for Video + Anomaly Detection ACCV 2024 + + +
+ Video anomaly detection (VAD) is a crucial task in video analysis and +surveillance within computer vision. Currently, VAD is gaining attention with +memory techniques that store the features of normal frames. The stored features +are utilized for frame reconstruction, identifying an abnormality when a +significant difference exists between the reconstructed and input frames. +However, this approach faces several challenges due to the simultaneous +optimization required for both the memory and encoder-decoder model. These +challenges include increased optimization difficulty, complexity of +implementation, and performance variability depending on the memory size. To +address these challenges,we propose an effective memory method for VAD, called +VideoPatchCore. Inspired by PatchCore, our approach introduces a structure that +prioritizes memory optimization and configures three types of memory tailored +to the characteristics of video data. This method effectively addresses the +limitations of existing memory-based methods, achieving good performance +comparable to state-of-the-art methods. Furthermore, our method requires no +training and is straightforward to implement, making VAD tasks more accessible. +Our code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore. + +
+
+ comment: Accepted to ACCV 2024 +
+
+
+
+
+ + ♻ ☆ SpaRED benchmark: Enhancing Gene Expression Prediction from Histology + Images with Spatial Transcriptomics Completion + + +
+ Spatial Transcriptomics is a novel technology that aligns histology images +with spatially resolved gene expression profiles. Although groundbreaking, it +struggles with gene capture yielding high corruption in acquired data. Given +potential applications, recent efforts have focused on predicting +transcriptomic profiles solely from histology images. However, differences in +databases, preprocessing techniques, and training hyperparameters hinder a fair +comparison between methods. To address these challenges, we present a +systematically curated and processed database collected from 26 public sources, +representing an 8.6-fold increase compared to previous works. Additionally, we +propose a state-of-the-art transformer based completion technique for inferring +missing gene expression, which significantly boosts the performance of +transcriptomic profile predictions across all datasets. Altogether, our +contributions constitute the most comprehensive benchmark of gene expression +prediction from histology images to date and a stepping stone for future +research on spatial transcriptomics. + +
+
+
+
+
+ + ♻ ☆ ChaosBench: A Multi-Channel, Physics-Based Benchmark for + Subseasonal-to-Seasonal Climate Prediction NeurIPS'24 + + +
+ Accurate prediction of climate in the subseasonal-to-seasonal scale is +crucial for disaster preparedness and robust decision making amidst climate +change. Yet, forecasting beyond the weather timescale is challenging because it +deals with problems other than initial condition, including boundary +interaction, butterfly effect, and our inherent lack of physical understanding. +At present, existing benchmarks tend to have shorter forecasting range of up-to +15 days, do not include a wide range of operational baselines, and lack +physics-based constraints for explainability. Thus, we propose ChaosBench, a +challenging benchmark to extend the predictability range of data-driven weather +emulators to S2S timescale. First, ChaosBench is comprised of variables beyond +the typical surface-atmospheric ERA5 to also include ocean, ice, and land +reanalysis products that span over 45 years to allow for full Earth system +emulation that respects boundary conditions. We also propose physics-based, in +addition to deterministic and probabilistic metrics, to ensure a +physically-consistent ensemble that accounts for butterfly effect. Furthermore, +we evaluate on a diverse set of physics-based forecasts from four national +weather agencies as baselines to our data-driven counterpart such as +ViT/ClimaX, PanguWeather, GraphCast, and FourCastNetV2. Overall, we find +methods originally developed for weather-scale applications fail on S2S task: +their performance simply collapse to an unskilled climatology. Nonetheless, we +outline and demonstrate several strategies that can extend the predictability +range of existing weather emulators, including the use of ensembles, robust +control of error propagation, and the use of physics-informed models. Our +benchmark, datasets, and instructions are available at +https://leap-stc.github.io/ChaosBench. + +
+
+ comment: Accepted as Oral in NeurIPS'24 D&B Track +
+
+
+
+
+ + ♻ ☆ A preliminary study on continual learning in computer vision using + Kolmogorov-Arnold Networks + + +
+ Deep learning has long been dominated by multi-layer perceptrons (MLPs), +which have demonstrated superiority over other optimizable models in various +domains. Recently, a new alternative to MLPs has emerged - Kolmogorov-Arnold +Networks (KAN)- which are based on a fundamentally different mathematical +framework. According to their authors, KANs address several major issues in +MLPs, such as catastrophic forgetting in continual learning scenarios. However, +this claim has only been supported by results from a regression task on a toy +1D dataset. In this paper, we extend the investigation by evaluating the +performance of KANs in continual learning tasks within computer vision, +specifically using the MNIST datasets. To this end, we conduct a structured +analysis of the behavior of MLPs and two KAN-based models in a +class-incremental learning scenario, ensuring that the architectures involved +have the same number of trainable parameters. Our results demonstrate that an +efficient version of KAN outperforms both traditional MLPs and the original KAN +implementation. We further analyze the influence of hyperparameters in MLPs and +KANs, as well as the impact of certain trainable parameters in KANs, such as +bias and scale weights. Additionally, we provide a preliminary investigation of +recent KAN-based convolutional networks and compare their performance with that +of traditional convolutional neural networks. Our codes can be found at +https://github.com/MrPio/KAN-Continual_Learning_tests. + +
+
+
+
+
+ + ♻ ☆ A Novel Framework for the Automated Characterization of Gram-Stained + Blood Culture Slides Using a Large-Scale Vision Transformer + + +
+ This study introduces a new framework for the artificial +intelligence-assisted characterization of Gram-stained whole-slide images +(WSIs). As a test for the diagnosis of bloodstream infections, Gram stains +provide critical early data to inform patient treatment. Rapid and reliable +analysis of Gram stains has been shown to be positively associated with better +clinical outcomes, underscoring the need for improved tools to automate Gram +stain analysis. In this work, we developed a novel transformer-based model for +Gram-stained WSI classification, which is more scalable to large datasets than +previous convolutional neural network (CNN) -based methods as it does not +require patch-level manual annotations. We also introduce a large Gram stain +dataset from Dartmouth-Hitchcock Medical Center (Lebanon, New Hampshire, USA) +to evaluate our model, exploring the classification of five major categories of +Gram-stained WSIs: Gram-positive cocci in clusters, Gram-positive cocci in +pairs/chains, Gram-positive rods, Gram-negative rods, and slides with no +bacteria. Our model achieves a classification accuracy of 0.858 (95% CI: 0.805, +0.905) and an AUC of 0.952 (95% CI: 0.922, 0.976) using five-fold nested +cross-validation on our 475-slide dataset, demonstrating the potential of +large-scale transformer models for Gram stain classification. We further +demonstrate the generalizability of our trained model, which achieves strong +performance on external datasets without additional fine-tuning. + +
+
+
+
+
+ + ♻ ☆ The Role of Masking for Efficient Supervised Knowledge Distillation of + Vision Transformers ECCV 2024 + + +
+ Knowledge distillation is an effective method for training lightweight vision +models. However, acquiring teacher supervision for training samples is often +costly, especially from large-scale models like vision transformers (ViTs). In +this paper, we develop a simple framework to reduce the supervision cost of ViT +distillation: masking out a fraction of input tokens given to the teacher. By +masking input tokens, one can skip the computations associated with the masked +tokens without requiring any change to teacher parameters or architecture. We +find that masking patches with the lowest student attention scores is highly +effective, saving up to 50% of teacher FLOPs without any drop in student +accuracy, while other masking criterion leads to suboptimal efficiency gains. +Through in-depth analyses, we reveal that the student-guided masking provides a +good curriculum to the student, making teacher supervision easier to follow +during the early stage and challenging in the later stage. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Deep Bayesian Future Fusion for Self-Supervised, High-Resolution, + Off-Road Mapping + + +
+ High-speed off-road navigation requires long-range, high-resolution maps to +enable robots to safely navigate over different surfaces while avoiding +dangerous obstacles. However, due to limited computational power and sensing +noise, most approaches to off-road mapping focus on producing coarse (20-40cm) +maps of the environment. In this paper, we propose Future Fusion, a framework +capable of generating dense, high-resolution maps from sparse sensing data (30m +forward at 2cm). This is accomplished by - (1) the efficient realization of the +well-known Bayes filtering within the standard deep learning models that +explicitly accounts for the sparsity pattern in stereo and LiDAR depth data, +and (2) leveraging perceptual losses common in generative image completion. The +proposed methodology outperforms the conventional baselines. Moreover, the +learned features and the completed dense maps lead to improvements in the +downstream navigation task. + +
+
+
+
+
+ + ♻ ☆ Lego: Learning to Disentangle and Invert Personalized Concepts Beyond + Object Appearance in Text-to-Image Diffusion Models + + +
+ Text-to-Image (T2I) models excel at synthesizing concepts such as nouns, +appearances, and styles. To enable customized content creation based on a few +example images of a concept, methods such as Textual Inversion and DreamBooth +invert the desired concept and enable synthesizing it in new scenes. However, +inverting personalized concepts that go beyond object appearance and style +(adjectives and verbs) through natural language remains a challenge. Two key +characteristics of these concepts contribute to the limitations of current +inversion methods. 1) Adjectives and verbs are entangled with nouns (subject) +and can hinder appearance-based inversion methods, where the subject appearance +leaks into the concept embedding, and 2) describing such concepts often extends +beyond single word embeddings. + In this study, we introduce Lego, a textual inversion method designed to +invert subject-entangled concepts from a few example images. Lego disentangles +concepts from their associated subjects using a simple yet effective Subject +Separation step and employs a Context Loss that guides the inversion of +single/multi-embedding concepts. In a thorough user study, Lego-generated +concepts were preferred over 70% of the time when compared to the baseline in +terms of authentically generating concepts according to a reference. +Additionally, visual question answering using an LLM suggested Lego-generated +concepts are better aligned with the text description of the concept. + +
+
+
+
+
+ + ♻ ☆ DeRainGS: Gaussian Splatting for Enhanced Scene Reconstruction in Rainy + Environments + + +
+ Reconstruction under adverse rainy conditions poses significant challenges +due to reduced visibility and the distortion of visual perception. These +conditions can severely impair the quality of geometric maps, which is +essential for applications ranging from autonomous planning to environmental +monitoring. In response to these challenges, this study introduces the novel +task of 3D Reconstruction in Rainy Environments (3DRRE), specifically designed +to address the complexities of reconstructing 3D scenes under rainy conditions. +To benchmark this task, we construct the HydroViews dataset that comprises a +diverse collection of both synthesized and real-world scene images +characterized by various intensities of rain streaks and raindrops. +Furthermore, we propose DeRainGS, the first 3DGS method tailored for +reconstruction in adverse rainy environments. Extensive experiments across a +wide range of rain scenarios demonstrate that our method delivers +state-of-the-art performance, remarkably outperforming existing occlusion-free +methods. + +
+
+
+
+
+ + ♻ ☆ High-Frequency Anti-DreamBooth: Robust Defense against Personalized + Image Synthesis ECCV 2024 + + +
+ Recently, text-to-image generative models have been misused to create +unauthorized malicious images of individuals, posing a growing social problem. +Previous solutions, such as Anti-DreamBooth, add adversarial noise to images to +protect them from being used as training data for malicious generation. +However, we found that the adversarial noise can be removed by adversarial +purification methods such as DiffPure. Therefore, we propose a new adversarial +attack method that adds strong perturbation on the high-frequency areas of +images to make it more robust to adversarial purification. Our experiment +showed that the adversarial images retained noise even after adversarial +purification, hindering malicious image generation. + +
+
+ comment: ECCV 2024 Workshop The Dark Side of Generative AIs and Beyond +
+
+
+
+
+ + ♻ ☆ Diffusion-based RGB-D Semantic Segmentation with Deformable Attention + Transformer + + +
+ Vision-based perception and reasoning is essential for scene understanding in +any autonomous system. RGB and depth images are commonly used to capture both +the semantic and geometric features of the environment. Developing methods to +reliably interpret this data is critical for real-world applications, where +noisy measurements are often unavoidable. In this work, we introduce a +diffusion-based framework to address the RGB-D semantic segmentation problem. +Additionally, we demonstrate that utilizing a Deformable Attention Transformer +as the encoder to extract features from depth images effectively captures the +characteristics of invalid regions in depth measurements. Our generative +framework shows a greater capacity to model the underlying distribution of +RGB-D images, achieving robust performance in challenging scenarios with +significantly less training time compared to discriminative methods. +Experimental results indicate that our approach achieves State-of-the-Art +performance on both the NYUv2 and SUN-RGBD datasets in general and especially +in the most challenging of their image data. Our project page will be available +at https://diffusionmms.github.io/ + +
+
+
+
+
+ + ♻ ☆ I2EBench: A Comprehensive Benchmark for Instruction-based Image Editing NeurIPS2024 + + +
+ Significant progress has been made in the field of Instruction-based Image +Editing (IIE). However, evaluating these models poses a significant challenge. +A crucial requirement in this field is the establishment of a comprehensive +evaluation benchmark for accurately assessing editing results and providing +valuable insights for its further development. In response to this need, we +propose I2EBench, a comprehensive benchmark designed to automatically evaluate +the quality of edited images produced by IIE models from multiple dimensions. +I2EBench consists of 2,000+ images for editing, along with 4,000+ corresponding +original and diverse instructions. It offers three distinctive characteristics: +1) Comprehensive Evaluation Dimensions: I2EBench comprises 16 evaluation +dimensions that cover both high-level and low-level aspects, providing a +comprehensive assessment of each IIE model. 2) Human Perception Alignment: To +ensure the alignment of our benchmark with human perception, we conducted an +extensive user study for each evaluation dimension. 3) Valuable Research +Insights: By analyzing the advantages and disadvantages of existing IIE models +across the 16 dimensions, we offer valuable research insights to guide future +development in the field. We will open-source I2EBench, including all +instructions, input images, human annotations, edited images from all evaluated +methods, and a simple script for evaluating the results from new IIE models. +The code, dataset and generated images from all IIE models are provided in +github: https://github.com/cocoshe/I2EBench. + +
+
+ comment: NeurIPS2024, 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Hierarchical Windowed Graph Attention Network and a Large Scale Dataset + for Isolated Indian Sign Language Recognition + + +
+ Automatic Sign Language (SL) recognition is an important task in the computer +vision community. To build a robust SL recognition system, we need a +considerable amount of data which is lacking particularly in Indian sign +language (ISL). In this paper, we introduce a large-scale isolated ISL dataset +and a novel SL recognition model based on skeleton graph structure. The dataset +covers 2002 daily used common words in the deaf community recorded by 20 (10 +male and 10 female) deaf adult signers (contains 40033 videos). We propose a SL +recognition model namely Hierarchical Windowed Graph Attention Network (HWGAT) +by utilizing the human upper body skeleton graph. The HWGAT tries to capture +distinctive motions by giving attention to different body parts induced by the +human skeleton graph. The utility of the proposed dataset and the usefulness of +our model are evaluated through extensive experiments. We pre-trained the +proposed model on the presented dataset and fine-tuned it across different sign +language datasets further boosting the performance of 1.10, 0.46, 0.78, and +6.84 percentage points on INCLUDE, LSA64, AUTSL and WLASL respectively compared +to the existing state-of-the-art keypoints-based models. + +
+
+
+
+
+ + ♻ ☆ TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for + Robotic Manipulation + + +
+ Vision-Language-Action (VLA) models have shown remarkable potential in +visuomotor control and instruction comprehension through end-to-end learning +processes. However, current VLA models face significant challenges: they are +slow during inference and require extensive pre-training on large amounts of +robotic data, making real-world deployment difficult. In this paper, we +introduce a new family of compact vision-language-action models, called +TinyVLA, which offers two key advantages over existing VLA models: (1) faster +inference speeds, and (2) improved data efficiency, eliminating the need for +pre-training stage. Our framework incorporates two essential components to +build TinyVLA: (1) initializing the policy backbone with robust, high-speed +multimodal models, and (2) integrating a diffusion policy decoder during +fine-tuning to enable precise robot actions. We conducted extensive evaluations +of TinyVLA in both simulation and on real robots, demonstrating that our +approach significantly outperforms the state-of-the-art VLA model, OpenVLA, in +terms of speed and data efficiency, while delivering comparable or superior +performance. Additionally, TinyVLA exhibits strong generalization capabilities +across various dimensions, including language instructions, novel objects, +unseen positions, changes in object appearance, background variations, and +environmental shifts, often matching or exceeding the performance of OpenVLA. +We believe that \methodname offers an interesting perspective on utilizing +pre-trained multimodal models for policy learning. Our project is at +https://tiny-vla.github.io. + +
+
+ comment: add more citations +
+
+
+
+
+ + ♻ ☆ Implicit Image-to-Image Schrodinger Bridge for Image Restoration + + +
+ Diffusion-based models are widely recognized for their effectiveness in image +restoration tasks; however, their iterative denoising process, which begins +from Gaussian noise, often results in slow inference speeds. The Image-to-Image +Schr\"odinger Bridge (I$^2$SB) presents a promising alternative by starting the +generative process from corrupted images and leveraging training techniques +from score-based diffusion models. In this paper, we introduce the Implicit +Image-to-Image Schr\"odinger Bridge (I$^3$SB) to further accelerate the +generative process of I$^2$SB. I$^3$SB reconfigures the generative process into +a non-Markovian framework by incorporating the initial corrupted image into +each step, while ensuring that the marginal distribution aligns with that of +I$^2$SB. This allows for the direct use of the pretrained network from I$^2$SB. +Extensive experiments on natural images, human face images, and medical images +validate the acceleration benefits of I$^3$SB. Compared to I$^2$SB, I$^3$SB +achieves the same perceptual quality with fewer generative steps, while +maintaining equal or improved fidelity to the ground truth. + +
+
+ comment: 23 pages, 8 figures, submitted to Pattern Recognition +
+
+
+
+
+ + ♻ ☆ CCFExp: Facial Image Synthesis with Cycle Cross-Fusion Diffusion Model + for Facial Paralysis Individuals + + +
+ Facial paralysis is a debilitating condition that affects the movement of +facial muscles, leading to a significant loss of facial expressions. Currently, +the diagnosis of facial paralysis remains a challenging task, often relying +heavily on the subjective judgment and experience of clinicians, which can +introduce variability and uncertainty in the assessment process. One promising +application in real-life situations is the automatic estimation of facial +paralysis. However, the scarcity of facial paralysis datasets limits the +development of robust machine learning models for automated diagnosis and +therapeutic interventions. To this end, this study aims to synthesize a +high-quality facial paralysis dataset to address this gap, enabling more +accurate and efficient algorithm training. Specifically, a novel Cycle +Cross-Fusion Expression Generative Model (CCFExp) based on the diffusion model +is proposed to combine different features of facial information and enhance the +visual details of facial appearance and texture in facial regions, thus +creating synthetic facial images that accurately represent various degrees and +types of facial paralysis. We have qualitatively and quantitatively evaluated +the proposed method on the commonly used public clinical datasets of facial +paralysis to demonstrate its effectiveness. Experimental results indicate that +the proposed method surpasses state-of-the-art methods, generating more +realistic facial images and maintaining identity consistency. + +
+
+
+
+
+ + ♻ ☆ Platypose: Calibrated Zero-Shot Multi-Hypothesis 3D Human Motion + Estimation + + +
+ Single camera 3D pose estimation is an ill-defined problem due to inherent +ambiguities from depth, occlusion or keypoint noise. Multi-hypothesis pose +estimation accounts for this uncertainty by providing multiple 3D poses +consistent with the 2D measurements. Current research has predominantly +concentrated on generating multiple hypotheses for single frame static pose +estimation or single hypothesis motion estimation. In this study we focus on +the new task of multi-hypothesis motion estimation. Multi-hypothesis motion +estimation is not simply multi-hypothesis pose estimation applied to multiple +frames, which would ignore temporal correlation across frames. Instead, it +requires distributions which are capable of generating temporally consistent +samples, which is significantly more challenging than multi-hypothesis pose +estimation or single-hypothesis motion estimation. To this end, we introduce +Platypose, a framework that uses a diffusion model pretrained on 3D human +motion sequences for zero-shot 3D pose sequence estimation. Platypose +outperforms baseline methods on multiple hypotheses for motion estimation. +Additionally, Platypose also achieves state-of-the-art calibration and +competitive joint error when tested on static poses from Human3.6M, +MPI-INF-3DHP and 3DPW. Finally, because it is zero-shot, our method generalizes +flexibly to different settings such as multi-camera inference. + +
+
+
+
+
+ + ♻ ☆ EMR-Merging: Tuning-Free High-Performance Model Merging NeurIPS 2024 + + +
+ The success of pretrain-finetune paradigm brings about the release of +numerous model weights. In this case, merging models finetuned on different +tasks to enable a single model with multi-task capabilities is gaining +increasing attention for its practicability. Existing model merging methods +usually suffer from (1) significant performance degradation or (2) requiring +tuning by additional data or training. In this paper, we rethink and analyze +the existing model merging paradigm. We discover that using a single model's +weights can hardly simulate all the models' performance. To tackle this issue, +we propose Elect, Mask & Rescale-Merging (EMR-Merging). We first (a) elect a +unified model from all the model weights and then (b) generate extremely +lightweight task-specific modulators, including masks and rescalers, to align +the direction and magnitude between the unified model and each specific model, +respectively. EMR-Merging is tuning-free, thus requiring no data availability +or any additional training while showing impressive performance. We find that +EMR-Merging shows outstanding performance compared to existing merging methods +under different classical and newly-established settings, including merging +different numbers of vision models (up to 30), NLP models, PEFT models, and +multi-modal models. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ FracGM: A Fast Fractional Programming Technique for Geman-McClure Robust + Estimator + + +
+ Robust estimation is essential in computer vision, robotics, and navigation, +aiming to minimize the impact of outlier measurements for improved accuracy. We +present a fast algorithm for Geman-McClure robust estimation, FracGM, +leveraging fractional programming techniques. This solver reformulates the +original non-convex fractional problem to a convex dual problem and a linear +equation system, iteratively solving them in an alternating optimization +pattern. Compared to graduated non-convexity approaches, this strategy exhibits +a faster convergence rate and better outlier rejection capability. In addition, +the global optimality of the proposed solver can be guaranteed under given +conditions. We demonstrate the proposed FracGM solver with Wahba's rotation +problem and 3-D point-cloud registration along with relaxation pre-processing +and projection post-processing. Compared to state-of-the-art algorithms, when +the outlier rates increase from 20% to 80%, FracGM shows 53% and 88% lower +rotation and translation increases. In real-world scenarios, FracGM achieves +better results in 13 out of 18 outcomes, while having a 19.43% improvement in +the computation time. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ 2D or not 2D: How Does the Dimensionality of Gesture Representation + Affect 3D Co-Speech Gesture Generation? + + +
+ Co-speech gestures are fundamental for communication. The advent of recent +deep learning techniques has facilitated the creation of lifelike, synchronous +co-speech gestures for Embodied Conversational Agents. "In-the-wild" datasets, +aggregating video content from platforms like YouTube via human pose detection +technologies, provide a feasible solution by offering 2D skeletal sequences +aligned with speech. Concurrent developments in lifting models enable the +conversion of these 2D sequences into 3D gesture databases. However, it is +important to note that the 3D poses estimated from the 2D extracted poses are, +in essence, approximations of the ground-truth, which remains in the 2D domain. +This distinction raises questions about the impact of gesture representation +dimensionality on the quality of generated motions - a topic that, to our +knowledge, remains largely unexplored. Our study examines the effect of using +either 2D or 3D joint coordinates as training data on the performance of +speech-to-gesture deep generative models. We employ a lifting model for +converting generated 2D pose sequences into 3D and assess how gestures created +directly in 3D stack up against those initially generated in 2D and then +converted to 3D. We perform an objective evaluation using widely used metrics +in the gesture generation field as well as a user study to qualitatively +evaluate the different approaches. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.15111 +
+
+
+
+
+ + ♻ ☆ JVID: Joint Video-Image Diffusion for Visual-Quality and + Temporal-Consistency in Video Generation + + +
+ We introduce the Joint Video-Image Diffusion model (JVID), a novel approach +to generating high-quality and temporally coherent videos. We achieve this by +integrating two diffusion models: a Latent Image Diffusion Model (LIDM) trained +on images and a Latent Video Diffusion Model (LVDM) trained on video data. Our +method combines these models in the reverse diffusion process, where the LIDM +enhances image quality and the LVDM ensures temporal consistency. This unique +combination allows us to effectively handle the complex spatio-temporal +dynamics in video generation. Our results demonstrate quantitative and +qualitative improvements in producing realistic and coherent videos. + +
+
+
+
+
+ + ♻ ☆ Trio-ViT: Post-Training Quantization and Acceleration for Softmax-Free + Efficient Vision Transformer + + +
+ Motivated by the huge success of Transformers in the field of natural +language processing (NLP), Vision Transformers (ViTs) have been rapidly +developed and achieved remarkable performance in various computer vision tasks. +However, their huge model sizes and intensive computations hinder ViTs' +deployment on embedded devices, calling for effective model compression +methods, such as quantization. Unfortunately, due to the existence of +hardware-unfriendly and quantization-sensitive non-linear operations, +particularly {Softmax}, it is non-trivial to completely quantize all operations +in ViTs, yielding either significant accuracy drops or non-negligible hardware +costs. In response to challenges associated with \textit{standard ViTs}, we +focus our attention towards the quantization and acceleration for +\textit{efficient ViTs}, which not only eliminate the troublesome Softmax but +also integrate linear attention with low computational complexity, and propose +Trio-ViT accordingly. Specifically, at the algorithm level, we develop a +{tailored post-training quantization engine} taking the unique activation +distributions of Softmax-free efficient ViTs into full consideration, aiming to +boost quantization accuracy. Furthermore, at the hardware level, we build an +accelerator dedicated to the specific Convolution-Transformer hybrid +architecture of efficient ViTs, thereby enhancing hardware efficiency. +Extensive experimental results consistently prove the effectiveness of our +Trio-ViT framework. {Particularly, we can gain up to +$\uparrow$$\mathbf{3.6}\times$, $\uparrow$$\mathbf{5.0}\times$, and +$\uparrow$$\mathbf{7.3}\times$ FPS under comparable accuracy over +state-of-the-art ViT accelerators, as well as $\uparrow$$\mathbf{6.0}\times$, +$\uparrow$$\mathbf{1.5}\times$, and $\uparrow$$\mathbf{2.1}\times$ DSP +efficiency.} Codes are available at +\url{https://github.com/shihuihong214/Trio-ViT}. + +
+
+
+
+
+ + ♻ ☆ Personalized Video Relighting With an At-Home Light Stage + + +
+ In this paper, we develop a personalized video relighting algorithm that +produces high-quality and temporally consistent relit videos under any pose, +expression, and lighting condition in real-time. Existing relighting algorithms +typically rely either on publicly available synthetic data, which yields poor +relighting results, or on actual light stage data which is difficult to +acquire. We show that by just capturing recordings of a user watching YouTube +videos on a monitor we can train a personalized algorithm capable of performing +high-quality relighting under any condition. Our key contribution is a novel +image-based neural relighting architecture that effectively separates the +intrinsic appearance features - the geometry and reflectance of the face - from +the source lighting and then combines them with the target lighting to generate +a relit image. This neural architecture enables smoothing of intrinsic +appearance features leading to temporally stable video relighting. Both +qualitative and quantitative evaluations show that our architecture improves +portrait image relighting quality and temporal consistency over +state-of-the-art approaches on both casually captured `Light Stage at Your +Desk' (LSYD) and light-stage-captured `One Light At a Time' (OLAT) datasets. + +
+
+
+
+
+ + ♻ ☆ SharkTrack: an accurate, generalisable software for streamlining shark + and ray underwater video analysis + + +
+ Elasmobranchs (shark sand rays) represent a critical component of marine +ecosystems. Yet, they are experiencing global population declines and effective +monitoring of populations is essential to their protection. Underwater +stationary videos, such as those from Baited Remote Underwater Video Stations +(BRUVS), are critical for understanding elasmobranch spatial ecology and +abundance. However, processing these videos requires time-consuming manual +analysis that can delay conservation. To address this challenge, we developed +SharkTrack, a semi-automatic underwater video analysis software. SharkTrack +uses Convolutional Neural Networks (CNN) and Multi-Object Tracking to +automatically detect and track elasmobranchs and provides an annotation +pipeline to manually classify elasmobranch species and compute species-specific +MaxN (ssMaxN), the standard metric of relative abundance. When tested on BRUVS +footage from locations unseen by the CNN model during training, SharkTrack +computed ssMaxN with 89% accuracy over 207 hours of footage. The semi-automatic +SharkTrack pipeline required two minutes of manual classification per hour of +video, an estimated 95% reduction of manual analysis time compared to +traditional methods. Furthermore, we demonstrate SharkTrack accuracy across +diverse marine ecosystems and elasmobranch species, an advancement compared to +previous models, which were limited to specific species or locations. +SharkTrack applications extend beyond BRUVS, facilitating the analysis of any +underwater stationary video. By making video analysis faster and more +accessible, SharkTrack enables research and conservation organisations to +monitor elasmobranch populations more efficiently, thereby improving +conservation efforts. To further support these goals, we provide public access +to the SharkTrack software. + +
+
+
+
+
+ + ♻ ☆ Efficient Exploration of Image Classifier Failures with Bayesian + Optimization and Text-to-Image Models + + +
+ Image classifiers should be used with caution in the real world. Performance +evaluated on a validation set may not reflect performance in the real world. In +particular, classifiers may perform well for conditions that are frequently +encountered during training, but poorly for other infrequent conditions. In +this study, we hypothesize that recent advances in text-to-image generative +models make them valuable for benchmarking computer vision models such as image +classifiers: they can generate images conditioned by textual prompts that cause +classifier failures, allowing failure conditions to be described with textual +attributes. However, their generation cost becomes an issue when a large number +of synthetic images need to be generated, which is the case when many different +attribute combinations need to be tested. We propose an image classifier +benchmarking method as an iterative process that alternates image generation, +classifier evaluation, and attribute selection. This method efficiently +explores the attributes that ultimately lead to poor behavior detection. + +
+
+
+
+
+ + ♻ ☆ Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object + Detector ECCV2024 + + +
+ This paper studies the challenging cross-domain few-shot object detection +(CD-FSOD), aiming to develop an accurate object detector for novel domains with +minimal labeled examples. While transformer-based open-set detectors, such as +DE-ViT, show promise in traditional few-shot object detection, their +generalization to CD-FSOD remains unclear: 1) can such open-set detection +methods easily generalize to CD-FSOD? 2) If not, how can models be enhanced +when facing huge domain gaps? To answer the first question, we employ measures +including style, inter-class variance (ICV), and indefinable boundaries (IB) to +understand the domain gap. Based on these measures, we establish a new +benchmark named CD-FSOD to evaluate object detection methods, revealing that +most of the current approaches fail to generalize across domains. Technically, +we observe that the performance decline is associated with our proposed +measures: style, ICV, and IB. Consequently, we propose several novel modules to +address these issues. First, the learnable instance features align initial +fixed instances with target categories, enhancing feature distinctiveness. +Second, the instance reweighting module assigns higher importance to +high-quality instances with slight IB. Third, the domain prompter encourages +features resilient to different styles by synthesizing imaginary domains +without altering semantic contents. These techniques collectively contribute to +the development of the Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO), +significantly improving upon the base DE-ViT. Experimental results validate the +efficacy of our model. + +
+
+ comment: Accepted by ECCV2024 (project website: + http://yuqianfu.com/CDFSOD-benchmark) +
+
+
+
+
+ + ♻ ☆ CauSkelNet: Causal Representation Learning for Human Behaviour Analysis + + +
+ Constrained by the lack of model interpretability and a deep understanding of +human movement in traditional movement recognition machine learning methods, +this study introduces a novel representation learning method based on causal +inference to better understand human joint dynamics and complex behaviors. We +propose a two-stage framework that combines the Peter-Clark (PC) algorithm and +Kullback-Leibler (KL) divergence to identify and quantify causal relationships +between joints. Our method effectively captures interactions and produces +interpretable, robust representations. Experiments on the EmoPain dataset show +that our causal GCN outperforms traditional GCNs in accuracy, F1 score, and +recall, especially in detecting protective behaviors. The model is also highly +invariant to data scale changes, enhancing its reliability in practical +applications. Our approach advances human motion analysis and paves the way for +more adaptive intelligent healthcare solutions. + +
+
+
+
+
+ + ♻ ☆ Ultra-High-Definition Image Restoration: New Benchmarks and A Dual + Interaction Prior-Driven Solution + + +
+ Ultra-High-Definition (UHD) image restoration has acquired remarkable +attention due to its practical demand. In this paper, we construct UHD snow and +rain benchmarks, named UHD-Snow and UHD-Rain, to remedy the deficiency in this +field. The UHD-Snow/UHD-Rain is established by simulating the physics process +of rain/snow into consideration and each benchmark contains 3200 degraded/clear +image pairs of 4K resolution. Furthermore, we propose an effective UHD image +restoration solution by considering gradient and normal priors in model design +thanks to these priors' spatial and detail contributions. Specifically, our +method contains two branches: (a) feature fusion and reconstruction branch in +high-resolution space and (b) prior feature interaction branch in +low-resolution space. The former learns high-resolution features and fuses +prior-guided low-resolution features to reconstruct clear images, while the +latter utilizes normal and gradient priors to mine useful spatial features and +detail features to guide high-resolution recovery better. To better utilize +these priors, we introduce single prior feature interaction and dual prior +feature interaction, where the former respectively fuses normal and gradient +priors with high-resolution features to enhance prior ones, while the latter +calculates the similarity between enhanced prior ones and further exploits dual +guided filtering to boost the feature interaction of dual priors. We conduct +experiments on both new and existing public datasets and demonstrate the +state-of-the-art performance of our method on UHD image low-light enhancement, +dehazing, deblurring, desonwing, and deraining. The source codes and benchmarks +are available at \url{https://github.com/wlydlut/UHDDIP}. + +
+
+
+
+
+ + ♻ ☆ TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and + Proprioception Estimation CoRL 2024 + + +
+ Legged navigation is typically examined within open-world, off-road, and +challenging environments. In these scenarios, estimating external disturbances +requires a complex synthesis of multi-modal information. This underlines a +major limitation in existing works that primarily focus on avoiding obstacles. +In this work, we propose TOP-Nav, a novel legged navigation framework that +integrates a comprehensive path planner with Terrain awareness, Obstacle +avoidance and close-loop Proprioception. TOP-Nav underscores the synergies +between vision and proprioception in both path and motion planning. Within the +path planner, we present and integrate a terrain estimator that enables the +robot to select waypoints on terrains with higher traversability while +effectively avoiding obstacles. In the motion planning level, we not only +implement a locomotion controller to track the navigation commands, but also +construct a proprioception advisor to provide motion evaluations for the path +planner. Based on the close-loop motion feedback, we make online corrections +for the vision-based terrain and obstacle estimations. Consequently, TOP-Nav +achieves open-world navigation that the robot can handle terrains or +disturbances beyond the distribution of prior knowledge and overcomes +constraints imposed by visual conditions. Building upon extensive experiments +conducted in both simulation and real-world environments, TOP-Nav demonstrates +superior performance in open-world navigation compared to existing methods. + +
+
+ comment: Published on CoRL 2024 +
+
+
+
+
+ + ♻ ☆ Transformer with Leveraged Masked Autoencoder for video-based Pain + Assessment + + +
+ Accurate pain assessment is crucial in healthcare for effective diagnosis and +treatment; however, traditional methods relying on self-reporting are +inadequate for populations unable to communicate their pain. Cutting-edge AI is +promising for supporting clinicians in pain recognition using facial video +data. In this paper, we enhance pain recognition by employing facial video +analysis within a Transformer-based deep learning model. By combining a +powerful Masked Autoencoder with a Transformers-based classifier, our model +effectively captures pain level indicators through both expressions and +micro-expressions. We conducted our experiment on the AI4Pain dataset, which +produced promising results that pave the way for innovative healthcare +solutions that are both comprehensive and objective. + +
+
+
+
+
+ + ♻ ☆ Lemon and Orange Disease Classification using CNN-Extracted Features and + Machine Learning Classifier + + +
+ Lemons and oranges, both are the most economically significant citrus fruits +globally. The production of lemons and oranges is severely affected due to +diseases in its growth stages. Fruit quality has degraded due to the presence +of flaws. Thus, it is necessary to diagnose the disease accurately so that we +can avoid major loss of lemons and oranges. To improve citrus farming, we +proposed a disease classification approach for lemons and oranges. This +approach would enable early disease detection and intervention, reduce yield +losses, and optimize resource allocation. For the initial modeling of disease +classification, the research uses innovative deep learning architectures such +as VGG16, VGG19 and ResNet50. In addition, for achieving better accuracy, the +basic machine learning algorithms used for classification problems include +Random Forest, Naive Bayes, K-Nearest Neighbors (KNN) and Logistic Regression. +The lemon and orange fruits diseases are classified more accurately (95.0% for +lemon and 99.69% for orange) by the model. The model's base features were +extracted from the ResNet50 pre-trained model and the diseases are classified +by the Logistic Regression which beats the performance given by VGG16 and VGG19 +for other classifiers. Experimental outcomes show that the proposed model also +outperforms existing models in which most of them classified the diseases using +the Softmax classifier without using any individual classifiers. + +
+
+
+
+
+ + ♻ ☆ FedRepOpt: Gradient Re-parametrized Optimizers in Federated Learning + + +
+ Federated Learning (FL) has emerged as a privacy-preserving method for +training machine learning models in a distributed manner on edge devices. +However, on-device models face inherent computational power and memory +limitations, potentially resulting in constrained gradient updates. As the +model's size increases, the frequency of gradient updates on edge devices +decreases, ultimately leading to suboptimal training outcomes during any +particular FL round. This limits the feasibility of deploying advanced and +large-scale models on edge devices, hindering the potential for performance +enhancements. To address this issue, we propose FedRepOpt, a gradient +re-parameterized optimizer for FL. The gradient re-parameterized method allows +training a simple local model with a similar performance as a complex model by +modifying the optimizer's gradients according to a set of model-specific +hyperparameters obtained from the complex models. In this work, we focus on +VGG-style and Ghost-style models in the FL environment. Extensive experiments +demonstrate that models using FedRepOpt obtain a significant boost in +performance of 16.7% and 11.4% compared to the RepGhost-style and RepVGG-style +networks, while also demonstrating a faster convergence time of 11.7% and 57.4% +compared to their complex structure. + +
+
+
+
+
+ + ♻ ☆ Compact 3D Gaussian Splatting For Dense Visual SLAM + + +
+ Recent work has shown that 3D Gaussian-based SLAM enables high-quality +reconstruction, accurate pose estimation, and real-time rendering of scenes. +However, these approaches are built on a tremendous number of redundant 3D +Gaussian ellipsoids, leading to high memory and storage costs, and slow +training speed. To address the limitation, we propose a compact 3D Gaussian +Splatting SLAM system that reduces the number and the parameter size of +Gaussian ellipsoids. A sliding window-based masking strategy is first proposed +to reduce the redundant ellipsoids. Then we observe that the covariance matrix +(geometry) of most 3D Gaussian ellipsoids are extremely similar, which +motivates a novel geometry codebook to compress 3D Gaussian geometric +attributes, i.e., the parameters. Robust and accurate pose estimation is +achieved by a global bundle adjustment method with reprojection loss. Extensive +experiments demonstrate that our method achieves faster training and rendering +speed while maintaining the state-of-the-art (SOTA) quality of the scene +representation. + +
+
+
+
+
+ + ♻ ☆ GenFace: A Large-Scale Fine-Grained Face Forgery Benchmark and Cross + Appearance-Edge Learning + + +
+ The rapid advancement of photorealistic generators has reached a critical +juncture where the discrepancy between authentic and manipulated images is +increasingly indistinguishable. Thus, benchmarking and advancing techniques +detecting digital manipulation become an urgent issue. Although there have been +a number of publicly available face forgery datasets, the forgery faces are +mostly generated using GAN-based synthesis technology, which does not involve +the most recent technologies like diffusion. The diversity and quality of +images generated by diffusion models have been significantly improved and thus +a much more challenging face forgery dataset shall be used to evaluate SOTA +forgery detection literature. In this paper, we propose a large-scale, diverse, +and fine-grained high-fidelity dataset, namely GenFace, to facilitate the +advancement of deepfake detection, which contains a large number of forgery +faces generated by advanced generators such as the diffusion-based model and +more detailed labels about the manipulation approaches and adopted generators. +In addition to evaluating SOTA approaches on our benchmark, we design an +innovative cross appearance-edge learning (CAEL) detector to capture +multi-grained appearance and edge global representations, and detect +discriminative and general forgery traces. Moreover, we devise an +appearance-edge cross-attention (AECA) module to explore the various +integrations across two domains. Extensive experiment results and +visualizations show that our detection model outperforms the state of the arts +on different settings like cross-generator, cross-forgery, and cross-dataset +evaluations. Code and datasets will be available at +\url{https://github.com/Jenine-321/GenFace + +
+
+ comment: Accepted by IEEE Transactions on Information Forensics and Security +
+
+
+
+
+ + ♻ ☆ Perception-Guided Quality Metric of 3D Point Clouds Using Hybrid + Strategy + + +
+ Full-reference point cloud quality assessment (FR-PCQA) aims to infer the +quality of distorted point clouds with available references. Most of the +existing FR-PCQA metrics ignore the fact that the human visual system (HVS) +dynamically tackles visual information according to different distortion levels +(i.e., distortion detection for high-quality samples and appearance perception +for low-quality samples) and measure point cloud quality using unified +features. To bridge the gap, in this paper, we propose a perception-guided +hybrid metric (PHM) that adaptively leverages two visual strategies with +respect to distortion degree to predict point cloud quality: to measure visible +difference in high-quality samples, PHM takes into account the masking effect +and employs texture complexity as an effective compensatory factor for absolute +difference; on the other hand, PHM leverages spectral graph theory to evaluate +appearance degradation in low-quality samples. Variations in geometric signals +on graphs and changes in the spectral graph wavelet coefficients are utilized +to characterize geometry and texture appearance degradation, respectively. +Finally, the results obtained from the two components are combined in a +non-linear method to produce an overall quality score of the tested point +cloud. The results of the experiment on five independent databases show that +PHM achieves state-of-the-art (SOTA) performance and offers significant +performance improvement in multiple distortion environments. The code is +publicly available at https://github.com/zhangyujie-1998/PHM. + +
+
+
+
+
+ + ♻ ☆ High-Fidelity GAN Inversion for Image Attribute Editing CVPR 2022 + + +
+ We present a novel high-fidelity generative adversarial network (GAN) +inversion framework that enables attribute editing with image-specific details +well-preserved (e.g., background, appearance, and illumination). We first +analyze the challenges of high-fidelity GAN inversion from the perspective of +lossy data compression. With a low bit-rate latent code, previous works have +difficulties in preserving high-fidelity details in reconstructed and edited +images. Increasing the size of a latent code can improve the accuracy of GAN +inversion but at the cost of inferior editability. To improve image fidelity +without compromising editability, we propose a distortion consultation approach +that employs a distortion map as a reference for high-fidelity reconstruction. +In the distortion consultation inversion (DCI), the distortion map is first +projected to a high-rate latent map, which then complements the basic low-rate +latent code with more details via consultation fusion. To achieve high-fidelity +editing, we propose an adaptive distortion alignment (ADA) module with a +self-supervised training scheme, which bridges the gap between the edited and +inversion images. Extensive experiments in the face and car domains show a +clear improvement in both inversion and editing quality. + +
+
+ comment: CVPR 2022; Project Page is at https://tengfei-wang.github.io/HFGI/ +
+
+
+
+
+ + ♻ ☆ DAC: 2D-3D Retrieval with Noisy Labels via Divide-and-Conquer Alignment + and Correction ACM MM 2024 + + +
+ With the recent burst of 2D and 3D data, cross-modal retrieval has attracted +increasing attention recently. However, manual labeling by non-experts will +inevitably introduce corrupted annotations given ambiguous 2D/3D content. +Though previous works have addressed this issue by designing a naive division +strategy with hand-crafted thresholds, their performance generally exhibits +great sensitivity to the threshold value. Besides, they fail to fully utilize +the valuable supervisory signals within each divided subset. To tackle this +problem, we propose a Divide-and-conquer 2D-3D cross-modal Alignment and +Correction framework (DAC), which comprises Multimodal Dynamic Division (MDD) +and Adaptive Alignment and Correction (AAC). Specifically, the former performs +accurate sample division by adaptive credibility modeling for each sample based +on the compensation information within multimodal loss distribution. Then in +AAC, samples in distinct subsets are exploited with different alignment +strategies to fully enhance the semantic compactness and meanwhile alleviate +over-fitting to noisy labels, where a self-correction strategy is introduced to +improve the quality of representation. Moreover. To evaluate the effectiveness +in real-world scenarios, we introduce a challenging noisy benchmark, namely +Objaverse-N200, which comprises 200k-level samples annotated with 1156 +realistic noisy labels. Extensive experiments on both traditional and the newly +proposed benchmarks demonstrate the generality and superiority of our DAC, +where DAC outperforms state-of-the-art models by a large margin. (i.e., with ++5.9% gain on ModelNet40 and +5.8% on Objaverse-N200). + +
+
+ comment: accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models NIPS 2024 + + +
+ Diffusion models have revolutionized customized text-to-image generation, +allowing for efficient synthesis of photos from personal data with textual +descriptions. However, these advancements bring forth risks including privacy +breaches and unauthorized replication of artworks. Previous researches +primarily center around using prompt-specific methods to generate adversarial +examples to protect personal images, yet the effectiveness of existing methods +is hindered by constrained adaptability to different prompts. In this paper, we +introduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for +customized diffusion models. PAP first models the prompt distribution using a +Laplace Approximation, and then produces prompt-agnostic perturbations by +maximizing a disturbance expectation based on the modeled distribution. This +approach effectively tackles the prompt-agnostic attacks, leading to improved +defense stability. Extensive experiments in face privacy and artistic style +protection, demonstrate the superior generalization of PAP in comparison to +existing techniques. Our project page is available at +https://github.com/vancyland/Prompt-Agnostic-Adversarial-Perturbation-for-Customized-Diffusion-Models.github.io. + +
+
+ comment: Accepted by NIPS 2024 +
+
+
+
+
+ + ♻ ☆ SynRS3D: A Synthetic Dataset for Global 3D Semantic Understanding from + Monocular Remote Sensing Imagery NeurIPS 2024 + + +
+ Global semantic 3D understanding from single-view high-resolution remote +sensing (RS) imagery is crucial for Earth Observation (EO). However, this task +faces significant challenges due to the high costs of annotations and data +collection, as well as geographically restricted data availability. To address +these challenges, synthetic data offer a promising solution by being easily +accessible and thus enabling the provision of large and diverse datasets. We +develop a specialized synthetic data generation pipeline for EO and introduce +SynRS3D, the largest synthetic RS 3D dataset. SynRS3D comprises 69,667 +high-resolution optical images that cover six different city styles worldwide +and feature eight land cover types, precise height information, and building +change masks. To further enhance its utility, we develop a novel multi-task +unsupervised domain adaptation (UDA) method, RS3DAda, coupled with our +synthetic dataset, which facilitates the RS-specific transition from synthetic +to real scenarios for land cover mapping and height estimation tasks, +ultimately enabling global monocular 3D semantic understanding based on +synthetic data. Extensive experiments on various real-world datasets +demonstrate the adaptability and effectiveness of our synthetic dataset and +proposed RS3DAda method. SynRS3D and related codes will be available. + +
+
+ comment: Accepted at NeurIPS 2024 as a Spotlight +
+
+
+
+
+ + ♻ ☆ $\texttt{NePhi}$: Neural Deformation Fields for Approximately + Diffeomorphic Medical Image Registration ECCV 2024 + + +
+ This work proposes NePhi, a generalizable neural deformation model which +results in approximately diffeomorphic transformations. In contrast to the +predominant voxel-based transformation fields used in learning-based +registration approaches, NePhi represents deformations functionally, leading to +great flexibility within the design space of memory consumption during training +and inference, inference time, registration accuracy, as well as transformation +regularity. Specifically, NePhi 1) requires less memory compared to voxel-based +learning approaches, 2) improves inference speed by predicting latent codes, +compared to current existing neural deformation based registration approaches +that \emph{only} rely on optimization, 3) improves accuracy via instance +optimization, and 4) shows excellent deformation regularity which is highly +desirable for medical image registration. We demonstrate the performance of +NePhi on a 2D synthetic dataset as well as for real 3D medical image datasets +(e.g., lungs and brains). Our results show that NePhi can match the accuracy of +voxel-based representations in a single-resolution registration setting. For +multi-resolution registration, our method matches the accuracy of current SOTA +learning-based registration approaches with instance optimization while +reducing memory requirements by a factor of five. Our code is available at +https://github.com/uncbiag/NePhi. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ SpikeGS: Learning 3D Gaussian Fields from Continuous Spike Stream ACCV 2024 + + +
+ A spike camera is a specialized high-speed visual sensor that offers +advantages such as high temporal resolution and high dynamic range compared to +conventional frame cameras. These features provide the camera with significant +advantages in many computer vision tasks. However, the tasks of 3D +reconstruction and novel view synthesis based on spike cameras remain +underdeveloped. Although there are existing methods for learning neural +radiance fields from spike stream, they either lack robustness in extremely +noisy, low-quality lighting conditions or suffer from high computational +complexity due to the deep fully connected neural networks and ray marching +rendering strategies used in neural radiance fields, making it difficult to +recover fine texture details. In contrast, the latest advancements in 3DGS have +achieved high-quality real-time rendering by optimizing the point cloud +representation into Gaussian ellipsoids. Building on this, we introduce +SpikeGS, the first method to learn 3D Gaussian fields solely from spike stream. +We designed a differentiable spike stream rendering framework based on 3DGS, +incorporating noise embedding and spiking neurons. By leveraging the multi-view +consistency of 3DGS and the tile-based multi-threaded parallel rendering +mechanism, we achieved high-quality real-time rendering results. Additionally, +we introduced a spike rendering loss function that generalizes under varying +illumination conditions. Our method can reconstruct view synthesis results with +fine texture details from a continuous spike stream captured by a moving spike +camera, while demonstrating high robustness in extremely noisy low-light +scenarios. Experimental results on both real and synthetic datasets demonstrate +that our method surpasses existing approaches in terms of rendering quality and +speed. Our code will be available at https://github.com/520jz/SpikeGS. + +
+
+ comment: Accepted by ACCV 2024. Project page: https://github.com/520jz/SpikeGS +
+
+
+
+
+ + ♻ ☆ Beyond Raw Videos: Understanding Edited Videos with Large Multimodal + Model + + +
+ The emerging video LMMs (Large Multimodal Models) have achieved significant +improvements on generic video understanding in the form of VQA (Visual Question +Answering), where the raw videos are captured by cameras. However, a large +portion of videos in real-world applications are edited videos, \textit{e.g.}, +users usually cut and add effects/modifications to the raw video before +publishing it on social media platforms. The edited videos usually have high +view counts but they are not covered in existing benchmarks of video LMMs, +\textit{i.e.}, ActivityNet-QA, or VideoChatGPT benchmark. In this paper, we +leverage the edited videos on a popular short video platform, \textit{i.e.}, +TikTok, and build a video VQA benchmark (named EditVid-QA) covering four +typical editing categories, i.e., effect, funny, meme, and game. Funny and meme +videos benchmark nuanced understanding and high-level reasoning, while effect +and game evaluate the understanding capability of artificial design. Most of +the open-source video LMMs perform poorly on the EditVid-QA benchmark, +indicating a huge domain gap between edited short videos on social media and +regular raw videos. To improve the generalization ability of LMMs, we collect a +training set for the proposed benchmark based on both Panda-70M/WebVid raw +videos and small-scale TikTok/CapCut edited videos, which boosts the +performance on the proposed EditVid-QA benchmark, indicating the effectiveness +of high-quality training data. We also identified a serious issue in the +existing evaluation protocol using the GPT-3.5 judge, namely a "sorry" attack, +where a sorry-style naive answer can achieve an extremely high rating from the +GPT judge, e.g., over 4.3 for correctness score on VideoChatGPT evaluation +protocol. To avoid the "sorry" attacks, we evaluate results with GPT-4 judge +and keyword filtering. The dataset is released at +https://github.com/XenonLamb/EditVid-QA. + +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations RSS + + +
+ Imitation learning provides an efficient way to teach robots dexterous +skills; however, learning complex skills robustly and generalizablely usually +consumes large amounts of human demonstrations. To tackle this challenging +problem, we present 3D Diffusion Policy (DP3), a novel visual imitation +learning approach that incorporates the power of 3D visual representations into +diffusion policies, a class of conditional action generative models. The core +design of DP3 is the utilization of a compact 3D visual representation, +extracted from sparse point clouds with an efficient point encoder. In our +experiments involving 72 simulation tasks, DP3 successfully handles most tasks +with just 10 demonstrations and surpasses baselines with a 24.2% relative +improvement. In 4 real robot tasks, DP3 demonstrates precise control with a +high success rate of 85%, given only 40 demonstrations of each task, and shows +excellent generalization abilities in diverse aspects, including space, +viewpoint, appearance, and instance. Interestingly, in real robot experiments, +DP3 rarely violates safety requirements, in contrast to baseline methods which +frequently do, necessitating human intervention. Our extensive evaluation +highlights the critical importance of 3D representations in real-world robot +learning. Videos, code, and data are available on +https://3d-diffusion-policy.github.io . + +
+
+ comment: Published at Robotics: Science and Systems (RSS) 2024. Videos, code, + and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ Simple Drop-in LoRA Conditioning on Attention Layers Will Improve Your + Diffusion Model + + +
+ Current state-of-the-art diffusion models employ U-Net architectures +containing convolutional and (qkv) self-attention layers. The U-Net processes +images while being conditioned on the time embedding input for each sampling +step and the class or caption embedding input corresponding to the desired +conditional generation. Such conditioning involves scale-and-shift operations +to the convolutional layers but does not directly affect the attention layers. +While these standard architectural choices are certainly effective, not +conditioning the attention layers feels arbitrary and potentially suboptimal. +In this work, we show that simply adding LoRA conditioning to the attention +layers without changing or tuning the other parts of the U-Net architecture +improves the image generation quality. For example, a drop-in addition of LoRA +conditioning to EDM diffusion model yields FID scores of 1.91/1.75 for +unconditional and class-conditional CIFAR-10 generation, improving upon the +baseline of 1.97/1.79. + +
+
+
+
+
+ + ♻ ☆ RoCOCO: Robustness Benchmark of MS-COCO to Stress-test Image-Text + Matching Models ECCV + + +
+ With the extensive use of vision-language models in various downstream tasks, +evaluating their robustness is crucial. In this paper, we propose a benchmark +for assessing the robustness of vision-language models. We believe that a +robust model should properly understand both linguistic and visual semantics +and be resilient to explicit variations. In pursuit of this goal, we create new +variants of texts and images in the MS-COCO test set and re-evaluate the +state-of-the-art (SOTA) models with the new data. Specifically, we alter the +meaning of text by replacing a word, and generate visually altered images that +maintain some visual context while introducing noticeable pixel changes through +image mixing techniques.Our evaluations on the proposed benchmark reveal +substantial performance degradation in many SOTA models (e.g., Image-to-Text +Recall@1: 81.9\% $\rightarrow$ 48.4\% in BLIP, 66.1\% $\rightarrow$ 37.6\% in +VSE$\infty$), with the models often favoring the altered texts/images over the +original ones. This indicates the current vision-language models struggle with +subtle changes and often fail to understand the overall context of texts and +images. Based on these findings, we propose semantic contrastive loss and +visual contrastive loss to learn more robust embedding. Datasets and code are +available at {\url{https://github.com/pseulki/rococo}}. + +
+
+ comment: Accepted to ECCV Synthetic Data for Computer Vision Workshop (Oral) +
+
+
+
+
+ + ♻ ☆ Segment Any Change NeurIPS 2024 + + +
+ Visual foundation models have achieved remarkable results in zero-shot image +classification and segmentation, but zero-shot change detection remains an open +problem. In this paper, we propose the segment any change models (AnyChange), a +new type of change detection model that supports zero-shot prediction and +generalization on unseen change types and data distributions. AnyChange is +built on the segment anything model (SAM) via our training-free adaptation +method, bitemporal latent matching. By revealing and exploiting intra-image and +inter-image semantic similarities in SAM's latent space, bitemporal latent +matching endows SAM with zero-shot change detection capabilities in a +training-free way. We also propose a point query mechanism to enable +AnyChange's zero-shot object-centric change detection capability. We perform +extensive experiments to confirm the effectiveness of AnyChange for zero-shot +change detection. AnyChange sets a new record on the SECOND benchmark for +unsupervised change detection, exceeding the previous SOTA by up to 4.4% F$_1$ +score, and achieving comparable accuracy with negligible manual annotations (1 +pixel per image) for supervised change detection. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+
+
+
+ + Systems and Control 32 + +
+
+
+ + ☆ Robust Deep Reinforcement Learning for Volt-VAR Optimization in Active + Distribution System under Uncertainty + + +
+ The deep reinforcement learning (DRL) based Volt-VAR optimization (VVO) +methods have been widely studied for active distribution networks (ADNs). +However, most of them lack safety guarantees in terms of power injection +uncertainties due to the increase in distributed energy resources (DERs) and +load demand, such as electric vehicles. This article proposes a robust deep +reinforcement learning (RDRL) framework for VVO via a robust deep deterministic +policy gradient (DDPG) algorithm. This algorithm can effectively manage hybrid +action spaces, considering control devices like capacitors, voltage regulators, +and smart inverters. Additionally, it is designed to handle uncertainties by +quantifying uncertainty sets with conformal prediction and modeling +uncertainties as adversarial attacks to guarantee safe exploration across +action spaces. Numerical results on three IEEE test cases demonstrate the +sample efficiency and safety of the proposed robust DDPG against uncertainties +compared to the benchmark algorithms. + +
+
+
+
+
+ + ☆ Robust and efficient data-driven predictive control + + +
+ We propose a robust and efficient data-driven predictive control (eDDPC) +scheme which is more sample efficient (requires less offline data) compared to +existing schemes, and is also computationally efficient. This is done by +leveraging an alternative data-based representation of the trajectories of +linear time-invariant (LTI) systems. The proposed scheme relies only on using +(short and potentially irregularly measured) noisy input-output data, the +amount of which is independent of the prediction horizon. To account for +measurement noise, we provide a novel result that quantifies the uncertainty +between the true (unknown) restricted behavior of the system and the estimated +one from noisy data. Furthermore, we show that the robust eDDPC scheme is +recursively feasible and that the resulting closed-loop system is practically +stable. Finally, we compare the performance of this scheme to existing ones on +a case study of a four tank system. + +
+
+ comment: 17 pages, 2 figures, submitted for Automatica +
+
+
+
+
+ + ☆ Safe Decentralized Multi-Agent Control using Black-Box Predictors, + Conformal Decision Policies, and Control Barrier Functions ICRA 2025 + + +
+ We address the challenge of safe control in decentralized multi-agent robotic +settings, where agents use uncertain black-box models to predict other agents' +trajectories. We use the recently proposed conformal decision theory to adapt +the restrictiveness of control barrier functions-based safety constraints based +on observed prediction errors. We use these constraints to synthesize +controllers that balance between the objectives of safety and task +accomplishment, despite the prediction errors. We provide an upper bound on the +average over time of the value of a monotonic function of the difference +between the safety constraint based on the predicted trajectories and the +constraint based on the ground truth ones. We validate our theory through +experimental results showing the performance of our controllers when navigating +a robot in the multi-agent scenes in the Stanford Drone Dataset. + +
+
+ comment: 6 pages, 1 figure, submitted for ICRA 2025 +
+
+
+
+
+ + ☆ Path Following Model Predictive Control of a Coupled Autonomous + Underwater Vehicle + + +
+ The operation of an autonomous underwater vehicle (AUV) faces challenges in +following predetermined waypoints due to coupled motions under environmental +disturbances. To address this, a 3D path following guidance and control system +is developed in this work based on the line-of-sight (LOS) guidance method. +Conventionally, the 3D path following problem is transformed into heading and +depth control problems, assuming that the motion of the vehicle is decoupled in +horizontal and depth coordinates. The proposed control system design avoids +this simplifying assumption by transforming the problem into a 3D position and +orientation tracking problem. This design is achieved by computing a 2D +horizontal coordinate based on the desired heading and then computing a +corresponding LOS depth coordinate. A model predictive controller (MPC) is then +implemented using the 3D LOS coordinate and the computed orientation vector. +The MPC obtains a robust control by solving a minimax optimisation problem +considering the effects of unknown ocean disturbances. The effectiveness of the +proposed guidance and control system is demonstrated through the simulation of +a prototype AUV system. Numerical results show that the AUV can follow +predetermined waypoints in the presence of time-varying disturbances, and the +system is steered at a constant surge speed that is proportional to the radius +of the circle of acceptance used to implement the guidance system. + +
+
+ comment: 6 pages, 4 figures, Presented at the IFAC CAMS 2024, Virginia, USA +
+
+
+
+
+ + ☆ Hierarchical Federated ADMM + + +
+ In this paper, we depart from the widely-used gradient descent-based +hierarchical federated learning (FL) algorithms to develop a novel hierarchical +FL framework based on the alternating direction method of multipliers (ADMM). +Within this framework, we propose two novel FL algorithms, which both use ADMM +in the top layer: one that employs ADMM in the lower layer and another that +uses the conventional gradient descent-based approach. The proposed framework +enhances privacy, and experiments demonstrate the superiority of the proposed +algorithms compared to the conventional algorithms in terms of learning +convergence and accuracy. Additionally, gradient descent on the lower layer +performs well even if the number of local steps is very limited, while ADMM on +both layers lead to better performance otherwise. + +
+
+
+
+
+ + ☆ Asymptotic tracking control of dynamic reference over homomorphically + encrypted data with finite modulus + + +
+ This paper considers a tracking control problem, in which the dynamic +controller is encrypted with an additively homomorphic encryption scheme and +the output of a process tracks a dynamic reference asymptotically. Our paper is +motivated by the following problem: When dealing with both asymptotic tracking +and dynamic reference, we find that the control input is generally subject to +overflow issues under a finite modulus, though the dynamic controller consists +of only integer coefficients. First, we provide a new controller design method +such that the coefficients of the tracking controller can be transformed into +integers leveraging the zooming-in factor of dynamic quantization. + By the Cayley-Hamilton theorem, we represent the control input as linear +combination of the previous control inputs. Leveraging the property above, we +design an algorithm on the actuator side such that it can restore the control +input from the lower bits under a finite modulus. A lower bound of the modulus +is also provided. + As an extension of the first result, we further solve the problem of +unbounded internal state taking place in the actuator. In particular, the +actuator can restore the correct control input under the same modulus. + A simulation example is provided to verify the control schemes proposed in +our paper. + +
+
+
+
+
+ + ☆ Learning from Demonstration with Implicit Nonlinear Dynamics Models + + +
+ Learning from Demonstration (LfD) is a useful paradigm for training policies +that solve tasks involving complex motions. In practice, the successful +application of LfD requires overcoming error accumulation during policy +execution, i.e. the problem of drift due to errors compounding over time and +the consequent out-of-distribution behaviours. Existing works seek to address +this problem through scaling data collection, correcting policy errors with a +human-in-the-loop, temporally ensembling policy predictions or through learning +the parameters of a dynamical system model. In this work, we propose and +validate an alternative approach to overcoming this issue. Inspired by +reservoir computing, we develop a novel neural network layer that includes a +fixed nonlinear dynamical system with tunable dynamical properties. We validate +the efficacy of our neural network layer on the task of reproducing human +handwriting motions using the LASA Human Handwriting Dataset. Through empirical +experiments we demonstrate that incorporating our layer into existing neural +network architectures addresses the issue of compounding errors in LfD. +Furthermore, we perform a comparative evaluation against existing approaches +including a temporal ensemble of policy predictions and an Echo State Networks +(ESNs) implementation. We find that our approach yields greater policy +precision and robustness on the handwriting task while also generalising to +multiple dynamics regimes and maintaining competitive latency scores. + +
+
+ comment: 21 pages, 9 figures +
+
+
+
+
+ + ☆ Dual Pricing to Prioritize Renewable Energy and Consumer Preferences in + Electricity Markets + + +
+ Electricity markets currently fail to incorporate preferences of buyers, +treating polluting and renewable energy sources as having equal social benefit +under a system of uniform clearing prices. Meanwhile, renewable energy is prone +to curtailment due to transmission constraints, forcing grid operators to +reduce or shut down renewable energy production despite its availability and +need. This paper proposes a ``dual pricing mechanism" which allows buyers to +bid both their willingness to pay for electricity, and additionally, their +preference for green energy. Designed for use in deregulated electricity +markets, this mechanism prioritizes the dispatch of more renewable energy +sources according to consumer preferences. Traditional uniform clearing prices, +which treat all energy sources equally, do not reflect the growing share of +green energy in the power grid and the environmental values of consumers. By +allowing load-serving entities to bid their willingness to pay for renewable +energy directly into the clearing market, our proposed framework generates +distinct pricing signals for green and ``black" electricity. + +
+
+
+
+
+ + ☆ Transparency evaluation for the Kinematic Design of the Harnesses + through Human-Exoskeleton Interaction Modeling + + +
+ Lower Limb Exoskeletons (LLEs) are wearable robots that provide mechanical +power to the user. Human-exoskeleton (HE) connections must preserve the user's +natural behavior during the interaction, avoiding undesired forces. Therefore, +numerous works focus on their minimization. Given the inherent complications of +repeatedly prototyping and experimentally testing a device, modeling the +exoskeleton and its physical interaction with the user emerges as a valuable +approach for assessing the design effects. This paper proposes a novel method +to compare different exoskeleton configurations with a flexible simulation +tool. This approach contemplates simulating the dynamics of the device, +including its interaction with the wearer, to evaluate multiple connection +mechanism designs along with the kinematics and actuation of the LLE. This +evaluation is based on the minimization of the interaction wrenches through an +optimization process that includes the impedance parameters at the interfaces +as optimization variables and the similarity of the LLE's joint variables +trajectories with the motion of the wearer's articulations. Exploratory tests +are conducted using the Wearable Walker LLE in different configurations and +measuring the interaction forces. Experimental data are then compared to the +optimization outcomes, proving that the proposed method provides contact wrench +estimations consistent with the collected measurements and previous outcomes +from the literature. Copyright 2024 IEEE. Personal use of this material is +permitted. Permission from IEEE must be obtained for all other uses, in any +current or future media, including reprinting/republishing this material for +advertising or promotional purposes, creating new collective works, for resale +or redistribution to servers or lists, or reuse of any copyrighted component of +this work in other works. + +
+
+
+
+
+ + ☆ A History-Guided Regional Partitioning Evolutionary Optimization for + Solving the Flexible Job Shop Problem with Limited Multi-load Automated + Guided Vehicles + + +
+ In a flexible job shop environment, using Automated Guided Vehicles (AGVs) to +transport jobs and process materials is an important way to promote the +intelligence of the workshop. Compared with single-load AGVs, multi-load AGVs +can improve AGV utilization, reduce path conflicts, etc. Therefore, this study +proposes a history-guided regional partitioning algorithm (HRPEO) for the +flexible job shop scheduling problem with limited multi-load AGVs (FJSPMA). +First, the encoding and decoding rules are designed according to the +characteristics of multi-load AGVs, and then the initialization rule based on +the branch and bound method is used to generate the initial population. Second, +to prevent the algorithm from falling into a local optimum, the algorithm +adopts a regional partitioning strategy. This strategy divides the solution +space into multiple regions and measures the potential of the regions. After +that, cluster the regions into multiple clusters in each iteration, and selects +individuals for evolutionary search based on the set of clusters. Third, a +local search strategy is designed to improve the exploitation ability of the +algorithm, which uses a greedy approach to optimize machines selection and +transportation sequence according to the characteristics of FJSPMA. Finally, a +large number of experiments are carried out on the benchmarks to test the +performance of the algorithm. Compared with multiple advanced algorithms, the +results show that the HRPEO has a better advantage in solving FJSPMA. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ On Adaptive Frequency Sampling for Data-driven MOR Applied to Antenna + Responses + + +
+ Frequency domain sweeps of array antennas are well-known to be +time-intensive, and different surrogate models have been used to improve the +performance. Data-driven model order reduction algorithms, such as the Loewner +framework and vector fitting, can be integrated with these adaptive error +estimates, in an iterative algorithm, to reduce the number of full-wave +simulations required to accurately capture the requested frequency behavior of +multiport array antennas. In this work, we propose two novel adaptive methods +exploiting a block matrix function which is a key part of the Loewner framework +generating system approach. The first algorithm leverages an inherent matrix +parameter freedom in the block matrix function to identify frequency points +with large errors, whereas the second utilizes the condition number of the +block matrix function. Both methods effectively provide frequency domain error +estimates, essential for improved performance. Numerical experiments on +multiport array antenna S-parameters demonstrate the effectiveness of our +proposed algorithms within the Loewner framework. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ☆ Pseudometrics for scalable data-driven comparisons of nonlinear + dynamical systems + + +
+ Novel solutions for pseudometrics quantifying deviation from topological +conjugacy between dynamical systems are presented. Deviation from conjugacy is +quantified in a Pareto optimal sense that accounts for spectral properties of +Koopman operators as well as trajectory geometry. Theoretical justification is +provided for computing such pseudometrics in Koopman eigenfunction space rather +than observable space. Furthermore, it is shown deriving the pseudometrics from +unitary transformations is sufficient to recover a value of zero if two systems +are topologically conjugate. Therefore the pseudometrics for quantifying +deviation from conjugacy are based on analytical solutions for unitary +transformations in Koopman eigenfunction space. Finally, geometric +considerations for the Pareto optimality problem associated with deviation from +conjugacy are used to develop pseudometrics that account for all possible +solutions given just two Pareto points based on analytical solutions. + +
+
+
+
+
+ + ☆ Impact of number of elements on the directivity of planar array of + monopole antenna + + +
+ This research investigates how the number of elements affects the monopole +antenna's planar array's directivity. This study also takes into account the +antenna's effect on the whole field it radiates. The monopole antennas are +arranged in a planar configuration with all the components in their proper +locations using the Hadamard matrix approach. Each matrix's directivities and +array factors were calculated, and a MATLAB tool was used to simulate the +radiation pattern. A range of elements from 4 X 4 to 50 X 50 planar layouts +were taken into consideration during the investigation. Increasing the number +of elements improves the directivity. Increasing the number of elements in the +planar array resulted in a great improvement in directivity, as seen by the +computed and simulated results. Consequently, by increasing the antenna's +directivity, a greater number of elements influences the overall field emitted. + +
+
+ comment: 8 pages, 19 Figures, article +
+
+
+
+
+ + ☆ Pseudo-kinematic trajectory control of tracked vehicles + + +
+ Tracked vehicles are used in complex scenarios, where motion planning and +navigation can be very complex. They have complex dynamics, with many +parameters that are difficult to identify and that change significantly based +on the operating conditions. We propose a simple pseudo-kinematic model, where +the intricate dynamic effects underlying the vehicle's motion are captured in a +small set of velocity-dependent parameters. This choice enables the development +of a Lyapunov-based trajectory controller with guaranteed performance and small +computation time. We demonstrate the correctness of our approach with both +simulation and experimental data. + +
+
+
+
+
+ + ☆ Towards Event-Triggered NMPC for Efficient 6G Communications: + Experimental Results and Open Problems + + +
+ Networked control systems enable real-time control and coordination of +distributed systems, leveraging the low latency, high reliability, and massive +connectivity offered by 5G and future 6G networks. Applications include +autonomous vehicles, robotics, industrial automation, and smart grids. Despite +networked control algorithms admitting nominal stability guarantees even in the +presence of delays and packet dropouts, their practical performance still +heavily depends on the specific characteristics and conditions of the +underlying network. To achieve the desired performance while efficiently using +communication resources, co-design of control and communication is pivotal. +Although periodic schemes, where communication instances are fixed, can provide +reliable control performance, unnecessary transmissions, when updates are not +needed, result in inefficient usage of network resources. In this paper, we +investigate the potential for co-design of model predictive control and network +communication. To this end, we design and implement an event-triggered +nonlinear model predictive controller for stabilizing a Furuta pendulum +communicating over a tailored open radio access network 6G research platform. +We analyze the control performance as well as network utilization under varying +channel conditions and event-triggering criteria. Our results show that the +event-triggered control scheme achieves similar performance to periodic control +with reduced communication demand. + +
+
+
+
+
+ + ☆ Analysis of Truncated Singular Value Decomposition for Koopman + Operator-Based Lane Change Model + + +
+ Understanding and modeling complex dynamic systems is crucial for enhancing +vehicle performance and safety, especially in the context of autonomous +driving. Recently, popular methods such as Koopman operators and their +approximators, known as Extended Dynamic Mode Decomposition (EDMD), have +emerged for their effectiveness in transforming strongly nonlinear system +behavior into linear representations. This allows them to be integrated with +conventional linear controllers. To achieve this, Singular Value Decomposition +(SVD), specifically truncated SVD, is employed to approximate Koopman operators +from extensive datasets efficiently. This study evaluates different basis +functions used in EDMD and ranks for truncated SVD for representing lane change +behavior models, aiming to balance computational efficiency with information +loss. The findings, however, suggest that the technique of truncated SVD does +not necessarily achieve substantial reductions in computational training time +and results in significant information loss. + +
+
+ comment: Submitted to the 21st International Conference on Informatics in + Control, Automation and Robotics (ICINCO 2024) +
+
+
+
+
+ + ☆ Unscented Transform-based Pure Pursuit Path-Tracking Algorithm under + Uncertainty + + +
+ Automated driving has become more and more popular due to its potential to +eliminate road accidents by taking over driving tasks from humans. One of the +remaining challenges is to follow a planned path autonomously, especially when +uncertainties in self-localizing or understanding the surroundings can +influence the decisions made by autonomous vehicles, such as calculating how +much they need to steer to minimize tracking errors. In this paper, a modified +geometric pure pursuit path-tracking algorithm is proposed, taking into +consideration such uncertainties using the unscented transform. The algorithm +is tested through simulations for typical road geometries, such as straight and +circular lines. + +
+
+ comment: Submitted to the 21st International Conference on Informatics in + Control, Automation and Robotics (ICINCO 2024) +
+
+
+
+
+ + ☆ CaΣoS: A nonlinear sum-of-squares optimization suite + + +
+ We present Ca{\Sigma}oS, the first MATLAB software specifically designed for +nonlinear sum-of-squares optimization. A symbolic polynomial algebra system +allows to formulate parametrized sum-of-squares optimization problems and +facilitates their fast, repeated evaluations. To that extent, we make use of +CasADi's symbolic framework and realize concepts of monomial sparsity, linear +operators (including duals), and functions between polynomials. Ca{\Sigma}oS +currently provides interfaces to the conic solvers SeDuMi, Mosek, and SCS as +well as methods to solve quasiconvex optimization problems (via bisection) and +nonconvex optimization problems (via sequential convexification). Numerical +examples for benchmark problems including region-of-attraction and reachable +set estimation for nonlinear dynamic systems demonstrate significant +improvements in computation time compared to existing toolboxes.. Ca{\Sigma}oS +is available open-source at https://github.com/ ifr-acso/casos. + +
+
+ comment: Submitted to 2025 American Control Conference +
+
+
+
+
+ + ☆ Adaptive Knowledge-based Multi-Objective Evolutionary Algorithm for + Hybrid Flow Shop Scheduling Problems with Multiple Parallel Batch Processing + Stages + + +
+ Parallel batch processing machines have extensive applications in the +semiconductor manufacturing process. However, the problem models in previous +studies regard parallel batch processing as a fixed processing stage in the +machining process. This study generalizes the problem model, in which users can +arbitrarily set certain stages as parallel batch processing stages according to +their needs. A Hybrid Flow Shop Scheduling Problem with Parallel Batch +Processing Machines (PBHFSP) is solved in this paper. Furthermore, an Adaptive +Knowledge-based Multi-Objective Evolutionary Algorithm (AMOEA/D) is designed to +simultaneously optimize both makespan and Total Energy Consumption (TEC). +Firstly, a hybrid initialization strategy with heuristic rules based on +knowledge of PBHFSP is proposed to generate promising solutions. Secondly, the +disjunctive graph model has been established based on the knowledge to find the +critical-path of PBHFS. Then, a critical-path based neighborhood search is +proposed to enhance the exploitation ability of AMOEA/D. Moreover, the search +time is adaptively adjusted based on learning experience from Q-learning and +Decay Law. Afterward, to enhance the exploration capability of the algorithm, +AMOEA/D designs an improved population updating strategy with a weight vector +updating strategy. These strategies rematch individuals with weight vectors, +thereby maintaining the diversity of the population. Finally, the proposed +algorithm is compared with state-of-the-art algorithms. The experimental +results show that the AMOEA/D is superior to the comparison algorithms in +solving the PBHFSP. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ CurricuLLM: Automatic Task Curricula Design for Learning Complex Robot + Skills using Large Language Models ICRA 2025 + + +
+ Curriculum learning is a training mechanism in reinforcement learning (RL) +that facilitates the achievement of complex policies by progressively +increasing the task difficulty during training. However, designing effective +curricula for a specific task often requires extensive domain knowledge and +human intervention, which limits its applicability across various domains. Our +core idea is that large language models (LLMs), with their extensive training +on diverse language data and ability to encapsulate world knowledge, present +significant potential for efficiently breaking down tasks and decomposing +skills across various robotics environments. Additionally, the demonstrated +success of LLMs in translating natural language into executable code for RL +agents strengthens their role in generating task curricula. In this work, we +propose CurricuLLM, which leverages the high-level planning and programming +capabilities of LLMs for curriculum design, thereby enhancing the efficient +learning of complex target tasks. CurricuLLM consists of: (Step 1) Generating +sequence of subtasks that aid target task learning in natural language form, +(Step 2) Translating natural language description of subtasks in executable +task code, including the reward code and goal distribution code, and (Step 3) +Evaluating trained policies based on trajectory rollout and subtask +description. We evaluate CurricuLLM in various robotics simulation +environments, ranging from manipulation, navigation, and locomotion, to show +that CurricuLLM can aid learning complex robot control tasks. In addition, we +validate humanoid locomotion policy learned through CurricuLLM in real-world. +The code is provided in https://github.com/labicon/CurricuLLM + +
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ iWalker: Imperative Visual Planning for Walking Humanoid Robot + + +
+ Humanoid robots, with the potential to perform a broad range of tasks in +environments designed for humans, have been deemed crucial for the basis of +general AI agents. When talking about planning and controlling, although +traditional models and task-specific methods have been extensively studied over +the past few decades, they are inadequate for achieving the flexibility and +versatility needed for general autonomy. Learning approaches, especially +reinforcement learning, are powerful and popular nowadays, but they are +inherently "blind" during training, relying heavily on trials in simulation +without proper guidance from physical principles or underlying dynamics. In +response, we propose a novel end-to-end pipeline that seamlessly integrates +perception, planning, and model-based control for humanoid robot walking. We +refer to our method as iWalker, which is driven by imperative learning (IL), a +self-supervising neuro-symbolic learning framework. This enables the robot to +learn from arbitrary unlabeled data, significantly improving its adaptability +and generalization capabilities. In experiments, iWalker demonstrates +effectiveness in both simulated and real-world environments, representing a +significant advancement toward versatile and autonomous humanoid robots. + +
+
+
+
+
+ + ♻ ☆ Diffusion Models for Intelligent Transportation Systems: A Survey + + +
+ Intelligent Transportation Systems (ITS) are vital in modern traffic +management and optimization, significantly enhancing traffic efficiency and +safety. Recently, diffusion models have emerged as transformative tools for +addressing complex challenges within ITS. In this paper, we present a +comprehensive survey of diffusion models for ITS, covering both theoretical and +practical aspects. First, we introduce the theoretical foundations of diffusion +models and their key variants, including conditional diffusion models and +latent diffusion models, highlighting their suitability for modeling complex, +multi-modal traffic data and enabling controllable generation. Second, we +outline the primary challenges in ITS and the corresponding advantages of +diffusion models, providing readers with a deeper understanding of the +intersection between ITS and diffusion models. Third, we offer a +multi-perspective investigation of current applications of diffusion models in +ITS domains, including autonomous driving, traffic simulation, trajectory +prediction, and traffic safety. Finally, we discuss state-of-the-art diffusion +model techniques and highlight key ITS research directions that warrant further +investigation. Through this structured overview, we aim to provide researchers +with a comprehensive understanding of diffusion models for ITS, thereby +advancing their future applications in the transportation domain. + +
+
+ comment: 7 figures +
+
+
+
+
+ + ♻ ☆ Enabling On-Chip High-Frequency Adaptive Linear Optimal Control via + Linearized Gaussian Process + + +
+ Unpredictable and complex aerodynamic effects pose significant challenges to +achieving precise flight control, such as the downwash effect from upper +vehicles to lower ones. Conventional methods often struggle to accurately model +these interactions, leading to controllers that require large safety margins +between vehicles. Moreover, the controller on real drones usually requires +high-frequency and has limited on-chip computation, making the adaptive control +design more difficult to implement. To address these challenges, we incorporate +Gaussian process (GP) to model the adaptive external aerodynamics with linear +model predictive control. The GP is linearized to enable real-time +high-frequency solutions. Moreover, to handle the error caused by +linearization, we integrate end-to-end Bayesian optimization during sample +collection stages to improve the control performance. Experimental results on +both simulations and real quadrotors show that we can achieve real-time +solvable computation speed with acceptable tracking errors. + +
+
+
+
+
+ + ♻ ☆ Experimenting with Adaptive Bitrate Algorithms for Virtual Reality + Streaming over Wi-Fi + + +
+ Interactive Virtual Reality (VR) streaming over Wi-Fi networks encounters +significant challenges due to bandwidth fluctuations caused by channel +contention and user mobility. Adaptive BitRate (ABR) algorithms dynamically +adjust the video encoding bitrate based on the available network capacity, +aiming to maximize image quality while mitigating congestion and preserving the +user's Quality of Experience (QoE). In this paper, we experiment with ABR +algorithms for VR streaming using Air Light VR (ALVR), an open-source VR +streaming solution. We extend ALVR with a comprehensive set of metrics that +provide a robust characterization of the network's state, enabling more +informed bitrate adjustments. To demonstrate the utility of these performance +indicators, we develop and test the Network-aware Step-wise ABR algorithm for +VR streaming (NeSt-VR). Results validate the accuracy of the newly implemented +network performance metrics and demonstrate NeSt-VR's video bitrate adaptation +capabilities. + +
+
+
+
+
+ + ♻ ☆ Distributed Model Predictive Control for Piecewise Affine Systems Based + on Switching ADMM + + +
+ This paper presents a novel approach for distributed model predictive control +(MPC) for piecewise affine (PWA) systems. Existing approaches rely on solving +mixed-integer optimization problems, requiring significant computation power or +time. We propose a distributed MPC scheme that requires solving only convex +optimization problems. The key contribution is a novel method, based on the +alternating direction method of multipliers, for solving the non-convex optimal +control problem that arises due to the PWA dynamics. We present a distributed +MPC scheme, leveraging this method, that explicitly accounts for the coupling +between subsystems by reaching agreement on the values of coupled states. +Stability and recursive feasibility are shown under additional assumptions on +the underlying system. Two numerical examples are provided, in which the +proposed controller is shown to significantly improve the CPU time and +closed-loop performance over existing state-of-the-art approaches. + +
+
+ comment: 15 pages, 9 figures, submitted to IEEE Transactions on Automatic + Control, code available at + https://github.com/SamuelMallick/stable-dmpc-pwa/tree/paper_2024 and + https://github.com/SamuelMallick/hybrid-vehicle-platoon/tree/paper-2024 +
+
+
+
+
+ + ♻ ☆ Learning to Boost the Performance of Stable Nonlinear Systems + + +
+ The growing scale and complexity of safety-critical control systems +underscore the need to evolve current control architectures aiming for the +unparalleled performances achievable through state-of-the-art optimization and +machine learning algorithms. However, maintaining closed-loop stability while +boosting the performance of nonlinear control systems using data-driven and +deep-learning approaches stands as an important unsolved challenge. In this +paper, we tackle the performance-boosting problem with closed-loop stability +guarantees. Specifically, we establish a synergy between the Internal Model +Control (IMC) principle for nonlinear systems and state-of-the-art +unconstrained optimization approaches for learning stable dynamics. Our methods +enable learning over arbitrarily deep neural network classes of +performance-boosting controllers for stable nonlinear systems; crucially, we +guarantee L_p closed-loop stability even if optimization is halted prematurely, +and even when the ground-truth dynamics are unknown, with vanishing +conservatism in the class of stabilizing policies as the model uncertainty is +reduced to zero. We discuss the implementation details of the proposed control +schemes, including distributed ones, along with the corresponding optimization +procedures, demonstrating the potential of freely shaping the cost functions +through several numerical experiments. + +
+
+
+
+
+ + ♻ ☆ TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and + Proprioception Estimation CoRL 2024 + + +
+ Legged navigation is typically examined within open-world, off-road, and +challenging environments. In these scenarios, estimating external disturbances +requires a complex synthesis of multi-modal information. This underlines a +major limitation in existing works that primarily focus on avoiding obstacles. +In this work, we propose TOP-Nav, a novel legged navigation framework that +integrates a comprehensive path planner with Terrain awareness, Obstacle +avoidance and close-loop Proprioception. TOP-Nav underscores the synergies +between vision and proprioception in both path and motion planning. Within the +path planner, we present and integrate a terrain estimator that enables the +robot to select waypoints on terrains with higher traversability while +effectively avoiding obstacles. In the motion planning level, we not only +implement a locomotion controller to track the navigation commands, but also +construct a proprioception advisor to provide motion evaluations for the path +planner. Based on the close-loop motion feedback, we make online corrections +for the vision-based terrain and obstacle estimations. Consequently, TOP-Nav +achieves open-world navigation that the robot can handle terrains or +disturbances beyond the distribution of prior knowledge and overcomes +constraints imposed by visual conditions. Building upon extensive experiments +conducted in both simulation and real-world environments, TOP-Nav demonstrates +superior performance in open-world navigation compared to existing methods. + +
+
+ comment: Published on CoRL 2024 +
+
+
+
+
+ + ♻ ☆ SustainDC -- Benchmarking for Sustainable Data Center Control NeurIPS 2024 + + +
+ Machine learning has driven an exponential increase in computational demand, +leading to massive data centers that consume significant amounts of energy and +contribute to climate change. This makes sustainable data center control a +priority. In this paper, we introduce SustainDC, a set of Python environments +for benchmarking multi-agent reinforcement learning (MARL) algorithms for data +centers (DC). SustainDC supports custom DC configurations and tasks such as +workload scheduling, cooling optimization, and auxiliary battery management, +with multiple agents managing these operations while accounting for the effects +of each other. We evaluate various MARL algorithms on SustainDC, showing their +performance across diverse DC designs, locations, weather conditions, grid +carbon intensity, and workload requirements. Our results highlight significant +opportunities for improvement of data center operations using MARL algorithms. +Given the increasing use of DC due to AI, SustainDC provides a crucial platform +for the development and benchmarking of advanced algorithms essential for +achieving sustainable computing and addressing other heterogeneous real-world +challenges. + +
+
+ comment: Under review at Advances in Neural Information Processing Systems + 2024 (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Constraint-Guided Online Data Selection for Scalable Data-Driven Safety + Filters in Uncertain Robotic Systems + + +
+ As the use of autonomous robots expands in tasks that are complex and +challenging to model, the demand for robust data-driven control methods that +can certify safety and stability in uncertain conditions is increasing. +However, the practical implementation of these methods often faces scalability +issues due to the growing amount of data points with system complexity, and a +significant reliance on high-quality training data. In response to these +challenges, this study presents a scalable data-driven controller that +efficiently identifies and infers from the most informative data points for +implementing data-driven safety filters. Our approach is grounded in the +integration of a model-based certificate function-based method and Gaussian +Process (GP) regression, reinforced by a novel online data selection algorithm +that reduces time complexity from quadratic to linear relative to dataset size. +Empirical evidence, gathered from successful real-world cart-pole swing-up +experiments and simulated locomotion of a five-link bipedal robot, demonstrates +the efficacy of our approach. Our findings reveal that our efficient online +data selection algorithm, which strategically selects key data points, enhances +the practicality and efficiency of data-driven certifying filters in complex +robotic systems, significantly mitigating scalability concerns inherent in +nonparametric learning-based control methods. + +
+
+ comment: The first three authors contributed equally to the work. This work + has been submitted to the IEEE for possible publication. Copyright may be + transferred without notice, after which this version may no longer be + accessible +
+
+
+
+
+ + ♻ ☆ MARec: Metadata Alignment for cold-start Recommendation + + +
+ For many recommender systems, the primary data source is a historical record +of user clicks. The associated click matrix is often very sparse, as the number +of users x products can be far larger than the number of clicks. Such sparsity +is accentuated in cold-start settings, which makes the efficient use of +metadata information of paramount importance. In this work, we propose a simple +approach to address cold-start recommendations by leveraging content metadata, +Metadata Alignment for cold-start Recommendation. We show that this approach +can readily augment existing matrix factorization and autoencoder approaches, +enabling a smooth transition to top performing algorithms in warmer set-ups. +Our experimental results indicate three separate contributions: first, we show +that our proposed framework largely beats SOTA results on 4 cold-start datasets +with different sparsity and scale characteristics, with gains ranging from ++8.4% to +53.8% on reported ranking metrics; second, we provide an ablation +study on the utility of semantic features, and proves the additional gain +obtained by leveraging such features ranges between +46.8% and +105.5%; and +third, our approach is by construction highly competitive in warm set-ups, and +we propose a closed-form solution outperformed by SOTA results by only 0.8% on +average. + +
+
+
+
+
+ + ♻ ☆ On Game Based Distributed Decision Approach for Multi-agent Optimal + Coverage Problem with Application to Constellations Reconfiguration + + +
+ This paper focuses on the optimal coverage problem (OCP) for multi-agent +systems with decentralized optimization. A game based distributed decision +approach for the the multi-agent OCP is proposed. The equivalence between the +equilibrium of the game and the extreme value of the global performance +objective is strictly proved. Then, a distributed algorithm only using local +information to obtain the global near-optimal coverage is developed, and its +convergence is proved. Finally, the proposed method is applied to maximize the +covering time of a satellite constellation for a target. The simulation results +under different scenarios show our method costs much less computation time +under some level index than traditional centralized optimization. + +
+
+ comment: 11 pages,11 figures +
+
+
+
+
+ + ♻ ☆ Personalised Outfit Recommendation via History-aware Transformers + + +
+ We present the history-aware transformer (HAT), a transformer-based model +that uses shoppers' purchase history to personalise outfit predictions. The aim +of this work is to recommend outfits that are internally coherent while +matching an individual shopper's style and taste. To achieve this, we stack two +transformer models, one that produces outfit representations and another one +that processes the history of purchased outfits for a given shopper. We use +these models to score an outfit's compatibility in the context of a shopper's +preferences as inferred from their previous purchases. During training, the +model learns to discriminate between purchased and random outfits using 3 +losses: the focal loss for outfit compatibility typically used in the +literature, a contrastive loss to bring closer learned outfit embeddings from a +shopper's history, and an adaptive margin loss to facilitate learning from weak +negatives. Together, these losses enable the model to make personalised +recommendations based on a shopper's purchase history. + Our experiments on the IQON3000 and Polyvore datasets show that HAT +outperforms strong baselines on the outfit Compatibility Prediction (CP) and +the Fill In The Blank (FITB) tasks. The model improves AUC for the CP hard task +by 15.7% (IQON3000) and 19.4% (Polyvore) compared to previous SOTA results. It +further improves accuracy on the FITB hard task by 6.5% and 9.7%, respectively. +We provide ablation studies on the personalisation, constrastive loss, and +adaptive margin loss that highlight the importance of these modelling choices. + +
+
+
+
+
+
+
+
+ + Machine Learning 151 + +
+
+
+ + ☆ PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation ECCV 2024 + + +
+ We present PhysGen, a novel image-to-video generation method that converts a +single image and an input condition (e.g., force and torque applied to an +object in the image) to produce a realistic, physically plausible, and +temporally consistent video. Our key insight is to integrate model-based +physical simulation with a data-driven video generation process, enabling +plausible image-space dynamics. At the heart of our system are three core +components: (i) an image understanding module that effectively captures the +geometry, materials, and physical parameters of the image; (ii) an image-space +dynamics simulation model that utilizes rigid-body physics and inferred +parameters to simulate realistic behaviors; and (iii) an image-based rendering +and refinement module that leverages generative video diffusion to produce +realistic video footage featuring the simulated motion. The resulting videos +are realistic in both physics and appearance and are even precisely +controllable, showcasing superior results over existing data-driven +image-to-video generation works through quantitative comparison and +comprehensive user study. PhysGen's resulting videos can be used for various +downstream applications, such as turning an image into a realistic animation or +allowing users to interact with the image and create various dynamics. Project +page: https://stevenlsw.github.io/physgen/ + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://stevenlsw.github.io/physgen/ +
+
+
+
+
+ + ☆ Exploring Token Pruning in Vision State Space Models NeurIPS'24 + + +
+ State Space Models (SSMs) have the advantage of keeping linear computational +complexity compared to attention modules in transformers, and have been applied +to vision tasks as a new type of powerful vision foundation model. Inspired by +the observations that the final prediction in vision transformers (ViTs) is +only based on a subset of most informative tokens, we take the novel step of +enhancing the efficiency of SSM-based vision models through token-based +pruning. However, direct applications of existing token pruning techniques +designed for ViTs fail to deliver good performance, even with extensive +fine-tuning. To address this issue, we revisit the unique computational +characteristics of SSMs and discover that naive application disrupts the +sequential token positions. This insight motivates us to design a novel and +general token pruning method specifically for SSM-based vision models. We first +introduce a pruning-aware hidden state alignment method to stabilize the +neighborhood of remaining tokens for performance enhancement. Besides, based on +our detailed analysis, we propose a token importance evaluation method adapted +for SSM models, to guide the token pruning. With efficient implementation and +practical acceleration methods, our method brings actual speedup. Extensive +experiments demonstrate that our approach can achieve significant computation +reduction with minimal impact on performance across different tasks. Notably, +we achieve 81.7\% accuracy on ImageNet with a 41.6\% reduction in the FLOPs for +pruned PlainMamba-L3. Furthermore, our work provides deeper insights into +understanding the behavior of SSM-based vision models for future research. + +
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ☆ $O(d/T)$ Convergence Theory for Diffusion Probabilistic Models under + Minimal Assumptions + + +
+ Score-based diffusion models, which generate new data by learning to reverse +a diffusion process that perturbs data from the target distribution into noise, +have achieved remarkable success across various generative tasks. Despite their +superior empirical performance, existing theoretical guarantees are often +constrained by stringent assumptions or suboptimal convergence rates. In this +paper, we establish a fast convergence theory for a popular SDE-based sampler +under minimal assumptions. Our analysis shows that, provided +$\ell_{2}$-accurate estimates of the score functions, the total variation +distance between the target and generated distributions is upper bounded by +$O(d/T)$ (ignoring logarithmic factors), where $d$ is the data dimensionality +and $T$ is the number of steps. This result holds for any target distribution +with finite first-order moment. To our knowledge, this improves upon existing +convergence theory for both the SDE-based sampler and another ODE-based +sampler, while imposing minimal assumptions on the target data distribution and +score estimates. This is achieved through a novel set of analytical tools that +provides a fine-grained characterization of how the error propagates at each +step of the reverse process. + +
+
+
+
+
+ + ☆ LML: Language Model Learning a Dataset for Data-Augmented Prediction + + +
+ This paper introduces a new approach to using Large Language Models (LLMs) +for classification tasks, which are typically handled using Machine Learning +(ML) models. Unlike ML models that rely heavily on data cleaning and feature +engineering, this method streamlines the process using LLMs. This paper +proposes a new concept called "Language Model Learning (LML)" powered by a new +method called "Data-Augmented Prediction (DAP)". The classification is +performed by LLMs using a method similar to humans manually exploring and +understanding the data and deciding classifications using data as a reference. +Training data is summarized and evaluated to determine the features that lead +to the classification of each label the most. In the process of DAP, the system +uses the data summary to automatically create a query, which is used to +retrieve relevant rows from the dataset. A classification is generated by the +LLM using data summary and relevant rows, ensuring satisfactory accuracy even +with complex data. Usage of data summary and similar data in DAP ensures +context-aware decision-making. The proposed method uses the words "Act as an +Explainable Machine Learning Model" in the prompt to enhance the +interpretability of the predictions by allowing users to review the logic +behind each prediction. In some test cases, the system scored an accuracy above +90%, proving the effectiveness of the system and its potential to outperform +conventional ML models in various scenarios. The code is available at +https://github.com/Pro-GenAI/LML-DAP + +
+
+ comment: First version +
+
+
+
+
+ + ☆ RepairBench: Leaderboard of Frontier Models for Program Repair + + +
+ AI-driven program repair uses AI models to repair buggy software by producing +patches. Rapid advancements in AI surely impact state-of-the-art performance of +program repair. Yet, grasping this progress requires frequent and standardized +evaluations. We propose RepairBench, a novel leaderboard for AI-driven program +repair. The key characteristics of RepairBench are: 1) it is execution-based: +all patches are compiled and executed against a test suite, 2) it assesses +frontier models in a frequent and standardized way. RepairBench leverages two +high-quality benchmarks, Defects4J and GitBug-Java, to evaluate frontier models +against real-world program repair tasks. We publicly release the evaluation +framework of RepairBench. We will update the leaderboard as new frontier models +are released. + +
+
+
+
+
+ + ☆ Spectral Wavelet Dropout: Regularization in the Wavelet Domain ICML + + +
+ Regularization techniques help prevent overfitting and therefore improve the +ability of convolutional neural networks (CNNs) to generalize. One reason for +overfitting is the complex co-adaptations among different parts of the network, +which make the CNN dependent on their joint response rather than encouraging +each part to learn a useful feature representation independently. Frequency +domain manipulation is a powerful strategy for modifying data that has temporal +and spatial coherence by utilizing frequency decomposition. This work +introduces Spectral Wavelet Dropout (SWD), a novel regularization method that +includes two variants: 1D-SWD and 2D-SWD. These variants improve CNN +generalization by randomly dropping detailed frequency bands in the discrete +wavelet decomposition of feature maps. Our approach distinguishes itself from +the pre-existing Spectral "Fourier" Dropout (2D-SFD), which eliminates +coefficients in the Fourier domain. Notably, SWD requires only a single +hyperparameter, unlike the two required by SFD. We also extend the literature +by implementing a one-dimensional version of Spectral "Fourier" Dropout +(1D-SFD), setting the stage for a comprehensive comparison. Our evaluation +shows that both 1D and 2D SWD variants have competitive performance on +CIFAR-10/100 benchmarks relative to both 1D-SFD and 2D-SFD. Specifically, +1D-SWD has a significantly lower computational complexity compared to +1D/2D-SFD. In the Pascal VOC Object Detection benchmark, SWD variants surpass +1D-SFD and 2D-SFD in performance and demonstrate lower computational complexity +during training. + +
+
+ comment: Accepted by The International Conference on Machine Learning and + Applications (ICMLA) 2024 +
+
+
+
+
+ + ☆ Unconditional stability of a recurrent neural circuit implementing + divisive normalization + + +
+ Stability in recurrent neural models poses a significant challenge, +particularly in developing biologically plausible neurodynamical models that +can be seamlessly trained. Traditional cortical circuit models are notoriously +difficult to train due to expansive nonlinearities in the dynamical system, +leading to an optimization problem with nonlinear stability constraints that +are difficult to impose. Conversely, recurrent neural networks (RNNs) excel in +tasks involving sequential data but lack biological plausibility and +interpretability. In this work, we address these challenges by linking dynamic +divisive normalization (DN) to the stability of ORGaNICs, a biologically +plausible recurrent cortical circuit model that dynamically achieves DN and has +been shown to simulate a wide range of neurophysiological phenomena. By using +the indirect method of Lyapunov, we prove the remarkable property of +unconditional local stability for an arbitrary-dimensional ORGaNICs circuit +when the recurrent weight matrix is the identity. We thus connect ORGaNICs to a +system of coupled damped harmonic oscillators, which enables us to derive the +circuit's energy function, providing a normative principle of what the circuit, +and individual neurons, aim to accomplish. Further, for a generic recurrent +weight matrix, we prove the stability of the 2D model and demonstrate +empirically that stability holds in higher dimensions. Finally, we show that +ORGaNICs can be trained by backpropagation through time without gradient +clipping/scaling, thanks to its intrinsic stability property and adaptive time +constants, which address the problems of exploding, vanishing, and oscillating +gradients. By evaluating the model's performance on RNN benchmarks, we find +that ORGaNICs outperform alternative neurodynamical models on static image +classification tasks and perform comparably to LSTMs on sequential tasks. + +
+
+
+
+
+ + ☆ A-FedPD: Aligning Dual-Drift is All Federated Primal-Dual Learning Needs + + +
+ As a popular paradigm for juggling data privacy and collaborative training, +federated learning (FL) is flourishing to distributively process the large +scale of heterogeneous datasets on edged clients. Due to bandwidth limitations +and security considerations, it ingeniously splits the original problem into +multiple subproblems to be solved in parallel, which empowers primal dual +solutions to great application values in FL. In this paper, we review the +recent development of classical federated primal dual methods and point out a +serious common defect of such methods in non-convex scenarios, which we say is +a "dual drift" caused by dual hysteresis of those longstanding inactive clients +under partial participation training. To further address this problem, we +propose a novel Aligned Federated Primal Dual (A-FedPD) method, which +constructs virtual dual updates to align global consensus and local dual +variables for those protracted unparticipated local clients. Meanwhile, we +provide a comprehensive analysis of the optimization and generalization +efficiency for the A-FedPD method on smooth non-convex objectives, which +confirms its high efficiency and practicality. Extensive experiments are +conducted on several classical FL setups to validate the effectiveness of our +proposed method. + +
+
+
+
+
+ + ☆ Best Arm Identification with Minimal Regret + + +
+ Motivated by real-world applications that necessitate responsible +experimentation, we introduce the problem of best arm identification (BAI) with +minimal regret. This innovative variant of the multi-armed bandit problem +elegantly amalgamates two of its most ubiquitous objectives: regret +minimization and BAI. More precisely, the agent's goal is to identify the best +arm with a prescribed confidence level $\delta$, while minimizing the +cumulative regret up to the stopping time. Focusing on single-parameter +exponential families of distributions, we leverage information-theoretic +techniques to establish an instance-dependent lower bound on the expected +cumulative regret. Moreover, we present an intriguing impossibility result that +underscores the tension between cumulative regret and sample complexity in +fixed-confidence BAI. Complementarily, we design and analyze the Double KL-UCB +algorithm, which achieves asymptotic optimality as the confidence level tends +to zero. Notably, this algorithm employs two distinct confidence bounds to +guide arm selection in a randomized manner. Our findings elucidate a fresh +perspective on the inherent connections between regret minimization and BAI. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ In-depth Analysis of Privacy Threats in Federated Learning for Medical + Data + + +
+ Federated learning is emerging as a promising machine learning technique in +the medical field for analyzing medical images, as it is considered an +effective method to safeguard sensitive patient data and comply with privacy +regulations. However, recent studies have revealed that the default settings of +federated learning may inadvertently expose private training data to privacy +attacks. Thus, the intensity of such privacy risks and potential mitigation +strategies in the medical domain remain unclear. In this paper, we make three +original contributions to privacy risk analysis and mitigation in federated +learning for medical data. First, we propose a holistic framework, MedPFL, for +analyzing privacy risks in processing medical data in the federated learning +environment and developing effective mitigation strategies for protecting +privacy. Second, through our empirical analysis, we demonstrate the severe +privacy risks in federated learning to process medical images, where +adversaries can accurately reconstruct private medical images by performing +privacy attacks. Third, we illustrate that the prevalent defense mechanism of +adding random noises may not always be effective in protecting medical images +against privacy attacks in federated learning, which poses unique and pressing +challenges related to protecting the privacy of medical data. Furthermore, the +paper discusses several unique research questions related to the privacy +protection of medical data in the federated learning environment. We conduct +extensive experiments on several benchmark medical image datasets to analyze +and mitigate the privacy risks associated with federated learning for medical +data. + +
+
+
+
+
+ + ☆ Probabilistic Analysis of Least Squares, Orthogonal Projection, and QR + Factorization Algorithms Subject to Gaussian Noise + + +
+ In this paper, we extend the work of Liesen et al. (2002), which analyzes how +the condition number of an orthonormal matrix Q changes when a column is added +([Q, c]), particularly focusing on the perpendicularity of c to the span of Q. +Their result, presented in Theorem 2.3 of Liesen et al. (2002), assumes exact +arithmetic and orthonormality of Q, which is a strong assumption when applying +these results to numerical methods such as QR factorization algorithms. In our +work, we address this gap by deriving bounds on the condition number increase +for a matrix B without assuming perfect orthonormality, even when a column is +not perfectly orthogonal to the span of B. This framework allows us to analyze +QR factorization methods where orthogonalization is imperfect and subject to +Gaussian noise. We also provide results on the performance of orthogonal +projection and least squares under Gaussian noise, further supporting the +development of this theory. + +
+
+
+
+
+ + ☆ Multi-Source Hard and Soft Information Fusion Approach for Accurate + Cryptocurrency Price Movement Prediction + + +
+ One of the most important challenges in the financial and cryptocurrency +field is accurately predicting cryptocurrency price trends. Leveraging +artificial intelligence (AI) is beneficial in addressing this challenge. +Cryptocurrency markets, marked by substantial growth and volatility, attract +investors and scholars keen on deciphering and forecasting cryptocurrency price +movements. The vast and diverse array of data available for such predictions +increases the complexity of the task. In our study, we introduce a novel +approach termed hard and soft information fusion (HSIF) to enhance the accuracy +of cryptocurrency price movement forecasts. The hard information component of +our approach encompasses historical price records alongside technical +indicators. Complementing this, the soft data component extracts from X +(formerly Twitter), encompassing news headlines and tweets about the +cryptocurrency. To use this data, we use the Bidirectional Encoder +Representations from Transformers (BERT)-based sentiment analysis method, +financial BERT (FinBERT), which performs best. Finally, our model feeds on the +information set including processed hard and soft data. We employ the +bidirectional long short-term memory (BiLSTM) model because processing +information in both forward and backward directions can capture long-term +dependencies in sequential information. Our empirical findings emphasize the +superiority of the HSIF approach over models dependent on single-source data by +testing on Bitcoin-related data. By fusing hard and soft information on Bitcoin +dataset, our model has about 96.8\% accuracy in predicting price movement. +Incorporating information enables our model to grasp the influence of social +sentiment on price fluctuations, thereby supplementing the technical +analysis-based predictions derived from hard information. + +
+
+
+
+
+ + ☆ HM3: Hierarchical Multi-Objective Model Merging for Pretrained Models + + +
+ Model merging is a technique that combines multiple large pretrained models +into a single model with enhanced performance and broader task adaptability. It +has gained popularity in large pretrained model development due to its ability +to bypass the need for original training data and further training processes. +However, most existing model merging approaches focus solely on exploring the +parameter space, merging models with identical architectures. Merging within +the architecture space, despite its potential, remains in its early stages due +to the vast search space and the challenges of layer compatibility. This paper +marks a significant advance toward more flexible and comprehensive model +merging techniques by modeling the architecture-space merging process as a +reinforcement learning task. We train policy and value networks using offline +sampling of weight vectors, which are then employed for the online optimization +of merging strategies. Moreover, a multi-objective optimization paradigm is +introduced to accommodate users' diverse task preferences, learning the Pareto +front of optimal models to offer customized merging suggestions. Experimental +results across multiple tasks, including text translation, mathematical +reasoning, and code generation, validate the effectiveness and superiority of +the proposed framework in model merging. The code will be made publicly +available after the review process. + +
+
+
+
+
+ + ☆ HR-Extreme: A High-Resolution Dataset for Extreme Weather Forecasting + + +
+ The application of large deep learning models in weather forecasting has led +to significant advancements in the field, including higher-resolution +forecasting and extended prediction periods exemplified by models such as Pangu +and Fuxi. Despite these successes, previous research has largely been +characterized by the neglect of extreme weather events, and the availability of +datasets specifically curated for such events remains limited. Given the +critical importance of accurately forecasting extreme weather, this study +introduces a comprehensive dataset that incorporates high-resolution extreme +weather cases derived from the High-Resolution Rapid Refresh (HRRR) data, a +3-km real-time dataset provided by NOAA. We also evaluate the current +state-of-the-art deep learning models and Numerical Weather Prediction (NWP) +systems on HR-Extreme, and provide a improved baseline deep learning model +called HR-Heim which has superior performance on both general loss and +HR-Extreme compared to others. Our results reveal that the errors of extreme +weather cases are significantly larger than overall forecast error, +highlighting them as an crucial source of loss in weather prediction. These +findings underscore the necessity for future research to focus on improving the +accuracy of extreme weather forecasts to enhance their practical utility. + +
+
+ comment: 10 pages, under review +
+
+
+
+
+ + ☆ CESNET-TimeSeries24: Time Series Dataset for Network Traffic Anomaly + Detection and Forecasting + + +
+ Anomaly detection in network traffic is crucial for maintaining the security +of computer networks and identifying malicious activities. One of the primary +approaches to anomaly detection are methods based on forecasting. Nevertheless, +extensive real-world network datasets for forecasting and anomaly detection +techniques are missing, potentially causing performance overestimation of +anomaly detection algorithms. This manuscript addresses this gap by introducing +a dataset comprising time series data of network entities' behavior, collected +from the CESNET3 network. The dataset was created from 40 weeks of network +traffic of 275 thousand active IP addresses. The ISP origin of the presented +data ensures a high level of variability among network entities, which forms a +unique and authentic challenge for forecasting and anomaly detection models. It +provides valuable insights into the practical deployment of forecast-based +anomaly detection approaches. + +
+
+
+
+
+ + ☆ Simulating Dynamic Tumor Contrast Enhancement in Breast MRI using + Conditional Generative Adversarial Networks + + +
+ This paper presents a method for virtual contrast enhancement in breast MRI, +offering a promising non-invasive alternative to traditional contrast +agent-based DCE-MRI acquisition. Using a conditional generative adversarial +network, we predict DCE-MRI images, including jointly-generated sequences of +multiple corresponding DCE-MRI timepoints, from non-contrast-enhanced MRIs, +enabling tumor localization and characterization without the associated health +risks. Furthermore, we qualitatively and quantitatively evaluate the synthetic +DCE-MRI images, proposing a multi-metric Scaled Aggregate Measure (SAMe), +assessing their utility in a tumor segmentation downstream task, and conclude +with an analysis of the temporal patterns in multi-sequence DCE-MRI generation. +Our approach demonstrates promising results in generating realistic and useful +DCE-MRI sequences, highlighting the potential of virtual contrast enhancement +for improving breast cancer diagnosis and treatment, particularly for patients +where contrast agent administration is contraindicated. + +
+
+
+
+
+ + ☆ Individuation in Neural Models with and without Visual Grounding + + +
+ We show differences between a language-and-vision model CLIP and two +text-only models - FastText and SBERT - when it comes to the encoding of +individuation information. We study latent representations that CLIP provides +for substrates, granular aggregates, and various numbers of objects. We +demonstrate that CLIP embeddings capture quantitative differences in +individuation better than models trained on text-only data. Moreover, the +individuation hierarchy we deduce from the CLIP embeddings agrees with the +hierarchies proposed in linguistics and cognitive science. + +
+
+
+
+
+ + ☆ Positional Encoder Graph Quantile Neural Networks for Geographic Data + + +
+ Positional Encoder Graph Neural Networks (PE-GNNs) are a leading approach for +modeling continuous spatial data. However, they often fail to produce +calibrated predictive distributions, limiting their effectiveness for +uncertainty quantification. We introduce the Positional Encoder Graph Quantile +Neural Network (PE-GQNN), a novel method that integrates PE-GNNs, Quantile +Neural Networks, and recalibration techniques in a fully nonparametric +framework, requiring minimal assumptions about the predictive distributions. We +propose a new network architecture that, when combined with a quantile-based +loss function, yields accurate and reliable probabilistic models without +increasing computational complexity. Our approach provides a flexible, robust +framework for conditional density estimation, applicable beyond spatial data +contexts. We further introduce a structured method for incorporating a KNN +predictor into the model while avoiding data leakage through the GNN layer +operation. Experiments on benchmark datasets demonstrate that PE-GQNN +significantly outperforms existing state-of-the-art methods in both predictive +accuracy and uncertainty quantification. + +
+
+ comment: 17 main text pages, 4 figures +
+
+
+
+
+ + ☆ Challenges of Generating Structurally Diverse Graphs + + +
+ For many graph-related problems, it can be essential to have a set of +structurally diverse graphs. For instance, such graphs can be used for testing +graph algorithms or their neural approximations. However, to the best of our +knowledge, the problem of generating structurally diverse graphs has not been +explored in the literature. In this paper, we fill this gap. First, we discuss +how to define diversity for a set of graphs, why this task is non-trivial, and +how one can choose a proper diversity measure. Then, for a given diversity +measure, we propose and compare several algorithms optimizing it: we consider +approaches based on standard random graph models, local graph optimization, +genetic algorithms, and neural generative models. We show that it is possible +to significantly improve diversity over basic random graph generators. +Additionally, our analysis of generated graphs allows us to better understand +the properties of graph distances: depending on which diversity measure is used +for optimization, the obtained graphs may possess very different structural +properties which gives insights about the sensitivity of the graph distance +underlying the diversity measure. + +
+
+
+
+
+ + ☆ Two Sparse Matrices are Better than One: Sparsifying Neural Networks + with Double Sparse Factorization + + +
+ Neural networks are often challenging to work with due to their large size +and complexity. To address this, various methods aim to reduce model size by +sparsifying or decomposing weight matrices, such as magnitude pruning and +low-rank or block-diagonal factorization. In this work, we present Double +Sparse Factorization (DSF), where we factorize each weight matrix into two +sparse matrices. Although solving this problem exactly is computationally +infeasible, we propose an efficient heuristic based on alternating minimization +via ADMM that achieves state-of-the-art results, enabling unprecedented +sparsification of neural networks. For instance, in a one-shot pruning setting, +our method can reduce the size of the LLaMA2-13B model by 50% while maintaining +better performance than the dense LLaMA2-7B model. We also compare favorably +with Optimal Brain Compression, the state-of-the-art layer-wise pruning +approach for convolutional neural networks. Furthermore, accuracy improvements +of our method persist even after further model fine-tuning. + Code available at: https://github.com/usamec/double_sparse. + +
+
+
+
+
+ + ☆ Classical Statistical (In-Sample) Intuitions Don't Generalize Well: A + Note on Bias-Variance Tradeoffs, Overfitting and Moving from Fixed to Random + Designs + + +
+ The sudden appearance of modern machine learning (ML) phenomena like double +descent and benign overfitting may leave many classically trained statisticians +feeling uneasy -- these phenomena appear to go against the very core of +statistical intuitions conveyed in any introductory class on learning from +data. The historical lack of earlier observation of such phenomena is usually +attributed to today's reliance on more complex ML methods, +overparameterization, interpolation and/or higher data dimensionality. In this +note, we show that there is another reason why we observe behaviors today that +appear at odds with intuitions taught in classical statistics textbooks, which +is much simpler to understand yet rarely discussed explicitly. In particular, +many intuitions originate in fixed design settings, in which in-sample +prediction error (under resampling of noisy outcomes) is of interest, while +modern ML evaluates its predictions in terms of generalization error, i.e. +out-of-sample prediction error in random designs. Here, we highlight that this +simple move from fixed to random designs has (perhaps surprisingly) +far-reaching consequences on textbook intuitions relating to the bias-variance +tradeoff, and comment on the resulting (im)possibility of observing double +descent and benign overfitting in fixed versus random designs. + +
+
+
+
+
+ + ☆ Constructing Confidence Intervals for 'the' Generalization Error -- a + Comprehensive Benchmark Study + + +
+ When assessing the quality of prediction models in machine learning, +confidence intervals (CIs) for the generalization error, which measures +predictive performance, are a crucial tool. Luckily, there exist many methods +for computing such CIs and new promising approaches are continuously being +proposed. Typically, these methods combine various resampling procedures, most +popular among them cross-validation and bootstrapping, with different variance +estimation techniques. Unfortunately, however, there is currently no consensus +on when any of these combinations may be most reliably employed and how they +generally compare. In this work, we conduct the first large-scale study +comparing CIs for the generalization error - empirically evaluating 13 +different methods on a total of 18 tabular regression and classification +problems, using four different inducers and a total of eight loss functions. We +give an overview of the methodological foundations and inherent challenges of +constructing CIs for the generalization error and provide a concise review of +all 13 methods in a unified framework. Finally, the CI methods are evaluated in +terms of their relative coverage frequency, width, and runtime. Based on these +findings, we are able to identify a subset of methods that we would recommend. +We also publish the datasets as a benchmarking suite on OpenML and our code on +GitHub to serve as a basis for further studies. + +
+
+
+
+
+ + ☆ Classification and regression of trajectories rendered as images via 2D + Convolutional Neural Networks + + +
+ Trajectories can be regarded as time-series of coordinates, typically arising +from motile objects. Methods for trajectory classification are particularly +important to detect different movement patterns, while methods for regression +to compute motility metrics and forecasting. Recent advances in computer vision +have facilitated the processing of trajectories rendered as images via +artificial neural networks with 2d convolutional layers (CNNs). This approach +leverages the capability of CNNs to learn spatial hierarchies of features from +images, necessary to recognize complex shapes. Moreover, it overcomes the +limitation of other machine learning methods that require input trajectories +with a fixed number of points. However, rendering trajectories as images can +introduce poorly investigated artifacts such as information loss due to the +plotting of coordinates on a discrete grid, and spectral changes due to line +thickness and aliasing. In this study, we investigate the effectiveness of CNNs +for solving classification and regression problems from synthetic trajectories +that have been rendered as images using different modalities. The parameters +considered in this study include line thickness, image resolution, usage of +motion history (color-coding of the temporal component) and anti-aliasing. +Results highlight the importance of choosing an appropriate image resolution +according to model depth and motion history in applications where movement +direction is critical. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ ARLBench: Flexible and Efficient Benchmarking for Hyperparameter + Optimization in Reinforcement Learning + + +
+ Hyperparameters are a critical factor in reliably training well-performing +reinforcement learning (RL) agents. Unfortunately, developing and evaluating +automated approaches for tuning such hyperparameters is both costly and +time-consuming. As a result, such approaches are often only evaluated on a +single domain or algorithm, making comparisons difficult and limiting insights +into their generalizability. We propose ARLBench, a benchmark for +hyperparameter optimization (HPO) in RL that allows comparisons of diverse HPO +approaches while being highly efficient in evaluation. To enable research into +HPO in RL, even in settings with low compute resources, we select a +representative subset of HPO tasks spanning a variety of algorithm and +environment combinations. This selection allows for generating a performance +profile of an automated RL (AutoRL) method using only a fraction of the compute +previously necessary, enabling a broader range of researchers to work on HPO in +RL. With the extensive and large-scale dataset on hyperparameter landscapes +that our selection is based on, ARLBench is an efficient, flexible, and +future-oriented foundation for research on AutoRL. Both the benchmark and the +dataset are available at https://github.com/automl/arlbench. + +
+
+ comment: Accepted at the 17th European Workshop on Reinforcement Learning +
+
+
+
+
+ + ☆ Early diagnosis of Alzheimer's disease from MRI images with deep + learning model + + +
+ It is acknowledged that the most common cause of dementia worldwide is +Alzheimer's disease (AD). This condition progresses in severity from mild to +severe and interferes with people's everyday routines. Early diagnosis plays a +critical role in patient care and clinical trials. Convolutional neural +networks (CNN) are used to create a framework for identifying specific disease +features from MRI scans Classification of dementia involves approaches such as +medical history review, neuropsychological tests, and magnetic resonance +imaging (MRI). However, the image dataset obtained from Kaggle faces a +significant issue of class imbalance, which requires equal distribution of +samples from each class to address. In this article, to address this imbalance, +the Synthetic Minority Oversampling Technique (SMOTE) is utilized. Furthermore, +a pre-trained convolutional neural network has been applied to the DEMNET +dementia network to extract key features from AD images. The proposed model +achieved an impressive accuracy of 98.67%. + +
+
+ comment: 7 pages, 3 figures, Presented at the 20-th CSI International + Symposium on Artificial Intelligence and Signal Processing (AISP) 21-22 + February, 2024, Mazandaran University of Science and Technology, Babol, Iran +
+
+
+
+
+ + ☆ Convergence of Diffusion Models Under the Manifold Hypothesis in + High-Dimensions + + +
+ Denoising Diffusion Probabilistic Models (DDPM) are powerful state-of-the-art +methods used to generate synthetic data from high-dimensional data +distributions and are widely used for image, audio and video generation as well +as many more applications in science and beyond. The manifold hypothesis states +that high-dimensional data often lie on lower-dimensional manifolds within the +ambient space, and is widely believed to hold in provided examples. While +recent results has provided invaluable insight into how diffusion models adapt +to the manifold hypothesis, they do not capture the great empirical success of +these models, making this a very fruitful research direction. + In this work, we study DDPMs under the manifold hypothesis and prove that +they achieve rates independent of the ambient dimension in terms of learning +the score. In terms of sampling, we obtain rates independent of the ambient +dimension w.r.t. the Kullback-Leibler divergence, and $O(\sqrt{D})$ w.r.t. the +Wasserstein distance. We do this by developing a new framework connecting +diffusion models to the well-studied theory of extrema of Gaussian Processes. + +
+
+
+
+
+ + ☆ Esports Debut as a Medal Event at 2023 Asian Games: Exploring Public + Perceptions with BERTopic and GPT-4 Topic Fine-Tuning + + +
+ This study examined the public opinions of esports at the 2023 Asian Games +and value co-creation during the event using an LLM-enhanced BERTopic modeling +analysis. We identified five major themes representing public perceptions, as +well as how major stakeholders co-created value within and beyond the esports +ecosystem. Key findings highlighted the strategic use of social media marketing +to influence public opinion and promote esports events and brands, emphasizing +the importance of event logistics and infrastructure. Additionally, the study +revealed the co-creation value contributed by stakeholders outside the +traditional esports ecosystem, particularly in promoting national +representation and performance. Our findings supported the ongoing efforts to +legitimize esports as a sport, noting that mainstream recognition remains a +challenge. The inclusion of esports as a medal event showcased broader +acceptance and helped mitigate negative public perceptions. Moreover, +contributions from non-traditional stakeholders underscored the value of +cross-subcultural collaborations in esports. + +
+
+
+
+
+ + ☆ Hierarchical Federated ADMM + + +
+ In this paper, we depart from the widely-used gradient descent-based +hierarchical federated learning (FL) algorithms to develop a novel hierarchical +FL framework based on the alternating direction method of multipliers (ADMM). +Within this framework, we propose two novel FL algorithms, which both use ADMM +in the top layer: one that employs ADMM in the lower layer and another that +uses the conventional gradient descent-based approach. The proposed framework +enhances privacy, and experiments demonstrate the superiority of the proposed +algorithms compared to the conventional algorithms in terms of learning +convergence and accuracy. Additionally, gradient descent on the lower layer +performs well even if the number of local steps is very limited, while ADMM on +both layers lead to better performance otherwise. + +
+
+
+
+
+ + ☆ HardCore Generation: Generating Hard UNSAT Problems for Data + Augmentation + + +
+ Efficiently determining the satisfiability of a boolean equation -- known as +the SAT problem for brevity -- is crucial in various industrial problems. +Recently, the advent of deep learning methods has introduced significant +potential for enhancing SAT solving. However, a major barrier to the +advancement of this field has been the scarcity of large, realistic datasets. +The majority of current public datasets are either randomly generated or +extremely limited, containing only a few examples from unrelated problem +families. These datasets are inadequate for meaningful training of deep +learning methods. In light of this, researchers have started exploring +generative techniques to create data that more accurately reflect SAT problems +encountered in practical situations. These methods have so far suffered from +either the inability to produce challenging SAT problems or time-scalability +obstacles. In this paper we address both by identifying and manipulating the +key contributors to a problem's ``hardness'', known as cores. Although some +previous work has addressed cores, the time costs are unacceptably high due to +the expense of traditional heuristic core detection techniques. We introduce a +fast core detection procedure that uses a graph neural network. Our empirical +results demonstrate that we can efficiently generate problems that remain hard +to solve and retain key attributes of the original example problems. We show +via experiment that the generated synthetic SAT problems can be used in a data +augmentation setting to provide improved prediction of solver runtimes. + +
+
+
+
+
+ + ☆ A method of using RSVD in residual calculation of LowBit GEMM + + +
+ The advancements of hardware technology in recent years has brought many +possibilities for low-precision applications. However, the use of low precision +can introduce significant computational errors, posing a considerable challenge +to maintaining the computational accuracy. + We propose low-rank residuals quantized matrix multiplication(LRQMM) method +which introduces low-rank approximation in residual compensation for dense low +precision quantization matrix multiplication. It can bring several times +accuracy improvement with only BLAS-2 level extra time overhead. Moreover, +LRQMM is a completely data-free quantization method that does not require +additional data for pre-training. And it only works with low precision GEMM +operator, which is easy to couple with other methods. + Through experimentation, LRQMM can reduce the error of direct quantized +matrix multiplication by 1~2 orders of magnitude, when dealing with larger +matrix sizes, the computational speed is only reduced by approximately 20\%. In +deep learning networks, LRQMM-4bit achieves 61.8% ImageNet Top-1 accuracy in +Resnet-50, while the Direct Quant accuracy is only 8.3%. + +
+
+
+
+
+ + ☆ Learning from Demonstration with Implicit Nonlinear Dynamics Models + + +
+ Learning from Demonstration (LfD) is a useful paradigm for training policies +that solve tasks involving complex motions. In practice, the successful +application of LfD requires overcoming error accumulation during policy +execution, i.e. the problem of drift due to errors compounding over time and +the consequent out-of-distribution behaviours. Existing works seek to address +this problem through scaling data collection, correcting policy errors with a +human-in-the-loop, temporally ensembling policy predictions or through learning +the parameters of a dynamical system model. In this work, we propose and +validate an alternative approach to overcoming this issue. Inspired by +reservoir computing, we develop a novel neural network layer that includes a +fixed nonlinear dynamical system with tunable dynamical properties. We validate +the efficacy of our neural network layer on the task of reproducing human +handwriting motions using the LASA Human Handwriting Dataset. Through empirical +experiments we demonstrate that incorporating our layer into existing neural +network architectures addresses the issue of compounding errors in LfD. +Furthermore, we perform a comparative evaluation against existing approaches +including a temporal ensemble of policy predictions and an Echo State Networks +(ESNs) implementation. We find that our approach yields greater policy +precision and robustness on the handwriting task while also generalising to +multiple dynamics regimes and maintaining competitive latency scores. + +
+
+ comment: 21 pages, 9 figures +
+
+
+
+
+ + ☆ Geometric deep learning for galaxy-halo connection: a case study for + galaxy intrinsic alignments + + +
+ Forthcoming cosmological imaging surveys, such as the Rubin Observatory LSST, +require large-scale simulations encompassing realistic galaxy populations for a +variety of scientific applications. Of particular concern is the phenomenon of +intrinsic alignments (IA), whereby galaxies orient themselves towards +overdensities, potentially introducing significant systematic biases in weak +gravitational lensing analyses if they are not properly modeled. Due to +computational constraints, simulating the intricate details of galaxy formation +and evolution relevant to IA across vast volumes is impractical. As an +alternative, we propose a Deep Generative Model trained on the IllustrisTNG-100 +simulation to sample 3D galaxy shapes and orientations to accurately reproduce +intrinsic alignments along with correlated scalar features. We model the cosmic +web as a set of graphs, each graph representing a halo with nodes representing +the subhalos/galaxies. The architecture consists of a SO(3) $\times$ +$\mathbb{R}^n$ diffusion generative model, for galaxy orientations and $n$ +scalars, implemented with E(3) equivariant Graph Neural Networks that +explicitly respect the Euclidean symmetries of our Universe. The model is able +to learn and predict features such as galaxy orientations that are +statistically consistent with the reference simulation. Notably, our model +demonstrates the ability to jointly model Euclidean-valued scalars (galaxy +sizes, shapes, and colors) along with non-Euclidean valued SO(3) quantities +(galaxy orientations) that are governed by highly complex galactic physics at +non-linear scales. + +
+
+ comment: 12 pages, 5 figures. submitted to MNRAS +
+
+
+
+
+ + ☆ TensorSocket: Shared Data Loading for Deep Learning Training + + +
+ Training deep learning models is a repetitive and resource-intensive process. +Data scientists often train several models before landing on set of parameters +(e.g., hyper-parameter tuning), model architecture (e.g., neural architecture +search), among other things that yields the highest accuracy. The computational +efficiency of these training tasks depends highly on how well we can supply the +training process with training data. The repetitive nature of these tasks +results in the same data processing pipelines running over and over +exacerbating the need for and costs of computational resources. + In this paper, we present Tensorsocket to reduce the computational needs of +deep learning training by enabling simultaneous training processes to share the +same data loader. Tensorsocket mitigates CPU-side bottlenecks in cases where +the collocated training workloads have high throughput on GPU, but are held +back by lower data-loading throughput on CPU. Tensorsocket achieves this by +reducing redundant computations across collocated training processes and +leveraging modern GPU-GPU interconnects. We demonstrate the hardware- and +pipeline-agnostic nature of Tensorsocket and evaluate it using a variety of +training scenarios. + Our evaluation shows that Tensorsocket enables scenarios that are infeasible +without data sharing, increases training throughput by up to $100\%$, and when +utilizing cloud instances, Tensorsocket achieves cost savings of $50\%$ by +reducing the hardware resource needs on the CPU side. Furthermore, Tensorsocket +outperforms the state-of-the-art solutions for shared data loading such as +CoorDL and Joader. It is easier to use, maintain, and deploy, and either +achieves higher or matches the throughput of other solutions while requiring +less CPU resources. + +
+
+
+
+
+ + ☆ Cottention: Linear Transformers With Cosine Attention + + +
+ Attention mechanisms, particularly softmax attention, have been instrumental +in the success of transformer-based models such as GPT. However, the quadratic +memory complexity of softmax attention with respect to sequence length poses +significant challenges for processing longer sequences. We introduce +Cottention, a novel attention mechanism that replaces the softmax operation +with cosine similarity. By leveraging the properties of cosine similarity and +rearranging the attention equation, Cottention achieves native linear memory +complexity with respect to sequence length, making it inherently more +memory-efficient than softmax attention. We demonstrate that Cottention can be +reformulated as a recurrent neural network (RNN) with a finite hidden state, +allowing for constant memory usage during inference. We evaluate Cottention on +both the bidirectional BERT and causal GPT tasks, demonstrating comparable +performance to softmax attention while significantly reducing memory +requirements. To ensure efficient computation, we develop a custom CUDA kernel +for Cottention. Our results show that Cottention is a promising alternative to +softmax attention, enabling the processing of longer sequences without +sacrificing performance, due to its native linear memory complexity and ability +to maintain a constant memory footprint during inference. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Autoregressive Policy Optimization for Constrained Allocation Tasks NeurIPS 2024 + + +
+ Allocation tasks represent a class of problems where a limited amount of +resources must be allocated to a set of entities at each time step. Prominent +examples of this task include portfolio optimization or distributing +computational workloads across servers. Allocation tasks are typically bound by +linear constraints describing practical requirements that have to be strictly +fulfilled at all times. In portfolio optimization, for example, investors may +be obligated to allocate less than 30\% of the funds into a certain industrial +sector in any investment period. Such constraints restrict the action space of +allowed allocations in intricate ways, which makes learning a policy that +avoids constraint violations difficult. In this paper, we propose a new method +for constrained allocation tasks based on an autoregressive process to +sequentially sample allocations for each entity. In addition, we introduce a +novel de-biasing mechanism to counter the initial bias caused by sequential +sampling. We demonstrate the superior performance of our approach compared to a +variety of Constrained Reinforcement Learning (CRL) methods on three distinct +constrained allocation tasks: portfolio optimization, computational workload +distribution, and a synthetic allocation benchmark. Our code is available at: +https://github.com/niklasdbs/paspo + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ Scalable Cross-Entropy Loss for Sequential Recommendations with Large + Item Catalogs + + +
+ Scalability issue plays a crucial role in productionizing modern recommender +systems. Even lightweight architectures may suffer from high computational +overload due to intermediate calculations, limiting their practicality in +real-world applications. Specifically, applying full Cross-Entropy (CE) loss +often yields state-of-the-art performance in terms of recommendations quality. +Still, it suffers from excessive GPU memory utilization when dealing with large +item catalogs. This paper introduces a novel Scalable Cross-Entropy (SCE) loss +function in the sequential learning setup. It approximates the CE loss for +datasets with large-size catalogs, enhancing both time efficiency and memory +usage without compromising recommendations quality. Unlike traditional negative +sampling methods, our approach utilizes a selective GPU-efficient computation +strategy, focusing on the most informative elements of the catalog, +particularly those most likely to be false positives. This is achieved by +approximating the softmax distribution over a subset of the model outputs +through the maximum inner product search. Experimental results on multiple +datasets demonstrate the effectiveness of SCE in reducing peak memory usage by +a factor of up to 100 compared to the alternatives, retaining or even exceeding +their metrics values. The proposed approach also opens new perspectives for +large-scale developments in different domains, such as large language models. + +
+
+ comment: 11 pages, accepted for RecSys'24 +
+
+
+
+
+ + ☆ Enhancing Spectrum Efficiency in 6G Satellite Networks: A GAIL-Powered + Policy Learning via Asynchronous Federated Inverse Reinforcement Learning + + +
+ In this paper, a novel generative adversarial imitation learning +(GAIL)-powered policy learning approach is proposed for optimizing beamforming, +spectrum allocation, and remote user equipment (RUE) association in NTNs. +Traditional reinforcement learning (RL) methods for wireless network +optimization often rely on manually designed reward functions, which can +require extensive parameter tuning. To overcome these limitations, we employ +inverse RL (IRL), specifically leveraging the GAIL framework, to automatically +learn reward functions without manual design. We augment this framework with an +asynchronous federated learning approach, enabling decentralized +multi-satellite systems to collaboratively derive optimal policies. The +proposed method aims to maximize spectrum efficiency (SE) while meeting minimum +information rate requirements for RUEs. To address the non-convex, NP-hard +nature of this problem, we combine the many-to-one matching theory with a +multi-agent asynchronous federated IRL (MA-AFIRL) framework. This allows agents +to learn through asynchronous environmental interactions, improving training +efficiency and scalability. The expert policy is generated using the Whale +optimization algorithm (WOA), providing data to train the automatic reward +function within GAIL. Simulation results show that the proposed MA-AFIRL method +outperforms traditional RL approaches, achieving a $14.6\%$ improvement in +convergence and reward value. The novel GAIL-driven policy learning establishes +a novel benchmark for 6G NTN optimization. + +
+
+ comment: Submitted to IEEE Transactions on Mobile Computing (16 pages, 10 + figures) +
+
+
+
+
+ + ☆ Rethinking the Power of Timestamps for Robust Time Series Forecasting: A + Global-Local Fusion Perspective NeurIPS 2024 + + +
+ Time series forecasting has played a pivotal role across various industries, +including finance, transportation, energy, healthcare, and climate. Due to the +abundant seasonal information they contain, timestamps possess the potential to +offer robust global guidance for forecasting techniques. However, existing +works primarily focus on local observations, with timestamps being treated +merely as an optional supplement that remains underutilized. When data gathered +from the real world is polluted, the absence of global information will damage +the robust prediction capability of these algorithms. To address these +problems, we propose a novel framework named GLAFF. Within this framework, the +timestamps are modeled individually to capture the global dependencies. Working +as a plugin, GLAFF adaptively adjusts the combined weights for global and local +information, enabling seamless collaboration with any time series forecasting +backbone. Extensive experiments conducted on nine real-world datasets +demonstrate that GLAFF significantly enhances the average performance of widely +used mainstream forecasting models by 12.5%, surpassing the previous +state-of-the-art method by 5.5%. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ MG-Net: Learn to Customize QAOA with Circuit Depth Awareness + + +
+ Quantum Approximate Optimization Algorithm (QAOA) and its variants exhibit +immense potential in tackling combinatorial optimization challenges. However, +their practical realization confronts a dilemma: the requisite circuit depth +for satisfactory performance is problem-specific and often exceeds the maximum +capability of current quantum devices. To address this dilemma, here we first +analyze the convergence behavior of QAOA, uncovering the origins of this +dilemma and elucidating the intricate relationship between the employed mixer +Hamiltonian, the specific problem at hand, and the permissible maximum circuit +depth. Harnessing this understanding, we introduce the Mixer Generator Network +(MG-Net), a unified deep learning framework adept at dynamically formulating +optimal mixer Hamiltonians tailored to distinct tasks and circuit depths. +Systematic simulations, encompassing Ising models and weighted Max-Cut +instances with up to 64 qubits, substantiate our theoretical findings, +highlighting MG-Net's superior performance in terms of both approximation ratio +and efficiency. + +
+
+ comment: 29 pages, 16 figures +
+
+
+
+
+ + ☆ Understanding the Benefits of SimCLR Pre-Training in Two-Layer + Convolutional Neural Networks + + +
+ SimCLR is one of the most popular contrastive learning methods for vision +tasks. It pre-trains deep neural networks based on a large amount of unlabeled +data by teaching the model to distinguish between positive and negative pairs +of augmented images. It is believed that SimCLR can pre-train a deep neural +network to learn efficient representations that can lead to a better +performance of future supervised fine-tuning. Despite its effectiveness, our +theoretical understanding of the underlying mechanisms of SimCLR is still +limited. In this paper, we theoretically introduce a case study of the SimCLR +method. Specifically, we consider training a two-layer convolutional neural +network (CNN) to learn a toy image data model. We show that, under certain +conditions on the number of labeled data, SimCLR pre-training combined with +supervised fine-tuning achieves almost optimal test loss. Notably, the label +complexity for SimCLR pre-training is far less demanding compared to direct +training on supervised data. Our analysis sheds light on the benefits of SimCLR +in learning with fewer labels. + +
+
+ comment: 65 pages, 4 figures +
+
+
+
+
+ + ☆ How green is continual learning, really? Analyzing the energy + consumption in continual training of vision foundation models ECCV 2024 + + +
+ With the ever-growing adoption of AI, its impact on the environment is no +longer negligible. Despite the potential that continual learning could have +towards Green AI, its environmental sustainability remains relatively +uncharted. In this work we aim to gain a systematic understanding of the energy +efficiency of continual learning algorithms. To that end, we conducted an +extensive set of empirical experiments comparing the energy consumption of +recent representation-, prompt-, and exemplar-based continual learning +algorithms and two standard baseline (fine tuning and joint training) when used +to continually adapt a pre-trained ViT-B/16 foundation model. We performed our +experiments on three standard datasets: CIFAR-100, ImageNet-R, and DomainNet. +Additionally, we propose a novel metric, the Energy NetScore, which we use +measure the algorithm efficiency in terms of energy-accuracy trade-off. Through +numerous evaluations varying the number and size of the incremental learning +steps, our experiments demonstrate that different types of continual learning +algorithms have very different impacts on energy consumption during both +training and inference. Although often overlooked in the continual learning +literature, we found that the energy consumed during the inference phase is +crucial for evaluating the environmental sustainability of continual learning +models. + +
+
+ comment: This manuscript has been accepted at the Green FOundation MOdels + (GreenFOMO) ECCV 2024 Workshop +
+
+
+
+
+ + ☆ Entropy, concentration, and learning: a statistical mechanics primer + + +
+ Artificial intelligence models trained through loss minimization have +demonstrated significant success, grounded in principles from fields like +information theory and statistical physics. This work explores these +established connections through the lens of statistical mechanics, starting +from first-principles sample concentration behaviors that underpin AI and +machine learning. Our development of statistical mechanics for modeling +highlights the key role of exponential families, and quantities of statistics, +physics, and information theory. + +
+
+
+
+
+ + ☆ Towards Integrating Epistemic Uncertainty Estimation into the + Radiotherapy Workflow + + +
+ The precision of contouring target structures and organs-at-risk (OAR) in +radiotherapy planning is crucial for ensuring treatment efficacy and patient +safety. Recent advancements in deep learning (DL) have significantly improved +OAR contouring performance, yet the reliability of these models, especially in +the presence of out-of-distribution (OOD) scenarios, remains a concern in +clinical settings. This application study explores the integration of epistemic +uncertainty estimation within the OAR contouring workflow to enable OOD +detection in clinically relevant scenarios, using specifically compiled data. +Furthermore, we introduce an advanced statistical method for OOD detection to +enhance the methodological framework of uncertainty estimation. Our empirical +evaluation demonstrates that epistemic uncertainty estimation is effective in +identifying instances where model predictions are unreliable and may require an +expert review. Notably, our approach achieves an AUC-ROC of 0.95 for OOD +detection, with a specificity of 0.95 and a sensitivity of 0.92 for implant +cases, underscoring its efficacy. This study addresses significant gaps in the +current research landscape, such as the lack of ground truth for uncertainty +estimation and limited empirical evaluations. Additionally, it provides a +clinically relevant application of epistemic uncertainty estimation in an +FDA-approved and widely used clinical solution for OAR segmentation from +Varian, a Siemens Healthineers company, highlighting its practical benefits. + +
+
+ comment: Keywords: Epistemic Uncertainty - Out-of-Distribution Detection - CT + Segmentation - OAR contouring - Radiotherapy +
+
+
+
+
+ + ☆ Unsupervised Cognition + + +
+ Unsupervised learning methods have a soft inspiration in cognition models. To +this day, the most successful unsupervised learning methods revolve around +clustering samples in a mathematical space. In this paper we propose a +state-of-the-art primitive-based unsupervised learning approach for +decision-making inspired by novel cognition models. This representation-centric +approach models the input space constructively as a distributed hierarchical +structure in an input-agnostic way. We compared our approach with current +state-of-the-art in unsupervised learning classification, and with current +state-of-the-art in cancer type classification. We show how our proposal +outperforms previous state-of-the-art. We also evaluate some cognition-like +properties of our proposal where it not only outperforms the compared +algorithms (even supervised learning ones), but it also shows a different, more +cognition-like, behaviour. + +
+
+
+
+
+ + ☆ Differentially Private Non Parametric Copulas: Generating synthetic data + with non parametric copulas under privacy guarantees + + +
+ Creation of synthetic data models has represented a significant advancement +across diverse scientific fields, but this technology also brings important +privacy considerations for users. This work focuses on enhancing a +non-parametric copula-based synthetic data generation model, DPNPC, by +incorporating Differential Privacy through an Enhanced Fourier Perturbation +method. The model generates synthetic data for mixed tabular databases while +preserving privacy. We compare DPNPC with three other models (PrivBayes, +DP-Copula, and DP-Histogram) across three public datasets, evaluating privacy, +utility, and execution time. DPNPC outperforms others in modeling multivariate +dependencies, maintaining privacy for small $\epsilon$ values, and reducing +training times. However, limitations include the need to assess the model's +performance with different encoding methods and consider additional privacy +attacks. Future research should address these areas to enhance +privacy-preserving synthetic data generation. + +
+
+ comment: 12 pages, 5 figures, deciding 2025 conference to which to submit +
+
+
+
+
+ + ☆ TemporalPaD: a reinforcement-learning framework for temporal feature + representation and dimension reduction + + +
+ Recent advancements in feature representation and dimension reduction have +highlighted their crucial role in enhancing the efficacy of predictive +modeling. This work introduces TemporalPaD, a novel end-to-end deep learning +framework designed for temporal pattern datasets. TemporalPaD integrates +reinforcement learning (RL) with neural networks to achieve concurrent feature +representation and feature reduction. The framework consists of three +cooperative modules: a Policy Module, a Representation Module, and a +Classification Module, structured based on the Actor-Critic (AC) framework. The +Policy Module, responsible for dimensionality reduction through RL, functions +as the actor, while the Representation Module for feature extraction and the +Classification Module collectively serve as the critic. We comprehensively +evaluate TemporalPaD using 29 UCI datasets, a well-known benchmark for +validating feature reduction algorithms, through 10 independent tests and +10-fold cross-validation. Additionally, given that TemporalPaD is specifically +designed for time series data, we apply it to a real-world DNA classification +problem involving enhancer category and enhancer strength. The results +demonstrate that TemporalPaD is an efficient and effective framework for +achieving feature reduction, applicable to both structured data and sequence +datasets. The source code of the proposed TemporalPaD is freely available as +supplementary material to this article and at +http://www.healthinformaticslab.org/supp/. + +
+
+
+
+
+ + ☆ ASAG2024: A Combined Benchmark for Short Answer Grading + + +
+ Open-ended questions test a more thorough understanding than closed-ended +questions and are often a preferred assessment method. However, open-ended +questions are tedious to grade and subject to personal bias. Therefore, there +have been efforts to speed up the grading process through automation. Short +Answer Grading (SAG) systems aim to automatically score students' answers. +Despite growth in SAG methods and capabilities, there exists no comprehensive +short-answer grading benchmark across different subjects, grading scales, and +distributions. Thus, it is hard to assess the capabilities of current automated +grading methods in terms of their generalizability. In this preliminary work, +we introduce the combined ASAG2024 benchmark to facilitate the comparison of +automated grading systems. Combining seven commonly used short-answer grading +datasets in a common structure and grading scale. For our benchmark, we +evaluate a set of recent SAG methods, revealing that while LLM-based approaches +reach new high scores, they still are far from reaching human performance. This +opens up avenues for future research on human-machine SAG systems. + +
+
+ comment: Accepted at SIGCSE-Virtual 2024 +
+
+
+
+
+ + ☆ "Oh LLM, I'm Asking Thee, Please Give Me a Decision Tree": Zero-Shot + Decision Tree Induction and Embedding with Large Language Models + + +
+ Large language models (LLMs) provide powerful means to leverage prior +knowledge for predictive modeling when data is limited. In this work, we +demonstrate how LLMs can use their compressed world knowledge to generate +intrinsically interpretable machine learning models, i.e., decision trees, +without any training data. We find that these zero-shot decision trees can +surpass data-driven trees on some small-sized tabular datasets and that +embeddings derived from these trees perform on par with data-driven tree-based +embeddings on average. Our knowledge-driven decision tree induction and +embedding approaches therefore serve as strong new baselines for data-driven +machine learning methods in the low-data regime. + +
+
+
+
+
+ + ☆ Optimistic Games for Combinatorial Bayesian Optimization with + Application to Protein Design + + +
+ Bayesian optimization (BO) is a powerful framework to optimize black-box +expensive-to-evaluate functions via sequential interactions. In several +important problems (e.g. drug discovery, circuit design, neural architecture +search, etc.), though, such functions are defined over large +$\textit{combinatorial and unstructured}$ spaces. This makes existing BO +algorithms not feasible due to the intractable maximization of the acquisition +function over these domains. To address this issue, we propose +$\textbf{GameOpt}$, a novel game-theoretical approach to combinatorial BO. +$\textbf{GameOpt}$ establishes a cooperative game between the different +optimization variables, and selects points that are game $\textit{equilibria}$ +of an upper confidence bound acquisition function. These are stable +configurations from which no variable has an incentive to deviate$-$ analog to +local optima in continuous domains. Crucially, this allows us to efficiently +break down the complexity of the combinatorial domain into individual decision +sets, making $\textbf{GameOpt}$ scalable to large combinatorial spaces. We +demonstrate the application of $\textbf{GameOpt}$ to the challenging +$\textit{protein design}$ problem and validate its performance on four +real-world protein datasets. Each protein can take up to $20^{X}$ possible +configurations, where $X$ is the length of a protein, making standard BO +methods infeasible. Instead, our approach iteratively selects informative +protein configurations and very quickly discovers highly active protein +variants compared to other baselines. + +
+
+
+
+
+ + ☆ Using Deep Autoregressive Models as Causal Inference Engines + + +
+ Existing causal inference (CI) models are limited to primarily handling +low-dimensional confounders and singleton actions. We propose an autoregressive +(AR) CI framework capable of handling complex confounders and sequential +actions common in modern applications. We accomplish this by {\em +sequencification}, transforming data from an underlying causal diagram into a +sequence of tokens. This approach not only enables training with data generated +from any DAG but also extends existing CI capabilities to accommodate +estimating several statistical quantities using a {\em single} model. We can +directly predict interventional probabilities, simplifying inference and +enhancing outcome prediction accuracy. We demonstrate that an AR model adapted +for CI is efficient and effective in various complex applications such as +navigating mazes, playing chess endgames, and evaluating the impact of certain +keywords on paper acceptance rates. + +
+
+
+
+
+ + ☆ An Enhanced Federated Prototype Learning Method under Domain Shift + + +
+ Federated Learning (FL) allows collaborative machine learning training +without sharing private data. Numerous studies have shown that one significant +factor affecting the performance of federated learning models is the +heterogeneity of data across different clients, especially when the data is +sampled from various domains. A recent paper introduces variance-aware +dual-level prototype clustering and uses a novel $\alpha$-sparsity prototype +loss, which increases intra-class similarity and reduces inter-class +similarity. To ensure that the features converge within specific clusters, we +introduce an improved algorithm, Federated Prototype Learning with Convergent +Clusters, abbreviated as FedPLCC. To increase inter-class distances, we weight +each prototype with the size of the cluster it represents. To reduce +intra-class distances, considering that prototypes with larger distances might +come from different domains, we select only a certain proportion of prototypes +for the loss function calculation. Evaluations on the Digit-5, Office-10, and +DomainNet datasets show that our method performs better than existing +approaches. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Climate Adaptation with Reinforcement Learning: Experiments with + Flooding and Transportation in Copenhagen + + +
+ Due to climate change the frequency and intensity of extreme rainfall events, +which contribute to urban flooding, are expected to increase in many places. +These floods can damage transport infrastructure and disrupt mobility, +highlighting the need for cities to adapt to escalating risks. Reinforcement +learning (RL) serves as a powerful tool for uncovering optimal adaptation +strategies, determining how and where to deploy adaptation measures +effectively, even under significant uncertainty. In this study, we leverage RL +to identify the most effective timing and locations for implementing measures, +aiming to reduce both direct and indirect impacts of flooding. Our framework +integrates climate change projections of future rainfall events and floods, +models city-wide motorized trips, and quantifies direct and indirect impacts on +infrastructure and mobility. Preliminary results suggest that our RL-based +approach can significantly enhance decision-making by prioritizing +interventions in specific urban areas and identifying the optimal periods for +their implementation. + +
+
+
+
+
+ + ☆ Towards an active-learning approach to resource allocation for + population-based damage prognosis + + +
+ Damage prognosis is, arguably, one of the most difficult tasks of structural +health monitoring (SHM). To address common problems of damage prognosis, a +population-based SHM (PBSHM) approach is adopted in the current work. In this +approach the prognosis problem is considered as an information-sharing problem +where data from past structures are exploited to make more accurate inferences +regarding currently-degrading structures. For a given population, there may +exist restrictions on the resources available to conduct monitoring; thus, the +current work studies the problem of allocating such resources within a +population of degrading structures with a view to maximising the +damage-prognosis accuracy. The challenges of the current framework are mainly +associated with the inference of outliers on the level of damage evolution, +given partial data from the damage-evolution phenomenon. The current approach +considers an initial population of structures for which damage evolution is +extensively observed. Subsequently, a second population of structures with +evolving damage is considered for which two monitoring systems are available, a +low-availability and high-fidelity (low-uncertainty) one, and a +widely-available and low-fidelity (high-uncertainty) one. The task of the +current work is to follow an active-learning approach to identify the +structures to which the high-fidelity system should be assigned in order to +enhance the predictive capabilities of the machine-learning model throughout +the population. + +
+
+
+
+
+ + ☆ Experimental Evaluation of Machine Learning Models for Goal-oriented + Customer Service Chatbot with Pipeline Architecture + + +
+ Integrating machine learning (ML) into customer service chatbots enhances +their ability to understand and respond to user queries, ultimately improving +service performance. However, they may appear artificial to some users and +affecting customer experience. Hence, meticulous evaluation of ML models for +each pipeline component is crucial for optimizing performance, though +differences in functionalities can lead to unfair comparisons. In this paper, +we present a tailored experimental evaluation approach for goal-oriented +customer service chatbots with pipeline architecture, focusing on three key +components: Natural Language Understanding (NLU), dialogue management (DM), and +Natural Language Generation (NLG). Our methodology emphasizes individual +assessment to determine optimal ML models. Specifically, we focus on optimizing +hyperparameters and evaluating candidate models for NLU (utilizing BERT and +LSTM), DM (employing DQN and DDQN), and NLG (leveraging GPT-2 and DialoGPT). +The results show that for the NLU component, BERT excelled in intent detection +whereas LSTM was superior for slot filling. For the DM component, the DDQN +model outperformed DQN by achieving fewer turns, higher rewards, as well as +greater success rates. For NLG, the large language model GPT-2 surpassed +DialoGPT in BLEU, METEOR, and ROUGE metrics. These findings aim to provide a +benchmark for future research in developing and optimizing customer service +chatbots, offering valuable insights into model performance and optimal +hyperparameters. + +
+
+
+
+
+ + ☆ Optimizing DNN Inference on Multi-Accelerator SoCs at Training-time + + +
+ The demand for executing Deep Neural Networks (DNNs) with low latency and +minimal power consumption at the edge has led to the development of advanced +heterogeneous Systems-on-Chips (SoCs) that incorporate multiple specialized +computing units (CUs), such as accelerators. Offloading DNN computations to a +specific CU from the available set often exposes accuracy vs efficiency +trade-offs, due to differences in their supported operations (e.g., standard +vs. depthwise convolution) or data representations (e.g., more/less +aggressively quantized). A challenging yet unresolved issue is how to map a DNN +onto these multi-CU systems to maximally exploit the parallelization +possibilities while taking accuracy into account. To address this problem, we +present ODiMO, a hardware-aware tool that efficiently explores fine-grain +mapping of DNNs among various on-chip CUs, during the training phase. ODiMO +strategically splits individual layers of the neural network and executes them +in parallel on the multiple available CUs, aiming to balance the total +inference energy consumption or latency with the resulting accuracy, impacted +by the unique features of the different hardware units. We test our approach on +CIFAR-10, CIFAR-100, and ImageNet, targeting two open-source heterogeneous +SoCs, i.e., DIANA and Darkside. We obtain a rich collection of Pareto-optimal +networks in the accuracy vs. energy or latency space. We show that ODiMO +reduces the latency of a DNN executed on the Darkside SoC by up to 8x at +iso-accuracy, compared to manual heuristic mappings. When targeting energy, on +the same SoC, ODiMO produced up to 50.8x more efficient mappings, with minimal +accuracy drop (< 0.3%). + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ CodeSCAN: ScreenCast ANalysis for Video Programming Tutorials + + +
+ Programming tutorials in the form of coding screencasts play a crucial role +in programming education, serving both novices and experienced developers. +However, the video format of these tutorials presents a challenge due to the +difficulty of searching for and within videos. Addressing the absence of +large-scale and diverse datasets for screencast analysis, we introduce the +CodeSCAN dataset. It comprises 12,000 screenshots captured from the Visual +Studio Code environment during development, featuring 24 programming languages, +25 fonts, and over 90 distinct themes, in addition to diverse layout changes +and realistic user interactions. Moreover, we conduct detailed quantitative and +qualitative evaluations to benchmark the performance of Integrated Development +Environment (IDE) element detection, color-to-black-and-white conversion, and +Optical Character Recognition (OCR). We hope that our contributions facilitate +more research in coding screencast analysis, and we make the source code for +creating the dataset and the benchmark publicly available on this website. + +
+
+
+
+
+ + ☆ Efficient Noise Mitigation for Enhancing Inference Accuracy in DNNs on + Mixed-Signal Accelerators + + +
+ In this paper, we propose a framework to enhance the robustness of the neural +models by mitigating the effects of process-induced and aging-related +variations of analog computing components on the accuracy of the analog neural +networks. We model these variations as the noise affecting the precision of the +activations and introduce a denoising block inserted between selected layers of +a pre-trained model. We demonstrate that training the denoising block +significantly increases the model's robustness against various noise levels. To +minimize the overhead associated with adding these blocks, we present an +exploration algorithm to identify optimal insertion points for the denoising +blocks. Additionally, we propose a specialized architecture to efficiently +execute the denoising blocks, which can be integrated into mixed-signal +accelerators. We evaluate the effectiveness of our approach using Deep Neural +Network (DNN) models trained on the ImageNet and CIFAR-10 datasets. The results +show that on average, by accepting 2.03% parameter count overhead, the accuracy +drop due to the variations reduces from 31.7% to 1.15%. + +
+
+
+
+
+ + ☆ Wasserstein Distance-Weighted Adversarial Network for Cross-Domain + Credit Risk Assessment + + +
+ This paper delves into the application of adversarial domain adaptation (ADA) +for enhancing credit risk assessment in financial institutions. It addresses +two critical challenges: the cold start problem, where historical lending data +is scarce, and the data imbalance issue, where high-risk transactions are +underrepresented. The paper introduces an improved ADA framework, the +Wasserstein Distance Weighted Adversarial Domain Adaptation Network (WD-WADA), +which leverages the Wasserstein distance to align source and target domains +effectively. The proposed method includes an innovative weighted strategy to +tackle data imbalance, adjusting for both the class distribution and the +difficulty level of predictions. The paper demonstrates that WD-WADA not only +mitigates the cold start problem but also provides a more accurate measure of +domain differences, leading to improved cross-domain credit risk assessment. +Extensive experiments on real-world credit datasets validate the model's +effectiveness, showcasing superior performance in cross-domain learning, +classification accuracy, and model stability compared to traditional methods. + +
+
+
+
+
+ + ☆ Robustness of AI-based weather forecasts in a changing climate + + +
+ Data-driven machine learning models for weather forecasting have made +transformational progress in the last 1-2 years, with state-of-the-art ones now +outperforming the best physics-based models for a wide range of skill scores. +Given the strong links between weather and climate modelling, this raises the +question whether machine learning models could also revolutionize climate +science, for example by informing mitigation and adaptation to climate change +or to generate larger ensembles for more robust uncertainty estimates. Here, we +show that current state-of-the-art machine learning models trained for weather +forecasting in present-day climate produce skillful forecasts across different +climate states corresponding to pre-industrial, present-day, and future 2.9K +warmer climates. This indicates that the dynamics shaping the weather on short +timescales may not differ fundamentally in a changing climate. It also +demonstrates out-of-distribution generalization capabilities of the machine +learning models that are a critical prerequisite for climate applications. +Nonetheless, two of the models show a global-mean cold bias in the forecasts +for the future warmer climate state, i.e. they drift towards the colder +present-day climate they have been trained for. A similar result is obtained +for the pre-industrial case where two out of three models show a warming. We +discuss possible remedies for these biases and analyze their spatial +distribution, revealing complex warming and cooling patterns that are partly +related to missing ocean-sea ice and land surface information in the training +data. Despite these current limitations, our results suggest that data-driven +machine learning models will provide powerful tools for climate science and +transform established approaches by complementing conventional physics-based +models. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ Token Caching for Diffusion Transformer Acceleration + + +
+ Diffusion transformers have gained substantial interest in diffusion +generative modeling due to their outstanding performance. However, their high +computational cost, arising from the quadratic computational complexity of +attention mechanisms and multi-step inference, presents a significant +bottleneck. To address this challenge, we propose TokenCache, a novel +post-training acceleration method that leverages the token-based multi-block +architecture of transformers to reduce redundant computations among tokens +across inference steps. TokenCache specifically addresses three critical +questions in the context of diffusion transformers: (1) which tokens should be +pruned to eliminate redundancy, (2) which blocks should be targeted for +efficient pruning, and (3) at which time steps caching should be applied to +balance speed and quality. In response to these challenges, TokenCache +introduces a Cache Predictor that assigns importance scores to tokens, enabling +selective pruning without compromising model performance. Furthermore, we +propose an adaptive block selection strategy to focus on blocks with minimal +impact on the network's output, along with a Two-Phase Round-Robin (TPRR) +scheduling policy to optimize caching intervals throughout the denoising +process. Experimental results across various models demonstrate that TokenCache +achieves an effective trade-off between generation quality and inference speed +for diffusion transformers. Our code will be publicly available. + +
+
+
+
+
+ + ☆ Med-IC: Fusing a Single Layer Involution with Convolutions for Enhanced + Medical Image Classification and Segmentation + + +
+ The majority of medical images, especially those that resemble cells, have +similar characteristics. These images, which occur in a variety of shapes, +often show abnormalities in the organ or cell region. The convolution operation +possesses a restricted capability to extract visual patterns across several +spatial regions of an image. The involution process, which is the inverse +operation of convolution, complements this inherent lack of spatial information +extraction present in convolutions. In this study, we investigate how applying +a single layer of involution prior to a convolutional neural network (CNN) +architecture can significantly improve classification and segmentation +performance, with a comparatively negligible amount of weight parameters. The +study additionally shows how excessive use of involution layers might result in +inaccurate predictions in a particular type of medical image. According to our +findings from experiments, the strategy of adding only a single involution +layer before a CNN-based model outperforms most of the previous works. + +
+
+ comment: 13 pages, 5 figures, 4 tables, preprint submitted to an Elsevier + journal +
+
+
+
+
+ + ☆ WHOMP: Optimizing Randomized Controlled Trials via Wasserstein + Homogeneity + + +
+ We investigate methods for partitioning datasets into subgroups that maximize +diversity within each subgroup while minimizing dissimilarity across subgroups. +We introduce a novel partitioning method called the $\textit{Wasserstein +Homogeneity Partition}$ (WHOMP), which optimally minimizes type I and type II +errors that often result from imbalanced group splitting or partitioning, +commonly referred to as accidental bias, in comparative and controlled trials. +We conduct an analytical comparison of WHOMP against existing partitioning +methods, such as random subsampling, covariate-adaptive randomization, +rerandomization, and anti-clustering, demonstrating its advantages. Moreover, +we characterize the optimal solutions to the WHOMP problem and reveal an +inherent trade-off between the stability of subgroup means and variances among +these solutions. Based on our theoretical insights, we design algorithms that +not only obtain these optimal solutions but also equip practitioners with tools +to select the desired trade-off. Finally, we validate the effectiveness of +WHOMP through numerical experiments, highlighting its superiority over +traditional methods. + +
+
+ comment: 46 pages, 3 figures +
+
+
+
+
+ + ☆ Fairness-aware Multiobjective Evolutionary Learning + + +
+ Multiobjective evolutionary learning (MOEL) has demonstrated its advantages +of training fairer machine learning models considering a predefined set of +conflicting objectives, including accuracy and different fairness measures. +Recent works propose to construct a representative subset of fairness measures +as optimisation objectives of MOEL throughout model training. However, the +determination of a representative measure set relies on dataset, prior +knowledge and requires substantial computational costs. What's more, those +representative measures may differ across different model training processes. +Instead of using a static predefined set determined before model training, this +paper proposes to dynamically and adaptively determine a representative measure +set online during model training. The dynamically determined representative set +is then used as optimising objectives of the MOEL framework and can vary with +time. Extensive experimental results on 12 well-known benchmark datasets +demonstrate that our proposed framework achieves outstanding performance +compared to state-of-the-art approaches for mitigating unfairness in terms of +accuracy as well as 25 fairness measures although only a few of them were +dynamically selected and used as optimisation objectives. The results indicate +the importance of setting optimisation objectives dynamically during training. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Treating Brain-inspired Memories as Priors for Diffusion Model to + Forecast Multivariate Time Series + + +
+ Forecasting Multivariate Time Series (MTS) involves significant challenges in +various application domains. One immediate challenge is modeling temporal +patterns with the finite length of the input. These temporal patterns usually +involve periodic and sudden events that recur across different channels. To +better capture temporal patterns, we get inspiration from humans' memory +mechanisms and propose a channel-shared, brain-inspired memory module for MTS. +Specifically, brain-inspired memory comprises semantic and episodic memory, +where the former is used to capture general patterns, such as periodic events, +and the latter is employed to capture special patterns, such as sudden events, +respectively. Meanwhile, we design corresponding recall and update mechanisms +to better utilize these patterns. Furthermore, acknowledging the capacity of +diffusion models to leverage memory as a prior, we present a brain-inspired +memory-augmented diffusion model. This innovative model retrieves relevant +memories for different channels, utilizing them as distinct priors for MTS +predictions. This incorporation significantly enhances the accuracy and +robustness of predictions. Experimental results on eight datasets consistently +validate the superiority of our approach in capturing and leveraging diverse +recurrent temporal patterns across different channels. + +
+
+
+
+
+ + ☆ HSTFL: A Heterogeneous Federated Learning Framework for Misaligned + Spatiotemporal Forecasting + + +
+ Spatiotemporal forecasting has emerged as an indispensable building block of +diverse smart city applications, such as intelligent transportation and smart +energy management. Recent advancements have uncovered that the performance of +spatiotemporal forecasting can be significantly improved by integrating +knowledge in geo-distributed time series data from different domains, \eg +enhancing real-estate appraisal with human mobility data; joint taxi and bike +demand predictions. While effective, existing approaches assume a centralized +data collection and exploitation environment, overlooking the privacy and +commercial interest concerns associated with data owned by different parties. +In this paper, we investigate multi-party collaborative spatiotemporal +forecasting without direct access to multi-source private data. However, this +task is challenging due to 1) cross-domain feature heterogeneity and 2) +cross-client geographical heterogeneity, where standard horizontal or vertical +federated learning is inapplicable. To this end, we propose a Heterogeneous +SpatioTemporal Federated Learning (HSTFL) framework to enable multiple clients +to collaboratively harness geo-distributed time series data from different +domains while preserving privacy. Specifically, we first devise vertical +federated spatiotemporal representation learning to locally preserve +spatiotemporal dependencies among individual participants and generate +effective representations for heterogeneous data. Then we propose a +cross-client virtual node alignment block to incorporate cross-client +spatiotemporal dependencies via a multi-level knowledge fusion scheme. +Extensive privacy analysis and experimental evaluations demonstrate that HSTFL +not only effectively resists inference attacks but also provides a significant +improvement against various baselines. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Deep Heterogeneous Contrastive Hyper-Graph Learning for In-the-Wild + Context-Aware Human Activity Recognition + + +
+ Human Activity Recognition (HAR) is a challenging, multi-label classification +problem as activities may co-occur and sensor signals corresponding to the same +activity may vary in different contexts (e.g., different device placements). +This paper proposes a Deep Heterogeneous Contrastive Hyper-Graph Learning +(DHC-HGL) framework that captures heterogenous Context-Aware HAR (CA-HAR) +hypergraph properties in a message-passing and neighborhood-aggregation +fashion. Prior work only explored homogeneous or shallow-node-heterogeneous +graphs. DHC-HGL handles heterogeneous CA-HAR data by innovatively 1) +Constructing three different types of sub-hypergraphs that are each passed +through different custom HyperGraph Convolution (HGC) layers designed to handle +edge-heterogeneity and 2) Adopting a contrastive loss function to ensure +node-heterogeneity. In rigorous evaluation on two CA-HAR datasets, DHC-HGL +significantly outperformed state-of-the-art baselines by 5.8% to 16.7% on +Matthews Correlation Coefficient (MCC) and 3.0% to 8.4% on Macro F1 scores. +UMAP visualizations of learned CA-HAR node embeddings are also presented to +enhance model explainability. + +
+
+ comment: IMWUT 2023 +
+
+
+
+
+ + ☆ CycleNet: Enhancing Time Series Forecasting through Modeling Periodic + Patterns + + +
+ The stable periodic patterns present in time series data serve as the +foundation for conducting long-horizon forecasts. In this paper, we pioneer the +exploration of explicitly modeling this periodicity to enhance the performance +of models in long-term time series forecasting (LTSF) tasks. Specifically, we +introduce the Residual Cycle Forecasting (RCF) technique, which utilizes +learnable recurrent cycles to model the inherent periodic patterns within +sequences, and then performs predictions on the residual components of the +modeled cycles. Combining RCF with a Linear layer or a shallow MLP forms the +simple yet powerful method proposed in this paper, called CycleNet. CycleNet +achieves state-of-the-art prediction accuracy in multiple domains including +electricity, weather, and energy, while offering significant efficiency +advantages by reducing over 90% of the required parameter quantity. +Furthermore, as a novel plug-and-play technique, the RCF can also significantly +improve the prediction accuracy of existing models, including PatchTST and +iTransformer. The source code is available at: +https://github.com/ACAT-SCUT/CycleNet. + +
+
+
+
+
+ + ☆ URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological + and Multilingual Knowledge Base + + +
+ URIEL is a knowledge base offering geographical, phylogenetic, and +typological vector representations for 7970 languages. It includes distance +measures between these vectors for 4005 languages, which are accessible via the +lang2vec tool. Despite being frequently cited, URIEL is limited in terms of +linguistic inclusion and overall usability. To tackle these challenges, we +introduce URIEL+, an enhanced version of URIEL and lang2vec addressing these +limitations. In addition to expanding typological feature coverage for 2898 +languages, URIEL+ improves user experience with robust, customizable distance +calculations to better suit the needs of the users. These upgrades also offer +competitive performance on downstream tasks and provide distances that better +align with linguistic distance studies. + +
+
+
+
+
+ + ☆ Fairness without Sensitive Attributes via Knowledge Sharing + + +
+ While model fairness improvement has been explored previously, existing +methods invariably rely on adjusting explicit sensitive attribute values in +order to improve model fairness in downstream tasks. However, we observe a +trend in which sensitive demographic information becomes inaccessible as public +concerns around data privacy grow. In this paper, we propose a confidence-based +hierarchical classifier structure called "Reckoner" for reliable fair model +learning under the assumption of missing sensitive attributes. We first present +results showing that if the dataset contains biased labels or other hidden +biases, classifiers significantly increase the bias gap across different +demographic groups in the subset with higher prediction confidence. Inspired by +these findings, we devised a dual-model system in which a version of the model +initialised with a high-confidence data subset learns from a version of the +model initialised with a low-confidence data subset, enabling it to avoid +biased predictions. Our experimental results show that Reckoner consistently +outperforms state-of-the-art baselines in COMPAS dataset and New Adult dataset, +considering both accuracy and fairness metrics. + +
+
+
+
+
+ + ☆ A TextGCN-Based Decoding Approach for Improving Remote Sensing Image + Captioning + + +
+ Remote sensing images are highly valued for their ability to address complex +real-world issues such as risk management, security, and meteorology. However, +manually captioning these images is challenging and requires specialized +knowledge across various domains. This letter presents an approach for +automatically describing (captioning) remote sensing images. We propose a novel +encoder-decoder setup that deploys a Text Graph Convolutional Network (TextGCN) +and multi-layer LSTMs. The embeddings generated by TextGCN enhance the +decoder's understanding by capturing the semantic relationships among words at +both the sentence and corpus levels. Furthermore, we advance our approach with +a comparison-based beam search method to ensure fairness in the search strategy +for generating the final caption. We present an extensive evaluation of our +approach against various other state-of-the-art encoder-decoder frameworks. We +evaluated our method across three datasets using seven metrics: BLEU-1 to +BLEU-4, METEOR, ROUGE-L, and CIDEr. The results demonstrate that our approach +significantly outperforms other state-of-the-art encoder-decoder methods. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Latent Representation Learning for Multimodal Brain Activity Translation + + +
+ Neuroscience employs diverse neuroimaging techniques, each offering distinct +insights into brain activity, from electrophysiological recordings such as EEG, +which have high temporal resolution, to hemodynamic modalities such as fMRI, +which have increased spatial precision. However, integrating these +heterogeneous data sources remains a challenge, which limits a comprehensive +understanding of brain function. We present the Spatiotemporal Alignment of +Multimodal Brain Activity (SAMBA) framework, which bridges the spatial and +temporal resolution gaps across modalities by learning a unified latent space +free of modality-specific biases. SAMBA introduces a novel attention-based +wavelet decomposition for spectral filtering of electrophysiological +recordings, graph attention networks to model functional connectivity between +functional brain units, and recurrent layers to capture temporal +autocorrelations in brain signal. We show that the training of SAMBA, aside +from achieving translation, also learns a rich representation of brain +information processing. We showcase this classify external stimuli driving +brain activity from the representation learned in hidden layers of SAMBA, +paving the way for broad downstream applications in neuroscience research and +clinical contexts. + +
+
+
+
+
+ + ☆ Towards Diverse Device Heterogeneous Federated Learning via Task + Arithmetic Knowledge Integration NeurIPS 2024 + + +
+ Federated Learning has emerged as a promising paradigm for collaborative +machine learning, while preserving user data privacy. Despite its potential, +standard FL lacks support for diverse heterogeneous device prototypes, which +vary significantly in model and dataset sizes -- from small IoT devices to +large workstations. This limitation is only partially addressed by existing +knowledge distillation techniques, which often fail to transfer knowledge +effectively across a broad spectrum of device prototypes with varied +capabilities. This failure primarily stems from two issues: the dilution of +informative logits from more capable devices by those from less capable ones, +and the use of a single integrated logits as the distillation target across all +devices, which neglects their individual learning capacities and and the unique +contributions of each. To address these challenges, we introduce TAKFL, a novel +KD-based framework that treats the knowledge transfer from each device +prototype's ensemble as a separate task, independently distilling each to +preserve its unique contributions and avoid dilution. TAKFL also incorporates a +KD-based self-regularization technique to mitigate the issues related to the +noisy and unsupervised ensemble distillation process. To integrate the +separately distilled knowledge, we introduce an adaptive task arithmetic +knowledge integration process, allowing each student model to customize the +knowledge integration for optimal performance. Additionally, we present +theoretical results demonstrating the effectiveness of task arithmetic in +transferring knowledge across heterogeneous devices with varying capacities. +Comprehensive evaluations of our method across both CV and NLP tasks +demonstrate that TAKFL achieves SOTA results in a variety of datasets and +settings, significantly outperforming existing KD-based methods. Code is +released at https://github.com/MMorafah/TAKFL + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Review of Digital Asset Development with Graph Neural Network Unlearning + + +
+ In the rapidly evolving landscape of digital assets, the imperative for +robust data privacy and compliance with regulatory frameworks has intensified. +This paper investigates the critical role of Graph Neural Networks (GNNs) in +the management of digital assets and introduces innovative unlearning +techniques specifically tailored to GNN architectures. We categorize unlearning +strategies into two primary classes: data-driven approximation, which +manipulates the graph structure to isolate and remove the influence of specific +nodes, and model-driven approximation, which modifies the internal parameters +and architecture of the GNN itself. By examining recent advancements in these +unlearning methodologies, we highlight their applicability in various use +cases, including fraud detection, risk assessment, token relationship +prediction, and decentralized governance. We discuss the challenges inherent in +balancing model performance with the requirements for data unlearning, +particularly in the context of real-time financial applications. Furthermore, +we propose a hybrid approach that combines the strengths of both unlearning +strategies to enhance the efficiency and effectiveness of GNNs in digital asset +ecosystems. Ultimately, this paper aims to provide a comprehensive framework +for understanding and implementing GNN unlearning techniques, paving the way +for secure and compliant deployment of machine learning in the digital asset +domain. + +
+
+
+
+
+ + ☆ Hierarchical Federated Learning with Multi-Timescale Gradient Correction NeurIPS 2024 + + +
+ While traditional federated learning (FL) typically focuses on a star +topology where clients are directly connected to a central server, real-world +distributed systems often exhibit hierarchical architectures. Hierarchical FL +(HFL) has emerged as a promising solution to bridge this gap, leveraging +aggregation points at multiple levels of the system. However, existing +algorithms for HFL encounter challenges in dealing with multi-timescale model +drift, i.e., model drift occurring across hierarchical levels of data +heterogeneity. In this paper, we propose a multi-timescale gradient correction +(MTGC) methodology to resolve this issue. Our key idea is to introduce distinct +control variables to (i) correct the client gradient towards the group +gradient, i.e., to reduce client model drift caused by local updates based on +individual datasets, and (ii) correct the group gradient towards the global +gradient, i.e., to reduce group model drift caused by FL over clients within +the group. We analytically characterize the convergence behavior of MTGC under +general non-convex settings, overcoming challenges associated with couplings +between correction terms. We show that our convergence bound is immune to the +extent of data heterogeneity, confirming the stability of the proposed +algorithm against multi-level non-i.i.d. data. Through extensive experiments on +various datasets and models, we validate the effectiveness of MTGC in diverse +HFL settings. The code for this project is available at +\href{https://github.com/wenzhifang/MTGC}{https://github.com/wenzhifang/MTGC}. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Gradient-free Decoder Inversion in Latent Diffusion Models NeurIPS 2024 + + +
+ In latent diffusion models (LDMs), denoising diffusion process efficiently +takes place on latent space whose dimension is lower than that of pixel space. +Decoder is typically used to transform the representation in latent space to +that in pixel space. While a decoder is assumed to have an encoder as an +accurate inverse, exact encoder-decoder pair rarely exists in practice even +though applications often require precise inversion of decoder. Prior works for +decoder inversion in LDMs employed gradient descent inspired by inversions of +generative adversarial networks. However, gradient-based methods require larger +GPU memory and longer computation time for larger latent space. For example, +recent video LDMs can generate more than 16 frames, but GPUs with 24 GB memory +can only perform gradient-based decoder inversion for 4 frames. Here, we +propose an efficient gradient-free decoder inversion for LDMs, which can be +applied to diverse latent models. Theoretical convergence property of our +proposed inversion has been investigated not only for the forward step method, +but also for the inertial Krasnoselskii-Mann (KM) iterations under mild +assumption on cocoercivity that is satisfied by recent LDMs. Our proposed +gradient-free method with Adam optimizer and learning rate scheduling +significantly reduced computation time and memory usage over prior +gradient-based methods and enabled efficient computation in applications such +as noise-space watermarking while achieving comparable error levels. + +
+
+ comment: 19 pages, Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ State-free Reinforcement Learning + + +
+ In this work, we study the \textit{state-free RL} problem, where the +algorithm does not have the states information before interacting with the +environment. Specifically, denote the reachable state set by ${S}^\Pi := \{ +s|\max_{\pi\in \Pi}q^{P, \pi}(s)>0 \}$, we design an algorithm which requires +no information on the state space $S$ while having a regret that is completely +independent of ${S}$ and only depend on ${S}^\Pi$. We view this as a concrete +first step towards \textit{parameter-free RL}, with the goal of designing RL +algorithms that require no hyper-parameter tuning. + +
+
+
+
+
+ + ☆ Multi-agent Reinforcement Learning for Dynamic Dispatching in Material + Handling Systems + + +
+ This paper proposes a multi-agent reinforcement learning (MARL) approach to +learn dynamic dispatching strategies, which is crucial for optimizing +throughput in material handling systems across diverse industries. To benchmark +our method, we developed a material handling environment that reflects the +complexities of an actual system, such as various activities at different +locations, physical constraints, and inherent uncertainties. To enhance +exploration during learning, we propose a method to integrate domain knowledge +in the form of existing dynamic dispatching heuristics. Our experimental +results show that our method can outperform heuristics by up to 7.4 percent in +terms of median throughput. Additionally, we analyze the effect of different +architectures on MARL performance when training multiple agents with different +functions. We also demonstrate that the MARL agents performance can be further +improved by using the first iteration of MARL agents as heuristics to train a +second iteration of MARL agents. This work demonstrates the potential of +applying MARL to learn effective dynamic dispatching strategies that may be +deployed in real-world systems to improve business outcomes. + +
+
+
+
+
+ + ☆ Easy2Hard-Bench: Standardized Difficulty Labels for Profiling LLM + Performance and Generalization NeurIPS 2024 + + +
+ While generalization over tasks from easy to hard is crucial to profile +language models (LLMs), the datasets with fine-grained difficulty annotations +for each problem across a broad range of complexity are still blank. Aiming to +address this limitation, we present Easy2Hard-Bench, a consistently formatted +collection of 6 benchmark datasets spanning various domains, such as +mathematics and programming problems, chess puzzles, and reasoning questions. +Each problem within these datasets is annotated with numerical difficulty +scores. To systematically estimate problem difficulties, we collect abundant +performance data on attempts to each problem by humans in the real world or +LLMs on the prominent leaderboard. Leveraging the rich performance data, we +apply well-established difficulty ranking systems, such as Item Response Theory +(IRT) and Glicko-2 models, to uniformly assign numerical difficulty scores to +problems. Moreover, datasets in Easy2Hard-Bench distinguish themselves from +previous collections by a higher proportion of challenging problems. Through +extensive experiments with six state-of-the-art LLMs, we provide a +comprehensive analysis of their performance and generalization capabilities +across varying levels of difficulty, with the aim of inspiring future research +in LLM generalization. The datasets are available at +https://huggingface.co/datasets/furonghuang-lab/Easy2Hard-Bench. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Neural Collaborative Filtering to Detect Anomalies in Human Semantic + Trajectories + + +
+ Human trajectory anomaly detection has become increasingly important across a +wide range of applications, including security surveillance and public health. +However, existing trajectory anomaly detection methods are primarily focused on +vehicle-level traffic, while human-level trajectory anomaly detection remains +under-explored. Since human trajectory data is often very sparse, machine +learning methods have become the preferred approach for identifying complex +patterns. However, concerns regarding potential biases and the robustness of +these models have intensified the demand for more transparent and explainable +alternatives. In response to these challenges, our research focuses on +developing a lightweight anomaly detection model specifically designed to +detect anomalies in human trajectories. We propose a Neural Collaborative +Filtering approach to model and predict normal mobility. Our method is designed +to model users' daily patterns of life without requiring prior knowledge, +thereby enhancing performance in scenarios where data is sparse or incomplete, +such as in cold start situations. Our algorithm consists of two main modules. +The first is the collaborative filtering module, which applies collaborative +filtering to model normal mobility of individual humans to places of interest. +The second is the neural module, responsible for interpreting the complex +spatio-temporal relationships inherent in human trajectory data. To validate +our approach, we conducted extensive experiments using simulated and real-world +datasets comparing to numerous state-of-the-art trajectory anomaly detection +approaches. + +
+
+ comment: Accepted for publication in the 1st ACM SIGSPATIAL International + Workshop on Geospatial Anomaly Detection (GeoAnomalies'24) +
+
+
+
+
+ + ☆ Dual Cone Gradient Descent for Training Physics-Informed Neural Networks + + +
+ Physics-informed neural networks (PINNs) have emerged as a prominent approach +for solving partial differential equations (PDEs) by minimizing a combined loss +function that incorporates both boundary loss and PDE residual loss. Despite +their remarkable empirical performance in various scientific computing tasks, +PINNs often fail to generate reasonable solutions, and such pathological +behaviors remain difficult to explain and resolve. In this paper, we identify +that PINNs can be adversely trained when gradients of each loss function +exhibit a significant imbalance in their magnitudes and present a negative +inner product value. To address these issues, we propose a novel optimization +framework, Dual Cone Gradient Descent (DCGD), which adjusts the direction of +the updated gradient to ensure it falls within a dual cone region. This region +is defined as a set of vectors where the inner products with both the gradients +of the PDE residual loss and the boundary loss are non-negative. Theoretically, +we analyze the convergence properties of DCGD algorithms in a non-convex +setting. On a variety of benchmark equations, we demonstrate that DCGD +outperforms other optimization algorithms in terms of various evaluation +metrics. In particular, DCGD achieves superior predictive accuracy and enhances +the stability of training for failure modes of PINNs and complex PDEs, compared +to existing optimally tuned models. Moreover, DCGD can be further improved by +combining it with popular strategies for PINNs, including learning rate +annealing and the Neural Tangent Kernel (NTK). + +
+
+
+
+
+ + ☆ A physics-driven sensor placement optimization methodology for + temperature field reconstruction + + +
+ Perceiving the global field from sparse sensors has been a grand challenge in +the monitoring, analysis, and design of physical systems. In this context, +sensor placement optimization is a crucial issue. Most existing works require +large and sufficient data to construct data-based criteria, which are +intractable in data-free scenarios without numerical and experimental data. To +this end, we propose a novel physics-driven sensor placement optimization +(PSPO) method for temperature field reconstruction using a physics-based +criterion to optimize sensor locations. In our methodological framework, we +firstly derive the theoretical upper and lower bounds of the reconstruction +error under noise scenarios by analyzing the optimal solution, proving that +error bounds correlate with the condition number determined by sensor +locations. Furthermore, the condition number, as the physics-based criterion, +is used to optimize sensor locations by the genetic algorithm. Finally, the +best sensors are validated by reconstruction models, including non-invasive +end-to-end models, non-invasive reduced-order models, and physics-informed +models. Experimental results, both on a numerical and an application case, +demonstrate that the PSPO method significantly outperforms random and uniform +selection methods, improving the reconstruction accuracy by nearly an order of +magnitude. Moreover, the PSPO method can achieve comparable reconstruction +accuracy to the existing data-driven placement optimization methods. + +
+
+
+
+
+ + ☆ Robust Network Learning via Inverse Scale Variational Sparsification + + +
+ While neural networks have made significant strides in many AI tasks, they +remain vulnerable to a range of noise types, including natural corruptions, +adversarial noise, and low-resolution artifacts. Many existing approaches focus +on enhancing robustness against specific noise types, limiting their +adaptability to others. Previous studies have addressed general robustness by +adopting a spectral perspective, which tends to blur crucial features like +texture and object contours. Our proposed solution, however, introduces an +inverse scale variational sparsification framework within a time-continuous +inverse scale space formulation. This framework progressively learns +finer-scale features by discerning variational differences between pixels, +ultimately preserving only large-scale features in the smoothed image. Unlike +frequency-based methods, our approach not only removes noise by smoothing +small-scale features where corruptions often occur but also retains +high-contrast details such as textures and object contours. Moreover, our +framework offers simplicity and efficiency in implementation. By integrating +this algorithm into neural network training, we guide the model to prioritize +learning large-scale features. We show the efficacy of our approach through +enhanced robustness against various noise types. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ A3: Active Adversarial Alignment for Source-Free Domain Adaptation ICML + + +
+ Unsupervised domain adaptation (UDA) aims to transfer knowledge from a +labeled source domain to an unlabeled target domain. Recent works have focused +on source-free UDA, where only target data is available. This is challenging as +models rely on noisy pseudo-labels and struggle with distribution shifts. We +propose Active Adversarial Alignment (A3), a novel framework combining +self-supervised learning, adversarial training, and active learning for robust +source-free UDA. A3 actively samples informative and diverse data using an +acquisition function for training. It adapts models via adversarial losses and +consistency regularization, aligning distributions without source data access. +A3 advances source-free UDA through its synergistic integration of active and +adversarial learning for effective domain alignment and noise reduction. + +
+
+ comment: Accepted at ICMLA 2024 +
+
+
+
+
+ + ☆ VickreyFeedback: Cost-efficient Data Construction for Reinforcement + Learning from Human Feedback + + +
+ This paper addresses the cost-efficiency aspect of Reinforcement Learning +from Human Feedback (RLHF). RLHF leverages datasets of human preferences over +outputs of large language models (LLM) to instill human expectations into LLMs. +While preference annotation comes with a monetized cost, the economic utility +of a preference dataset has not been considered by far. What exacerbates this +situation is that given complex intransitive or cyclic relationships in +preference datasets, existing algorithms for fine-tuning LLMs are still far +from capturing comprehensive preferences. This raises severe cost-efficiency +concerns in production environments, where preference data accumulate over +time. In this paper, we see the fine-tuning of LLMs as a monetized economy and +introduce an auction mechanism to improve the efficiency of the preference data +collection in dollar terms. We show that introducing an auction mechanism can +play an essential role in enhancing the cost-efficiency of RLHF while +maintaining satisfactory model performance. Experimental results demonstrate +that our proposed auction-based protocol is cost-efficient for fine-tuning LLMs +by concentrating on high-quality feedback. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Hypergame Theory for Decentralized Resource Allocation in Multi-user + Semantic Communications + + +
+ Semantic communications (SC) is an emerging communication paradigm in which +wireless devices can send only relevant information from a source of data while +relying on computing resources to regenerate missing data points. However, the +design of a multi-user SC system becomes more challenging because of the +computing and communication overhead required for coordination. Existing +solutions for learning the semantic language and performing resource allocation +often fail to capture the computing and communication tradeoffs involved in +multiuser SC. To address this gap, a novel framework for decentralized +computing and communication resource allocation in multiuser SC systems is +proposed. The challenge of efficiently allocating communication and computing +resources (for reasoning) in a decentralized manner to maximize the quality of +task experience for the end users is addressed through the application of +Stackelberg hyper game theory. Leveraging the concept of second-level hyper +games, novel analytical formulations are developed to model misperceptions of +the users about each other's communication and control strategies. Further, +equilibrium analysis of the learned resource allocation protocols examines the +convergence of the computing and communication strategies to a local +Stackelberg equilibria, considering misperceptions. Simulation results show +that the proposed Stackelberg hyper game results in efficient usage of +communication and computing resources while maintaining a high quality of +experience for the users compared to state-of-the-art that does not account for +the misperceptions. + +
+
+
+
+
+ + ♻ ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50% of papers do not +assess performance variability at all. Moreover, only one (0.5%) paper reported +confidence intervals (CIs) for model performance. (2) To address the reporting +bottleneck, we show that the unreported standard deviation (SD) in segmentation +papers can be approximated by a second-order polynomial function of the mean +Dice similarity coefficient (DSC). Based on external validation data from 56 +previous MICCAI challenges, we demonstrate that this approximation can +accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ♻ ☆ Modulated Intervention Preference Optimization (MIPO): Keep the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ On Rademacher Complexity-based Generalization Bounds for Deep Learning + + +
+ We show that the Rademacher complexity-based approach can generate +non-vacuous generalisation bounds on Convolutional Neural Networks (CNNs) for +classifying a small number of classes of images. The development of new +Talagrand's contraction lemmas for high-dimensional mappings between function +spaces and CNNs for general Lipschitz activation functions is a key technical +contribution. Our results show that the Rademacher complexity does not depend +on the network length for CNNs with some special types of activation functions +such as ReLU, Leaky ReLU, Parametric Rectifier Linear Unit, Sigmoid, and Tanh. + +
+
+ comment: Extra experiments provided +
+
+
+
+
+ + ♻ ☆ Proprioception Is All You Need: Terrain Classification for Boreal + Forests IROS 2024 + + +
+ Recent works in field robotics highlighted the importance of resiliency +against different types of terrains. Boreal forests, in particular, are home to +many mobility-impeding terrains that should be considered for off-road +autonomous navigation. Also, being one of the largest land biomes on Earth, +boreal forests are an area where autonomous vehicles are expected to become +increasingly common. In this paper, we address this issue by introducing +BorealTC, a publicly available dataset for proprioceptive-based terrain +classification (TC). Recorded with a Husky A200, our dataset contains 116 min +of Inertial Measurement Unit (IMU), motor current, and wheel odometry data, +focusing on typical boreal forest terrains, notably snow, ice, and silty loam. +Combining our dataset with another dataset from the state-of-the-art, we +evaluate both a Convolutional Neural Network (CNN) and the novel state space +model (SSM)-based Mamba architecture on a TC task. Interestingly, we show that +while CNN outperforms Mamba on each separate dataset, Mamba achieves greater +accuracy when trained on a combination of both. In addition, we demonstrate +that Mamba's learning capacity is greater than a CNN for increasing amounts of +data. We show that the combination of two TC datasets yields a latent space +that can be interpreted with the properties of the terrains. We also discuss +the implications of merging datasets on classification. Our source code and +dataset are publicly available online: +https://github.com/norlab-ulaval/BorealTC. + +
+
+ comment: Accepted to the 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ Trained Without My Consent: Detecting Code Inclusion In Language Models + Trained on Code + + +
+ Code auditing ensures that the developed code adheres to standards, +regulations, and copyright protection by verifying that it does not contain +code from protected sources. The recent advent of Large Language Models (LLMs) +as coding assistants in the software development process poses new challenges +for code auditing. The dataset for training these models is mainly collected +from publicly available sources. This raises the issue of intellectual property +infringement as developers' codes are already included in the dataset. +Therefore, auditing code developed using LLMs is challenging, as it is +difficult to reliably assert if an LLM used during development has been trained +on specific copyrighted codes, given that we do not have access to the training +datasets of these models. Given the non-disclosure of the training datasets, +traditional approaches such as code clone detection are insufficient for +asserting copyright infringement. To address this challenge, we propose a new +approach, TraWiC; a model-agnostic and interpretable method based on membership +inference for detecting code inclusion in an LLM's training dataset. We extract +syntactic and semantic identifiers unique to each program to train a classifier +for detecting code inclusion. In our experiments, we observe that TraWiC is +capable of detecting 83.87% of codes that were used to train an LLM. In +comparison, the prevalent clone detection tool NiCad is only capable of +detecting 47.64%. In addition to its remarkable performance, TraWiC has low +resource overhead in contrast to pair-wise clone detection that is conducted +during the auditing process of tools like CodeWhisperer reference tracker, +across thousands of code snippets. + +
+
+ comment: Accepted for publication in TOSEM (ACM Transactions on Software + Engineering and Methodology) +
+
+
+
+
+ + ♻ ☆ M$^2$PT: Multimodal Prompt Tuning for Zero-shot Instruction Learning EMNLP 2024 + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable performance +across a wide range of domains, with increasing emphasis on enhancing their +zero-shot generalization capabilities for unseen tasks across various +modalities. Instruction tuning has emerged as an effective strategy for +achieving zero-shot generalization by finetuning pretrained models on diverse +multimodal tasks. As the scale of MLLMs continues to grow, parameter-efficient +finetuning becomes increasingly critical. However, most existing +parameter-efficient approaches focus only on single modalities and often +overlook the multimodal characteristics during finetuning. In this work, we +introduce a novel Multimodal Prompt Tuning (M$^2$PT) approach for efficient +instruction tuning of MLLMs. M$^2$PT effectively integrates visual and textual +prompts into the vision encoder and language processor respectively during +finetuning, facilitating the extraction and alignment of features across +modalities. Empirical results on various multimodal evaluation datasets +demonstrate the superior performance of our approach compared to several +state-of-the-art baselines. A comprehensive set of ablation studies validates +the effectiveness of our prompt design and the efficiency of our approach. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ LLM Detectors Still Fall Short of Real World: Case of LLM-Generated + Short News-Like Posts EMNLP + + +
+ With the emergence of widely available powerful LLMs, disinformation +generated by large Language Models (LLMs) has become a major concern. +Historically, LLM detectors have been touted as a solution, but their +effectiveness in the real world is still to be proven. In this paper, we focus +on an important setting in information operations -- short news-like posts +generated by moderately sophisticated attackers. + We demonstrate that existing LLM detectors, whether zero-shot or +purpose-trained, are not ready for real-world use in that setting. All tested +zero-shot detectors perform inconsistently with prior benchmarks and are highly +vulnerable to sampling temperature increase, a trivial attack absent from +recent benchmarks. A purpose-trained detector generalizing across LLMs and +unseen attacks can be developed, but it fails to generalize to new +human-written texts. + We argue that the former indicates domain-specific benchmarking is needed, +while the latter suggests a trade-off between the adversarial evasion +resilience and overfitting to the reference human text, with both needing +evaluation in benchmarks and currently absent. We believe this suggests a +re-consideration of current LLM detector benchmarking approaches and provides a +dynamically extensible benchmark to allow it +(https://github.com/Reliable-Information-Lab-HEVS/benchmark_llm_texts_detection). + +
+
+ comment: 20 pages, 7 tables, 13 figures, under consideration for EMNLP +
+
+
+
+
+ + ♻ ☆ A preliminary study on continual learning in computer vision using + Kolmogorov-Arnold Networks + + +
+ Deep learning has long been dominated by multi-layer perceptrons (MLPs), +which have demonstrated superiority over other optimizable models in various +domains. Recently, a new alternative to MLPs has emerged - Kolmogorov-Arnold +Networks (KAN)- which are based on a fundamentally different mathematical +framework. According to their authors, KANs address several major issues in +MLPs, such as catastrophic forgetting in continual learning scenarios. However, +this claim has only been supported by results from a regression task on a toy +1D dataset. In this paper, we extend the investigation by evaluating the +performance of KANs in continual learning tasks within computer vision, +specifically using the MNIST datasets. To this end, we conduct a structured +analysis of the behavior of MLPs and two KAN-based models in a +class-incremental learning scenario, ensuring that the architectures involved +have the same number of trainable parameters. Our results demonstrate that an +efficient version of KAN outperforms both traditional MLPs and the original KAN +implementation. We further analyze the influence of hyperparameters in MLPs and +KANs, as well as the impact of certain trainable parameters in KANs, such as +bias and scale weights. Additionally, we provide a preliminary investigation of +recent KAN-based convolutional networks and compare their performance with that +of traditional convolutional neural networks. Our codes can be found at +https://github.com/MrPio/KAN-Continual_Learning_tests. + +
+
+
+
+
+ + ♻ ☆ Towards Physically Consistent Deep Learning For Climate Model + Parameterizations ICML + + +
+ Climate models play a critical role in understanding and projecting climate +change. Due to their complexity, their horizontal resolution of about 40-100 km +remains too coarse to resolve processes such as clouds and convection, which +need to be approximated via parameterizations. These parameterizations are a +major source of systematic errors and large uncertainties in climate +projections. Deep learning (DL)-based parameterizations, trained on data from +computationally expensive short, high-resolution simulations, have shown great +promise for improving climate models in that regard. However, their lack of +interpretability and tendency to learn spurious non-physical correlations +result in reduced trust in the climate simulation. We propose an efficient +supervised learning framework for DL-based parameterizations that leads to +physically consistent models with improved interpretability and negligible +computational overhead compared to standard supervised training. First, key +features determining the target physical processes are uncovered. Subsequently, +the neural network is fine-tuned using only those relevant features. We show +empirically that our method robustly identifies a small subset of the inputs as +actual physical drivers, therefore removing spurious non-physical +relationships. This results in by design physically consistent and +interpretable neural networks while maintaining the predictive performance of +unconstrained black-box DL-based parameterizations. + +
+
+ comment: Accepted at ICMLA 2024 +
+
+
+
+
+ + ♻ ☆ Lens: A Foundation Model for Network Traffic + + +
+ Network traffic refers to the amount of data being sent and received over the +internet or any system that connects computers. Analyzing and understanding +network traffic is vital for improving network security and management. +However, the analysis of network traffic is challenging due to the diverse +nature of data packets, which often feature heterogeneous headers and encrypted +payloads lacking semantics. To capture the latent semantics of traffic, a few +studies have adopted pre-training techniques based on the Transformer encoder +or decoder to learn the representations from massive traffic data. However, +these methods typically excel in traffic understanding (classification) or +traffic generation tasks. To address this issue, we develop Lens, a foundation +model for network traffic that leverages the T5 architecture to learn the +pre-trained representations from large-scale unlabeled data. Harnessing the +strength of the encoder-decoder framework, which captures the global +information while preserving the generative ability, our model can better learn +the representations from raw data. To further enhance pre-training +effectiveness, we design a novel loss that combines three distinct tasks: +Masked Span Prediction (MSP), Packet Order Prediction (POP), and Homologous +Traffic Prediction (HTP). Evaluation results across various benchmark datasets +demonstrate that the proposed Lens outperforms the baselines in most downstream +tasks related to both traffic understanding and generation. Notably, it also +requires much less labeled data for fine-tuning compared to current methods. + +
+
+
+
+
+ + ♻ ☆ Cluster Exploration using Informative Manifold Projections ECAI + + +
+ Dimensionality reduction (DR) is one of the key tools for the visual +exploration of high-dimensional data and uncovering its cluster structure in +two- or three-dimensional spaces. The vast majority of DR methods in the +literature do not take into account any prior knowledge a practitioner may have +regarding the dataset under consideration. We propose a novel method to +generate informative embeddings which not only factor out the structure +associated with different kinds of prior knowledge but also aim to reveal any +remaining underlying structure. To achieve this, we employ a linear combination +of two objectives: firstly, contrastive PCA that discounts the structure +associated with the prior information, and secondly, kurtosis projection +pursuit which ensures meaningful data separation in the obtained embeddings. We +formulate this task as a manifold optimization problem and validate it +empirically across a variety of datasets considering three distinct types of +prior knowledge. Lastly, we provide an automated framework to perform iterative +visual exploration of high-dimensional data. + +
+
+ comment: This paper has been accepted in the 27th European Conference on + Artificial Intelligence (ECAI) 2024 +
+
+
+
+
+ + ♻ ☆ A Differentially Private Weighted Empirical Risk Minimization Procedure + and its Application to Outcome Weighted Learning + + +
+ It is common practice to use data containing personal information to build +predictive models in the framework of empirical risk minimization (ERM). While +these models can be highly accurate in prediction, sharing the results from +these models trained on sensitive data may be susceptible to privacy attacks. +Differential privacy (DP) is an appealing framework for addressing such data +privacy issues by providing mathematically provable bounds on the privacy loss +incurred when releasing information from sensitive data. Previous work has +primarily concentrated on applying DP to unweighted ERM. We consider weighted +ERM (wERM), an important generalization, where each individual's contribution +to the objective function can be assigned varying weights. We propose the first +differentially private algorithm for general wERM, with theoretical DP +guarantees. Extending the existing DP-ERM procedures to wERM creates a pathway +for deriving privacy-preserving learning methods for individualized treatment +rules, including the popular outcome weighted learning (OWL). We evaluate the +performance of the DP-wERM framework applied to OWL in both simulation studies +and in a real clinical trial. All empirical results demonstrate the feasibility +of training OWL models via wERM with DP guarantees while maintaining +sufficiently robust model performance, providing strong evidence for the +practicality of implementing the proposed privacy-preserving OWL procedure in +real-world scenarios involving sensitive data. + +
+
+ comment: 29 pages, 1 figure, and 1 table for the main manuscript; 10 pages, 4 + figures, and 1 table for the supplementary materials +
+
+
+
+
+ + ♻ ☆ RAMBO: Enhancing RAG-based Repository-Level Method Body Completion + + +
+ Code completion is essential in software development, helping developers by +predicting code snippets based on context. Among completion tasks, Method Body +Completion (MBC) is particularly challenging as it involves generating complete +method bodies based on their signatures and context. This task becomes +significantly harder in large repositories, where method bodies must integrate +repositoryspecific elements such as custom APIs, inter-module dependencies, and +project-specific conventions. In this paper, we introduce RAMBO, a novel +RAG-based approach for repository-level MBC. Instead of retrieving similar +method bodies, RAMBO identifies essential repository-specific elements, such as +classes, methods, and variables/fields, and their relevant usages. By +incorporating these elements and their relevant usages into the code generation +process, RAMBO ensures more accurate and contextually relevant method bodies. +Our experimental results with leading code LLMs across 40 Java projects show +that RAMBO significantly outperformed the state-of-the-art repository-level MBC +approaches, with the improvements of up to 46% in BLEU, 57% in CodeBLEU, 36% in +Compilation Rate, and up to 3X in Exact Match. Notably, RAMBO surpassed +RepoCoder Oracle method by up to 12% in Exact Match, setting a new benchmark +for repository-level MBC. + +
+
+
+
+
+ + ♻ ☆ Optical ISAC: Fundamental Performance Limits and Transceiver Design + + +
+ This paper characterizes the optimal Capacity-Distortion (C-D) tradeoff in an +optical point-to-point system with Single-Input Single-Output (SISO) for +communication and Single-Input Multiple-Output (SIMO) for sensing within an +Integrated Sensing and Communication (ISAC) framework. We consider the optimal +Rate-Distortion (R-D) region and explore several Inner (IB) and Outer Bounds +(OB). We introduce practical, asymptotically optimal Maximum A Posteriori (MAP) +and Maximum Likelihood Estimators (MLE) for target distance, addressing +nonlinear measurement-to-state relationships and non-conjugate priors. As the +number of sensing antennas increases, these estimators converge to the Bayesian +Cram\'er-Rao Bound (BCRB). We also establish that the achievable +Rate-Cram\'er-Rao Bound (R-CRB) serves as an OB for the optimal C-D region, +valid for both unbiased estimators and asymptotically large numbers of receive +antennas. To clarify that the input distribution determines the tradeoff across +the Pareto boundary of the C-D region, we propose two algorithms: i) an +iterative Blahut-Arimoto Algorithm (BAA)-type method, and ii) a +memory-efficient Closed-Form (CF) approach. The CF approach includes a CF +optimal distribution for high Optical Signal-to-Noise Ratio (O-SNR) conditions. +Additionally, we adapt and refine the Deterministic-Random Tradeoff (DRT) to +this optical ISAC context. + +
+
+ comment: This paper is 8 pages long and includes 1 algorithm, 3 figures, and 3 + tables. It has been accepted for presentation at the 2024 Global + Communications Conference. For further discussion, please visit AlphaXiv or + email the authors +
+
+
+
+
+ + ♻ ☆ On fundamental aspects of quantum extreme learning machines + + +
+ Quantum Extreme Learning Machines (QELMs) have emerged as a promising +framework for quantum machine learning. Their appeal lies in the rich feature +map induced by the dynamics of a quantum substrate - the quantum reservoir - +and the efficient post-measurement training via linear regression. Here we +study the expressivity of QELMs by decomposing the prediction of QELMs into a +Fourier series. We show that the achievable Fourier frequencies are determined +by the data encoding scheme, while Fourier coefficients depend on both the +reservoir and the measurement. Notably, the expressivity of QELMs is +fundamentally limited by the number of Fourier frequencies and the number of +observables, while the complexity of the prediction hinges on the reservoir. As +a cautionary note on scalability, we identify four sources that can lead to the +exponential concentration of the observables as the system size grows +(randomness, hardware noise, entanglement, and global measurements) and show +how this can turn QELMs into useless input-agnostic oracles. In particular, our +result on the reservoir-induced concentration strongly indicates that quantum +reservoirs drawn from a highly random ensemble make QELM models unscalable. Our +analysis elucidates the potential and fundamental limitations of QELMs, and +lays the groundwork for systematically exploring quantum reservoir systems for +other machine learning tasks. + +
+
+ comment: 20+21 pages, 9+2 figures +
+
+
+
+
+ + ♻ ☆ Physics-informed neural networks for parameter learning of wildfire + spreading + + +
+ Wildland fires pose a terrifying natural hazard, underscoring the urgent need +to develop data-driven and physics-informed digital twins for wildfire +prevention, monitoring, intervention, and response. In this direction of +research, this work introduces a physics-informed neural network (PiNN) +designed to learn the unknown parameters of an interpretable wildfire spreading +model. The considered modeling approach integrates fundamental physical laws +articulated by key model parameters essential for capturing the complex +behavior of wildfires. The proposed machine learning framework leverages the +theory of artificial neural networks with the physical constraints governing +wildfire dynamics, including the first principles of mass and energy +conservation. Training of the PiNN for physics-informed parameter +identification is realized using synthetic data on the spatiotemporal evolution +of one- and two-dimensional firefronts, derived from a high-fidelity simulator, +as well as empirical data (ground surface thermal images) from the Troy Fire +that occurred on June 19, 2002, in California. The parameter learning results +demonstrate the predictive ability of the proposed PiNN in uncovering the +unknown coefficients of the wildfire model in one- and two-dimensional fire +spreading scenarios as well as the Troy Fire. Additionally, this methodology +exhibits robustness by identifying the same parameters even in the presence of +noisy data. By integrating this PiNN approach into a comprehensive framework, +the envisioned physics-informed digital twin will enhance intelligent wildfire +management and risk assessment, providing a powerful tool for proactive and +reactive strategies. + +
+
+ comment: 32 pages, 14 figures, 2 Tables +
+
+
+
+
+ + ♻ ☆ The Role of Masking for Efficient Supervised Knowledge Distillation of + Vision Transformers ECCV 2024 + + +
+ Knowledge distillation is an effective method for training lightweight vision +models. However, acquiring teacher supervision for training samples is often +costly, especially from large-scale models like vision transformers (ViTs). In +this paper, we develop a simple framework to reduce the supervision cost of ViT +distillation: masking out a fraction of input tokens given to the teacher. By +masking input tokens, one can skip the computations associated with the masked +tokens without requiring any change to teacher parameters or architecture. We +find that masking patches with the lowest student attention scores is highly +effective, saving up to 50% of teacher FLOPs without any drop in student +accuracy, while other masking criterion leads to suboptimal efficiency gains. +Through in-depth analyses, we reveal that the student-guided masking provides a +good curriculum to the student, making teacher supervision easier to follow +during the early stage and challenging in the later stage. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ PIM-Opt: Demystifying Distributed Optimization Algorithms on a + Real-World Processing-In-Memory System + + +
+ Modern Machine Learning (ML) training on large-scale datasets is a very +time-consuming workload. It relies on the optimization algorithm Stochastic +Gradient Descent (SGD) due to its effectiveness, simplicity, and generalization +performance. Processor-centric architectures (e.g., CPUs, GPUs) commonly used +for modern ML training workloads based on SGD are bottlenecked by data movement +between the processor and memory units due to the poor data locality in +accessing large datasets. As a result, processor-centric architectures suffer +from low performance and high energy consumption while executing ML training +workloads. Processing-In-Memory (PIM) is a promising solution to alleviate the +data movement bottleneck by placing the computation mechanisms inside or near +memory. + Our goal is to understand the capabilities of popular distributed SGD +algorithms on real-world PIM systems to accelerate data-intensive ML training +workloads. To this end, we 1) implement several representative centralized +parallel SGD algorithms on the real-world UPMEM PIM system, 2) rigorously +evaluate these algorithms for ML training on large-scale datasets in terms of +performance, accuracy, and scalability, 3) compare to conventional CPU and GPU +baselines, and 4) discuss implications for future PIM hardware and highlight +the need for a shift to an algorithm-hardware codesign. + Our results demonstrate three major findings: 1) The UPMEM PIM system can be +a viable alternative to state-of-the-art CPUs and GPUs for many memory-bound ML +training workloads, especially when operations and datatypes are natively +supported by PIM hardware, 2) it is important to carefully choose the +optimization algorithms that best fit PIM, and 3) the UPMEM PIM system does not +scale approximately linearly with the number of nodes for many data-intensive +ML training workloads. We open source all our code to facilitate future +research. + +
+
+ comment: "PIM-Opt: Demystifying Distributed Optimization Algorithms on a + Real-World Processing-In-Memory System" in Proceedings of the 33rd + International Conference on Parallel Architectures and Compilation Techniques + (PACT), Long Beach, CA, USA, October 2024 +
+
+
+
+
+ + ♻ ☆ Deep Bayesian Future Fusion for Self-Supervised, High-Resolution, + Off-Road Mapping + + +
+ High-speed off-road navigation requires long-range, high-resolution maps to +enable robots to safely navigate over different surfaces while avoiding +dangerous obstacles. However, due to limited computational power and sensing +noise, most approaches to off-road mapping focus on producing coarse (20-40cm) +maps of the environment. In this paper, we propose Future Fusion, a framework +capable of generating dense, high-resolution maps from sparse sensing data (30m +forward at 2cm). This is accomplished by - (1) the efficient realization of the +well-known Bayes filtering within the standard deep learning models that +explicitly accounts for the sparsity pattern in stereo and LiDAR depth data, +and (2) leveraging perceptual losses common in generative image completion. The +proposed methodology outperforms the conventional baselines. Moreover, the +learned features and the completed dense maps lead to improvements in the +downstream navigation task. + +
+
+
+
+
+ + ♻ ☆ Lego: Learning to Disentangle and Invert Personalized Concepts Beyond + Object Appearance in Text-to-Image Diffusion Models + + +
+ Text-to-Image (T2I) models excel at synthesizing concepts such as nouns, +appearances, and styles. To enable customized content creation based on a few +example images of a concept, methods such as Textual Inversion and DreamBooth +invert the desired concept and enable synthesizing it in new scenes. However, +inverting personalized concepts that go beyond object appearance and style +(adjectives and verbs) through natural language remains a challenge. Two key +characteristics of these concepts contribute to the limitations of current +inversion methods. 1) Adjectives and verbs are entangled with nouns (subject) +and can hinder appearance-based inversion methods, where the subject appearance +leaks into the concept embedding, and 2) describing such concepts often extends +beyond single word embeddings. + In this study, we introduce Lego, a textual inversion method designed to +invert subject-entangled concepts from a few example images. Lego disentangles +concepts from their associated subjects using a simple yet effective Subject +Separation step and employs a Context Loss that guides the inversion of +single/multi-embedding concepts. In a thorough user study, Lego-generated +concepts were preferred over 70% of the time when compared to the baseline in +terms of authentically generating concepts according to a reference. +Additionally, visual question answering using an LLM suggested Lego-generated +concepts are better aligned with the text description of the concept. + +
+
+
+
+
+ + ♻ ☆ Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in + Lifted Compiled Code + + +
+ Detecting vulnerabilities within compiled binaries is challenging due to lost +high-level code structures and other factors such as architectural +dependencies, compilers, and optimization options. To address these obstacles, +this research explores vulnerability detection using natural language +processing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn +semantics from intermediate representation (LLVM IR) code. Long short-term +memory (LSTM) neural networks were trained on embeddings from encoders created +using approximately 48k LLVM functions from the Juliet dataset. This study is +pioneering in its comparison of word2vec models with multiple bidirectional +transformers (BERT, RoBERTa) embeddings built using LLVM code to train neural +networks to detect vulnerabilities in compiled binaries. Word2vec Skip-Gram +models achieved 92% validation accuracy in detecting vulnerabilities, +outperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This +suggests that complex contextual embeddings may not provide advantages over +simpler word2vec models for this task when a limited number (e.g. 48K) of data +samples are used to train the bidirectional transformer-based models. The +comparative results provide novel insights into selecting optimal embeddings +for learning compiler-independent semantic code representations to advance +machine learning detection of vulnerabilities in compiled binaries. + +
+
+ comment: Updated with improvements +
+
+
+
+
+ + ♻ ☆ The Impact of Unstated Norms in Bias Analysis of Language Models + + +
+ Bias in large language models (LLMs) has many forms, from overt +discrimination to implicit stereotypes. Counterfactual bias evaluation is a +widely used approach to quantifying bias and often relies on template-based +probes that explicitly state group membership. It measures whether the outcome +of a task, performed by an LLM, is invariant to a change of group membership. +In this work, we find that template-based probes can lead to unrealistic bias +measurements. For example, LLMs appear to mistakenly cast text associated with +White race as negative at higher rates than other groups. We hypothesize that +this arises artificially via a mismatch between commonly unstated norms, in the +form of markedness, in the pretraining text of LLMs (e.g., Black president vs. +president) and templates used for bias measurement (e.g., Black president vs. +White president). The findings highlight the potential misleading impact of +varying group membership through explicit mention in counterfactual bias +quantification. + +
+
+ comment: 23 Pages, 5 Figures, 3 Tables +
+
+
+
+
+ + ♻ ☆ Dual-Layer Training and Decoding of Large Language Model with + Simultaneously Thinking and Speaking + + +
+ Large Language Model can reasonably understand and generate human expressions +but may lack of thorough thinking and reasoning mechanisms. Recently there have +been several studies which enhance the thinking ability of language models but +most of them are not data-driven or training-based. In this paper, we are +motivated by the cognitive mechanism in the natural world, and design a novel +model architecture called TaS which allows it to first consider the thoughts +and then express the response based upon the query. We design several pipelines +to annotate or generate the thought contents from prompt-response samples, then +add language heads in a middle layer which behaves as the thinking layer. We +train the language model by the thoughts-augmented data and successfully let +the thinking layer automatically generate reasonable thoughts and finally +output more reasonable responses. Both qualitative examples and quantitative +results validate the effectiveness and performance of TaS. Our code is +available at https://anonymous.4open.science/r/TadE. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Feature-Based Interpretable Surrogates for Optimization + + +
+ For optimization models to be used in practice, it is crucial that users +trust the results. A key factor in this aspect is the interpretability of the +solution process. A previous framework for inherently interpretable +optimization models used decision trees to map instances to solutions of the +underlying optimization model. Based on this work, we investigate how we can +use more general optimization rules to further increase interpretability and, +at the same time, give more freedom to the decision-maker. The proposed rules +do not map to a concrete solution but to a set of solutions characterized by +common features. To find such optimization rules, we present an exact +methodology using mixed-integer programming formulations as well as heuristics. +We also outline the challenges and opportunities that these methods present. In +particular, we demonstrate the improvement in solution quality that our +approach offers compared to existing interpretable surrogates for optimization, +and we discuss the relationship between interpretability and performance. These +findings are supported by experiments using both synthetic and real-world data. + +
+
+
+
+
+ + ♻ ☆ Reward-Robust RLHF in LLMs + + +
+ As Large Language Models (LLMs) continue to progress toward more advanced +forms of intelligence, Reinforcement Learning from Human Feedback (RLHF) is +increasingly seen as a key pathway toward achieving Artificial General +Intelligence (AGI). However, the reliance on reward-model-based (RM-based) +alignment methods introduces significant challenges due to the inherent +instability and imperfections of Reward Models (RMs), which can lead to +critical issues such as reward hacking and misalignment with human intentions. +In this paper, we introduce a reward-robust RLHF framework aimed at addressing +these fundamental challenges, paving the way for more reliable and resilient +learning in LLMs. Our approach introduces a novel optimization objective that +carefully balances performance and robustness by incorporating Bayesian Reward +Model Ensembles (BRME) to model the uncertainty set of reward functions. This +allows the framework to integrate both nominal performance and minimum reward +signals, ensuring more stable learning even with imperfect RMs. Empirical +results demonstrate that our framework consistently outperforms baselines +across diverse benchmarks, showing improved accuracy and long-term stability. +We also provide a theoretical analysis, demonstrating that reward-robust RLHF +approaches the stability of constant reward settings, which proves to be +acceptable even in a stochastic-case analysis. Together, these contributions +highlight the framework potential to enhance both the performance and stability +of LLM alignment. + +
+
+
+
+
+ + ♻ ☆ Automating Data Annotation under Strategic Human Agents: Risks and + Potential Solutions + + +
+ As machine learning (ML) models are increasingly used in social domains to +make consequential decisions about humans, they often have the power to reshape +data distributions. Humans, as strategic agents, continuously adapt their +behaviors in response to the learning system. As populations change +dynamically, ML systems may need frequent updates to ensure high performance. +However, acquiring high-quality human-annotated samples can be highly +challenging and even infeasible in social domains. A common practice to address +this issue is using the model itself to annotate unlabeled data samples. This +paper investigates the long-term impacts when ML models are retrained with +model-annotated samples when they incorporate human strategic responses. We +first formalize the interactions between strategic agents and the model and +then analyze how they evolve under such dynamic interactions. We find that +agents are increasingly likely to receive positive decisions as the model gets +retrained, whereas the proportion of agents with positive labels may decrease +over time. We thus propose a refined retraining process to stabilize the +dynamics. Last, we examine how algorithmic fairness can be affected by these +retraining processes and find that enforcing common fairness constraints at +every round may not benefit the disadvantaged group in the long run. +Experiments on (semi-)synthetic and real data validate the theoretical +findings. + +
+
+
+
+
+ + ♻ ☆ Implicit Image-to-Image Schrodinger Bridge for Image Restoration + + +
+ Diffusion-based models are widely recognized for their effectiveness in image +restoration tasks; however, their iterative denoising process, which begins +from Gaussian noise, often results in slow inference speeds. The Image-to-Image +Schr\"odinger Bridge (I$^2$SB) presents a promising alternative by starting the +generative process from corrupted images and leveraging training techniques +from score-based diffusion models. In this paper, we introduce the Implicit +Image-to-Image Schr\"odinger Bridge (I$^3$SB) to further accelerate the +generative process of I$^2$SB. I$^3$SB reconfigures the generative process into +a non-Markovian framework by incorporating the initial corrupted image into +each step, while ensuring that the marginal distribution aligns with that of +I$^2$SB. This allows for the direct use of the pretrained network from I$^2$SB. +Extensive experiments on natural images, human face images, and medical images +validate the acceleration benefits of I$^3$SB. Compared to I$^2$SB, I$^3$SB +achieves the same perceptual quality with fewer generative steps, while +maintaining equal or improved fidelity to the ground truth. + +
+
+ comment: 23 pages, 8 figures, submitted to Pattern Recognition +
+
+
+
+
+ + ♻ ☆ HyperBERT: Mixing Hypergraph-Aware Layers with Language Models for Node + Classification on Text-Attributed Hypergraphs EMNLP 2024 + + +
+ Hypergraphs are characterized by complex topological structure, representing +higher-order interactions among multiple entities through hyperedges. Lately, +hypergraph-based deep learning methods to learn informative data +representations for the problem of node classification on text-attributed +hypergraphs have garnered increasing research attention. However, existing +methods struggle to simultaneously capture the full extent of hypergraph +structural information and the rich linguistic attributes inherent in the nodes +attributes, which largely hampers their effectiveness and generalizability. To +overcome these challenges, we explore ways to further augment a pretrained BERT +model with specialized hypergraph-aware layers for the task of node +classification. Such layers introduce higher-order structural inductive bias +into the language model, thus improving the model's capacity to harness both +higher-order context information from the hypergraph structure and semantic +information present in text. In this paper, we propose a new architecture, +HyperBERT, a mixed text-hypergraph model which simultaneously models hypergraph +relational structure while maintaining the high-quality text encoding +capabilities of a pre-trained BERT. Notably, HyperBERT presents results that +achieve a new state-of-the-art on five challenging text-attributed hypergraph +node classification benchmarks. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ CausalBench: A Comprehensive Benchmark for Causal Learning Capability of + LLMs + + +
+ The ability to understand causality significantly impacts the competence of +large language models (LLMs) in output explanation and counterfactual +reasoning, as causality reveals the underlying data distribution. However, the +lack of a comprehensive benchmark currently limits the evaluation of LLMs' +causal learning capabilities. To fill this gap, this paper develops CausalBench +based on data from the causal research community, enabling comparative +evaluations of LLMs against traditional causal learning algorithms. To provide +a comprehensive investigation, we offer three tasks of varying difficulties, +including correlation, causal skeleton, and causality identification. +Evaluations of 19 leading LLMs reveal that, while closed-source LLMs show +potential for simple causal relationships, they significantly lag behind +traditional algorithms on larger-scale networks ($>50$ nodes). Specifically, +LLMs struggle with collider structures but excel at chain structures, +especially at long-chain causality analogous to Chains-of-Thought techniques. +This supports the current prompt approaches while suggesting directions to +enhance LLMs' causal reasoning capability. Furthermore, CausalBench +incorporates background knowledge and training data into prompts to thoroughly +unlock LLMs' text-comprehension ability during evaluation, whose findings +indicate that, LLM understand causality through semantic associations with +distinct entities, rather than directly from contextual information or +numerical distributions. + +
+
+
+
+
+ + ♻ ☆ EMR-Merging: Tuning-Free High-Performance Model Merging NeurIPS 2024 + + +
+ The success of pretrain-finetune paradigm brings about the release of +numerous model weights. In this case, merging models finetuned on different +tasks to enable a single model with multi-task capabilities is gaining +increasing attention for its practicability. Existing model merging methods +usually suffer from (1) significant performance degradation or (2) requiring +tuning by additional data or training. In this paper, we rethink and analyze +the existing model merging paradigm. We discover that using a single model's +weights can hardly simulate all the models' performance. To tackle this issue, +we propose Elect, Mask & Rescale-Merging (EMR-Merging). We first (a) elect a +unified model from all the model weights and then (b) generate extremely +lightweight task-specific modulators, including masks and rescalers, to align +the direction and magnitude between the unified model and each specific model, +respectively. EMR-Merging is tuning-free, thus requiring no data availability +or any additional training while showing impressive performance. We find that +EMR-Merging shows outstanding performance compared to existing merging methods +under different classical and newly-established settings, including merging +different numbers of vision models (up to 30), NLP models, PEFT models, and +multi-modal models. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ An Enhanced-State Reinforcement Learning Algorithm for Multi-Task Fusion + in Large-Scale Recommender Systems + + +
+ As the last key stage of Recommender Systems (RSs), Multi-Task Fusion (MTF) +is in charge of combining multiple scores predicted by Multi-Task Learning +(MTL) into a final score to maximize user satisfaction, which decides the +ultimate recommendation results. In recent years, to maximize long-term user +satisfaction within a recommendation session, Reinforcement Learning (RL) is +widely used for MTF in large-scale RSs. However, limited by their modeling +pattern, all the current RL-MTF methods can only utilize user features as the +state to generate actions for each user, but unable to make use of item +features and other valuable features, which leads to suboptimal results. +Addressing this problem is a challenge that requires breaking through the +current modeling pattern of RL-MTF. To solve this problem, we propose a novel +method called Enhanced-State RL for MTF in RSs. Unlike the existing methods +mentioned above, our method first defines user features, item features, and +other valuable features collectively as the enhanced state; then proposes a +novel actor and critic learning process to utilize the enhanced state to make +much better action for each user-item pair. To the best of our knowledge, this +novel modeling pattern is being proposed for the first time in the field of +RL-MTF. We conduct extensive offline and online experiments in a large-scale +RS. The results demonstrate that our model outperforms other models +significantly. Enhanced-State RL has been fully deployed in our RS more than +half a year, improving +3.84% user valid consumption and +0.58% user duration +time compared to baseline. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.17589 +
+
+
+
+
+ + ♻ ☆ An Off-Policy Reinforcement Learning Algorithm Customized for Multi-Task + Fusion in Large-Scale Recommender Systems + + +
+ As the last critical stage of RSs, Multi-Task Fusion (MTF) is responsible for +combining multiple scores outputted by Multi-Task Learning (MTL) into a final +score to maximize user satisfaction, which determines the ultimate +recommendation results. Recently, to optimize long-term user satisfaction +within a recommendation session, Reinforcement Learning (RL) is used for MTF in +the industry. However, the off-policy RL algorithms used for MTF so far have +the following severe problems: 1) to avoid out-of-distribution (OOD) problem, +their constraints are overly strict, which seriously damage their performance; +2) they are unaware of the exploration policy used for producing training data +and never interact with real environment, so only suboptimal policy can be +learned; 3) the traditional exploration policies are inefficient and hurt user +experience. To solve the above problems, we propose a novel method named +IntegratedRL-MTF customized for MTF in large-scale RSs. IntegratedRL-MTF +integrates off-policy RL model with our online exploration policy to relax +overstrict and complicated constraints, which significantly improves its +performance. We also design an extremely efficient exploration policy, which +eliminates low-value exploration space and focuses on exploring potential +high-value state-action pairs. Moreover, we adopt progressive training mode to +further enhance our model's performance with the help of our exploration +policy. We conduct extensive offline and online experiments in the short video +channel of Tencent News. The results demonstrate that our model outperforms +other models remarkably. IntegratedRL-MTF has been fully deployed in our RS and +other large-scale RSs in Tencent, which have achieved significant improvements. + +
+
+
+
+
+ + ♻ ☆ 2D or not 2D: How Does the Dimensionality of Gesture Representation + Affect 3D Co-Speech Gesture Generation? + + +
+ Co-speech gestures are fundamental for communication. The advent of recent +deep learning techniques has facilitated the creation of lifelike, synchronous +co-speech gestures for Embodied Conversational Agents. "In-the-wild" datasets, +aggregating video content from platforms like YouTube via human pose detection +technologies, provide a feasible solution by offering 2D skeletal sequences +aligned with speech. Concurrent developments in lifting models enable the +conversion of these 2D sequences into 3D gesture databases. However, it is +important to note that the 3D poses estimated from the 2D extracted poses are, +in essence, approximations of the ground-truth, which remains in the 2D domain. +This distinction raises questions about the impact of gesture representation +dimensionality on the quality of generated motions - a topic that, to our +knowledge, remains largely unexplored. Our study examines the effect of using +either 2D or 3D joint coordinates as training data on the performance of +speech-to-gesture deep generative models. We employ a lifting model for +converting generated 2D pose sequences into 3D and assess how gestures created +directly in 3D stack up against those initially generated in 2D and then +converted to 3D. We perform an objective evaluation using widely used metrics +in the gesture generation field as well as a user study to qualitatively +evaluate the different approaches. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.15111 +
+
+
+
+
+ + ♻ ☆ Full error analysis of the random deep splitting method for nonlinear + parabolic PDEs and PIDEs + + +
+ In this paper, we present a randomized extension of the deep splitting +algorithm introduced in [Beck, Becker, Cheridito, Jentzen, and Neufeld (2021)] +using random neural networks suitable to approximately solve both +high-dimensional nonlinear parabolic PDEs and PIDEs with jumps having +(possibly) infinite activity. We provide a full error analysis of our so-called +random deep splitting method. In particular, we prove that our random deep +splitting method converges to the (unique viscosity) solution of the nonlinear +PDE or PIDE under consideration. Moreover, we empirically analyze our random +deep splitting method by considering several numerical examples including both +nonlinear PDEs and nonlinear PIDEs relevant in the context of pricing of +financial derivatives under default risk. In particular, we empirically +demonstrate in all examples that our random deep splitting method can +approximately solve nonlinear PDEs and PIDEs in 10'000 dimensions within +seconds. + +
+
+
+
+
+ + ♻ ☆ Sparse Low-Ranked Self-Attention Transformer for Remaining Useful + Lifetime Prediction of Optical Fiber Amplifiers + + +
+ Optical fiber amplifiers are key elements in present optical networks. +Failures of these components result in high financial loss of income of the +network operator as the communication traffic over an affected link is +interrupted. Applying Remaining useful lifetime (RUL) prediction in the context +of Predictive Maintenance (PdM) to optical fiber amplifiers to predict upcoming +system failures at an early stage, so that network outages can be minimized +through planning of targeted maintenance actions, ensures reliability and +safety. Optical fiber amplifier are complex systems, that work under various +operating conditions, which makes correct forecasting a difficult task. +Increased monitoring capabilities of systems results in datasets that +facilitate the application of data-driven RUL prediction methods. Deep learning +models in particular have shown good performance, but generalization based on +comparatively small datasets for RUL prediction is difficult. In this paper, we +propose Sparse Low-ranked self-Attention Transformer (SLAT) as a novel RUL +prediction method. SLAT is based on an encoder-decoder architecture, wherein +two parallel working encoders extract features for sensors and time steps. By +utilizing the self-attention mechanism, long-term dependencies can be learned +from long sequences. The implementation of sparsity in the attention matrix and +a low-rank parametrization reduce overfitting and increase generalization. +Experimental application to optical fiber amplifiers exemplified on EDFA, as +well as a reference dataset from turbofan engines, shows that SLAT outperforms +the state-of-the-art methods. + +
+
+ comment: 9 pages, 7 figures, submitted to IEEE Transactions on Machine + Learning in Communications and Networking (TMLCN) +
+
+
+
+
+ + ♻ ☆ Generalisation to unseen topologies: Towards control of biological + neural network activity + + +
+ Novel imaging and neurostimulation techniques open doors for advancements in +closed-loop control of activity in biological neural networks. This would allow +for applications in the investigation of activity propagation, and for +diagnosis and treatment of pathological behaviour. Due to the partially +observable characteristics of activity propagation, through networks in which +edges can not be observed, and the dynamic nature of neuronal systems, there is +a need for adaptive, generalisable control. In this paper, we introduce an +environment that procedurally generates neuronal networks with different +topologies to investigate this generalisation problem. Additionally, an +existing transformer-based architecture is adjusted to evaluate the +generalisation performance of a deep RL agent in the presented partially +observable environment. The agent demonstrates the capability to generalise +control from a limited number of training networks to unseen test networks. + +
+
+
+
+
+ + ♻ ☆ Two Trades is not Baffled: Condensing Graph via Crafting Rational + Gradient Matching + + +
+ Training on large-scale graphs has achieved remarkable results in graph +representation learning, but its cost and storage have raised growing concerns. +As one of the most promising directions, graph condensation methods address +these issues by employing gradient matching, aiming to condense the full graph +into a more concise yet information-rich synthetic set. Though encouraging, +these strategies primarily emphasize matching directions of the gradients, +which leads to deviations in the training trajectories. Such deviations are +further magnified by the differences between the condensation and evaluation +phases, culminating in accumulated errors, which detrimentally affect the +performance of the condensed graphs. In light of this, we propose a novel graph +condensation method named \textbf{C}raf\textbf{T}ing \textbf{R}ationa\textbf{L} +trajectory (\textbf{CTRL}), which offers an optimized starting point closer to +the original dataset's feature distribution and a more refined strategy for +gradient matching. Theoretically, CTRL can effectively neutralize the impact of +accumulated errors on the performance of condensed graphs. We provide extensive +experiments on various graph datasets and downstream tasks to support the +effectiveness of CTRL. Code is released at +https://github.com/NUS-HPC-AI-Lab/CTRL. + +
+
+ comment: An effective method for graph condensation +
+
+
+
+
+ + ♻ ☆ Can-SAVE: Mass Cancer Risk Prediction via Survival Analysis Variables + and EHR + + +
+ Specific medical cancer screening methods are often costly, time-consuming, +and weakly applicable on a large scale. Advanced Artificial Intelligence (AI) +methods greatly help cancer detection but require specific or deep medical +data. These aspects prevent the mass implementation of cancer screening +methods. For this reason, it is a disruptive change for healthcare to apply AI +methods for mass personalized assessment of the cancer risk among patients +based on the existing Electronic Health Records (EHR) volume. This paper +presents a novel Can-SAVE cancer risk assessment method combining a survival +analysis approach with a gradient-boosting algorithm. It is highly accessible +and resource-efficient, utilizing only a sequence of high-level medical events. +We tested the proposed method in a long-term retrospective experiment covering +more than 1.1 million people and four regions of Russia. The Can-SAVE method +significantly exceeds the baselines by the Average Precision metric of +22.8%$\pm$2.7% vs 15.1%$\pm$2.6%. The extensive ablation study also confirmed +the proposed method's dominant performance. The experiment supervised by +oncologists shows a reliable cancer patient detection rate of up to 84 out of +1000 selected. Such results surpass the medical screening strategies estimates; +the typical age-specific Number Needed to Screen is only 9 out of 1000 (for +colorectal cancer). Overall, our experiments show a 4.7-6.4 times improvement +in cancer detection rate (TOP@1k) compared to the traditional healthcare risk +estimation approach. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ SharkTrack: an accurate, generalisable software for streamlining shark + and ray underwater video analysis + + +
+ Elasmobranchs (shark sand rays) represent a critical component of marine +ecosystems. Yet, they are experiencing global population declines and effective +monitoring of populations is essential to their protection. Underwater +stationary videos, such as those from Baited Remote Underwater Video Stations +(BRUVS), are critical for understanding elasmobranch spatial ecology and +abundance. However, processing these videos requires time-consuming manual +analysis that can delay conservation. To address this challenge, we developed +SharkTrack, a semi-automatic underwater video analysis software. SharkTrack +uses Convolutional Neural Networks (CNN) and Multi-Object Tracking to +automatically detect and track elasmobranchs and provides an annotation +pipeline to manually classify elasmobranch species and compute species-specific +MaxN (ssMaxN), the standard metric of relative abundance. When tested on BRUVS +footage from locations unseen by the CNN model during training, SharkTrack +computed ssMaxN with 89% accuracy over 207 hours of footage. The semi-automatic +SharkTrack pipeline required two minutes of manual classification per hour of +video, an estimated 95% reduction of manual analysis time compared to +traditional methods. Furthermore, we demonstrate SharkTrack accuracy across +diverse marine ecosystems and elasmobranch species, an advancement compared to +previous models, which were limited to specific species or locations. +SharkTrack applications extend beyond BRUVS, facilitating the analysis of any +underwater stationary video. By making video analysis faster and more +accessible, SharkTrack enables research and conservation organisations to +monitor elasmobranch populations more efficiently, thereby improving +conservation efforts. To further support these goals, we provide public access +to the SharkTrack software. + +
+
+
+
+
+ + ♻ ☆ Make Large Language Model a Better Ranker + + +
+ Large Language Models (LLMs) demonstrate robust capabilities across various +fields, leading to a paradigm shift in LLM-enhanced Recommender System (RS). +Research to date focuses on point-wise and pair-wise recommendation paradigms, +which are inefficient for LLM-based recommenders due to high computational +costs. However, existing list-wise approaches also fall short in ranking tasks +due to misalignment between ranking objectives and next-token prediction. +Moreover, these LLM-based methods struggle to effectively address the order +relation among candidates, particularly given the scale of ratings. To address +these challenges, this paper introduces the large language model framework with +Aligned Listwise Ranking Objectives (ALRO). ALRO is designed to bridge the gap +between the capabilities of LLMs and the nuanced requirements of ranking tasks. +Specifically, ALRO employs explicit feedback in a listwise manner by +introducing soft lambda loss, a customized adaptation of lambda loss designed +for optimizing order relations. This mechanism provides more accurate +optimization goals, enhancing the ranking process. Additionally, ALRO +incorporates a permutation-sensitive learning mechanism that addresses position +bias, a prevalent issue in generative models, without imposing additional +computational burdens during inference. Our evaluative studies reveal that ALRO +outperforms both existing embedding-based recommendation methods and LLM-based +recommendation baselines. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object + Detector ECCV2024 + + +
+ This paper studies the challenging cross-domain few-shot object detection +(CD-FSOD), aiming to develop an accurate object detector for novel domains with +minimal labeled examples. While transformer-based open-set detectors, such as +DE-ViT, show promise in traditional few-shot object detection, their +generalization to CD-FSOD remains unclear: 1) can such open-set detection +methods easily generalize to CD-FSOD? 2) If not, how can models be enhanced +when facing huge domain gaps? To answer the first question, we employ measures +including style, inter-class variance (ICV), and indefinable boundaries (IB) to +understand the domain gap. Based on these measures, we establish a new +benchmark named CD-FSOD to evaluate object detection methods, revealing that +most of the current approaches fail to generalize across domains. Technically, +we observe that the performance decline is associated with our proposed +measures: style, ICV, and IB. Consequently, we propose several novel modules to +address these issues. First, the learnable instance features align initial +fixed instances with target categories, enhancing feature distinctiveness. +Second, the instance reweighting module assigns higher importance to +high-quality instances with slight IB. Third, the domain prompter encourages +features resilient to different styles by synthesizing imaginary domains +without altering semantic contents. These techniques collectively contribute to +the development of the Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO), +significantly improving upon the base DE-ViT. Experimental results validate the +efficacy of our model. + +
+
+ comment: Accepted by ECCV2024 (project website: + http://yuqianfu.com/CDFSOD-benchmark) +
+
+
+
+
+ + ♻ ☆ CauSkelNet: Causal Representation Learning for Human Behaviour Analysis + + +
+ Constrained by the lack of model interpretability and a deep understanding of +human movement in traditional movement recognition machine learning methods, +this study introduces a novel representation learning method based on causal +inference to better understand human joint dynamics and complex behaviors. We +propose a two-stage framework that combines the Peter-Clark (PC) algorithm and +Kullback-Leibler (KL) divergence to identify and quantify causal relationships +between joints. Our method effectively captures interactions and produces +interpretable, robust representations. Experiments on the EmoPain dataset show +that our causal GCN outperforms traditional GCNs in accuracy, F1 score, and +recall, especially in detecting protective behaviors. The model is also highly +invariant to data scale changes, enhancing its reliability in practical +applications. Our approach advances human motion analysis and paves the way for +more adaptive intelligent healthcare solutions. + +
+
+
+
+
+ + ♻ ☆ DICTDIS: Dictionary Constrained Disambiguation for Improved NMT EMNLP + + +
+ Domain-specific neural machine translation (NMT) systems (e.g., in +educational applications) are socially significant with the potential to help +make information accessible to a diverse set of users in multilingual +societies. It is desirable that such NMT systems be lexically constrained and +draw from domain-specific dictionaries. Dictionaries could present multiple +candidate translations for a source word/phrase due to the polysemous nature of +words. The onus is then on the NMT model to choose the contextually most +appropriate candidate. Prior work has largely ignored this problem and focused +on the single candidate constraint setting wherein the target word or phrase is +replaced by a single constraint. In this work we present DictDis, a lexically +constrained NMT system that disambiguates between multiple candidate +translations derived from dictionaries. We achieve this by augmenting training +data with multiple dictionary candidates to actively encourage disambiguation +during training by implicitly aligning multiple candidate constraints. We +demonstrate the utility of DictDis via extensive experiments on English-Hindi +and English-German sentences in a variety of domains including regulatory, +finance, engineering. We also present comparisons on standard benchmark test +datasets. In comparison with existing approaches for lexically constrained and +unconstrained NMT, we demonstrate superior performance with respect to +constraint copy and disambiguation related measures on all domains while also +obtaining improved fluency of up to 2-3 BLEU points on some domains. + +
+
+ comment: In Findings of EMNLP, 2024 +
+
+
+
+
+ + ♻ ☆ In-context Contrastive Learning for Event Causality Identification + + +
+ Event Causality Identification (ECI) aims at determining the existence of a +causal relation between two events. Although recent prompt learning-based +approaches have shown promising improvements on the ECI task, their performance +are often subject to the delicate design of multiple prompts and the positive +correlations between the main task and derivate tasks. The in-context learning +paradigm provides explicit guidance for label prediction in the prompt learning +paradigm, alleviating its reliance on complex prompts and derivative tasks. +However, it does not distinguish between positive and negative demonstrations +for analogy learning. Motivated from such considerations, this paper proposes +an In-Context Contrastive Learning (ICCL) model that utilizes contrastive +learning to enhance the effectiveness of both positive and negative +demonstrations. Additionally, we apply contrastive learning to event pairs to +better facilitate event causality identification. Our ICCL is evaluated on the +widely used corpora, including the EventStoryLine and Causal-TimeBank, and +results show significant performance improvements over the state-of-the-art +algorithms. + +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning for Autonomous Cyber Defence: A Survey + + +
+ The rapid increase in the number of cyber-attacks in recent years raises the +need for principled methods for defending networks against malicious actors. +Deep reinforcement learning (DRL) has emerged as a promising approach for +mitigating these attacks. However, while DRL has shown much potential for cyber +defence, numerous challenges must be overcome before DRL can be applied to the +autonomous cyber defence (ACD) problem at scale. Principled methods are +required for environments that confront learners with very high-dimensional +state spaces, large multi-discrete action spaces, and adversarial learning. +Recent works have reported success in solving these problems individually. +There have also been impressive engineering efforts towards solving all three +for real-time strategy games. However, applying DRL to the full ACD problem +remains an open challenge. Here, we survey the relevant DRL literature and +conceptualize an idealised ACD-DRL agent. We provide: i.) A summary of the +domain properties that define the ACD problem; ii.) A comprehensive comparison +of current ACD environments used for benchmarking DRL approaches; iii.) An +overview of state-of-the-art approaches for scaling DRL to domains that +confront learners with the curse of dimensionality, and; iv.) A survey and +critique of current methods for limiting the exploitability of agents within +adversarial settings from the perspective of ACD. We conclude with open +research questions that we hope will motivate future directions for researchers +and practitioners working on ACD. + +
+
+ comment: 89 pages, 14 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Learning to Boost the Performance of Stable Nonlinear Systems + + +
+ The growing scale and complexity of safety-critical control systems +underscore the need to evolve current control architectures aiming for the +unparalleled performances achievable through state-of-the-art optimization and +machine learning algorithms. However, maintaining closed-loop stability while +boosting the performance of nonlinear control systems using data-driven and +deep-learning approaches stands as an important unsolved challenge. In this +paper, we tackle the performance-boosting problem with closed-loop stability +guarantees. Specifically, we establish a synergy between the Internal Model +Control (IMC) principle for nonlinear systems and state-of-the-art +unconstrained optimization approaches for learning stable dynamics. Our methods +enable learning over arbitrarily deep neural network classes of +performance-boosting controllers for stable nonlinear systems; crucially, we +guarantee L_p closed-loop stability even if optimization is halted prematurely, +and even when the ground-truth dynamics are unknown, with vanishing +conservatism in the class of stabilizing policies as the model uncertainty is +reduced to zero. We discuss the implementation details of the proposed control +schemes, including distributed ones, along with the corresponding optimization +procedures, demonstrating the potential of freely shaping the cost functions +through several numerical experiments. + +
+
+
+
+
+ + ♻ ☆ Reservoir Static Property Estimation Using Nearest-Neighbor Neural + Network + + +
+ This note presents an approach for estimating the spatial distribution of +static properties in reservoir modeling using a nearest-neighbor neural +network. The method leverages the strengths of neural networks in approximating +complex, non-linear functions, particularly for tasks involving spatial +interpolation. It incorporates a nearest-neighbor algorithm to capture local +spatial relationships between data points and introduces randomization to +quantify the uncertainty inherent in the interpolation process. This approach +addresses the limitations of traditional geostatistical methods, such as +Inverse Distance Weighting (IDW) and Kriging, which often fail to model the +complex non-linear dependencies in reservoir data. By integrating spatial +proximity and uncertainty quantification, the proposed method can improve the +accuracy of static property predictions like porosity and permeability. + +
+
+ comment: 6 pages, 3 figures; updated to tex source +
+
+
+
+
+ + ♻ ☆ SustainDC -- Benchmarking for Sustainable Data Center Control NeurIPS 2024 + + +
+ Machine learning has driven an exponential increase in computational demand, +leading to massive data centers that consume significant amounts of energy and +contribute to climate change. This makes sustainable data center control a +priority. In this paper, we introduce SustainDC, a set of Python environments +for benchmarking multi-agent reinforcement learning (MARL) algorithms for data +centers (DC). SustainDC supports custom DC configurations and tasks such as +workload scheduling, cooling optimization, and auxiliary battery management, +with multiple agents managing these operations while accounting for the effects +of each other. We evaluate various MARL algorithms on SustainDC, showing their +performance across diverse DC designs, locations, weather conditions, grid +carbon intensity, and workload requirements. Our results highlight significant +opportunities for improvement of data center operations using MARL algorithms. +Given the increasing use of DC due to AI, SustainDC provides a crucial platform +for the development and benchmarking of advanced algorithms essential for +achieving sustainable computing and addressing other heterogeneous real-world +challenges. + +
+
+ comment: Under review at Advances in Neural Information Processing Systems + 2024 (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Robust Conformal Prediction Using Privileged Information + + +
+ We develop a method to generate prediction sets with a guaranteed coverage +rate that is robust to corruptions in the training data, such as missing or +noisy variables. Our approach builds on conformal prediction, a powerful +framework to construct prediction sets that are valid under the i.i.d +assumption. Importantly, naively applying conformal prediction does not provide +reliable predictions in this setting, due to the distribution shift induced by +the corruptions. To account for the distribution shift, we assume access to +privileged information (PI). The PI is formulated as additional features that +explain the distribution shift, however, they are only available during +training and absent at test time. We approach this problem by introducing a +novel generalization of weighted conformal prediction and support our method +with theoretical coverage guarantees. Empirical experiments on both real and +synthetic datasets indicate that our approach achieves a valid coverage rate +and constructs more informative predictions compared to existing methods, which +are not supported by theoretical guarantees. + +
+
+
+
+
+ + ♻ ☆ Lemon and Orange Disease Classification using CNN-Extracted Features and + Machine Learning Classifier + + +
+ Lemons and oranges, both are the most economically significant citrus fruits +globally. The production of lemons and oranges is severely affected due to +diseases in its growth stages. Fruit quality has degraded due to the presence +of flaws. Thus, it is necessary to diagnose the disease accurately so that we +can avoid major loss of lemons and oranges. To improve citrus farming, we +proposed a disease classification approach for lemons and oranges. This +approach would enable early disease detection and intervention, reduce yield +losses, and optimize resource allocation. For the initial modeling of disease +classification, the research uses innovative deep learning architectures such +as VGG16, VGG19 and ResNet50. In addition, for achieving better accuracy, the +basic machine learning algorithms used for classification problems include +Random Forest, Naive Bayes, K-Nearest Neighbors (KNN) and Logistic Regression. +The lemon and orange fruits diseases are classified more accurately (95.0% for +lemon and 99.69% for orange) by the model. The model's base features were +extracted from the ResNet50 pre-trained model and the diseases are classified +by the Logistic Regression which beats the performance given by VGG16 and VGG19 +for other classifiers. Experimental outcomes show that the proposed model also +outperforms existing models in which most of them classified the diseases using +the Softmax classifier without using any individual classifiers. + +
+
+
+
+
+ + ♻ ☆ Overcoming Growth-Induced Forgetting in Task-Agnostic Continual Learning + + +
+ In continual learning (CL), model growth enhances adaptability over new data, +improving knowledge retention for more tasks. However, improper model growth +can lead to severe degradation of previously learned knowledge, an issue we +name as growth-induced forgetting (GIFt), especially in task-agnostic CL using +entire grown model for inference. Existing works, despite adopting model growth +and random initialization for better adaptability, often fail to recognize the +presence of GIFt caused by improper model growth. This oversight limits +comprehensive control of forgetting and hinders full utilization of model +growth. We are the first in CL to identify this issue and conduct an in-depth +study on root cause of GIFt, where layer expansion stands out among model +growth strategies, widening layers without affecting model functionality. Yet, +direct adoption of layer expansion presents challenges. It lacks data-driven +control and initialization of expanded parameters to balance adaptability and +knowledge retention. This paper presents a novel SparseGrow approach to +overcome the issue of GIFt while enhancing adaptability over new data. +SparseGrow employs data-driven sparse layer expansion to control efficient +parameter usage during growth, reducing GIFt from excessive growth and +functionality changes. It also combines sparse growth with on-data +initialization at training late-stage to create partially 0-valued expansions +that fit learned distribution, enhancing retention and adaptability. To further +minimize forgetting, freezing is applied by calculating the sparse mask, +allowing data-driven preservation of important parameters. Through experiments +across datasets with various settings, cases, and task numbers, we demonstrate +the necessity of layer expansion and showcase the effectiveness of SparseGrow +in overcoming GIFt, highlighting its adaptability and knowledge retention for +incremental tasks. + +
+
+
+
+
+ + ♻ ☆ FedRepOpt: Gradient Re-parametrized Optimizers in Federated Learning + + +
+ Federated Learning (FL) has emerged as a privacy-preserving method for +training machine learning models in a distributed manner on edge devices. +However, on-device models face inherent computational power and memory +limitations, potentially resulting in constrained gradient updates. As the +model's size increases, the frequency of gradient updates on edge devices +decreases, ultimately leading to suboptimal training outcomes during any +particular FL round. This limits the feasibility of deploying advanced and +large-scale models on edge devices, hindering the potential for performance +enhancements. To address this issue, we propose FedRepOpt, a gradient +re-parameterized optimizer for FL. The gradient re-parameterized method allows +training a simple local model with a similar performance as a complex model by +modifying the optimizer's gradients according to a set of model-specific +hyperparameters obtained from the complex models. In this work, we focus on +VGG-style and Ghost-style models in the FL environment. Extensive experiments +demonstrate that models using FedRepOpt obtain a significant boost in +performance of 16.7% and 11.4% compared to the RepGhost-style and RepVGG-style +networks, while also demonstrating a faster convergence time of 11.7% and 57.4% +compared to their complex structure. + +
+
+
+
+
+ + ♻ ☆ PromptKD: Distilling Student-Friendly Knowledge for Generative Language + Models via Prompt Tuning EMNLP 2024 + + +
+ Recent advancements in large language models (LLMs) have raised concerns +about inference costs, increasing the need for research into model compression. +While knowledge distillation (KD) is a prominent method for this, research on +KD for generative language models like LLMs is relatively sparse, and the +approach of distilling student-friendly knowledge, which has shown promising +performance in KD for classification models, remains unexplored in generative +language models. To explore this approach, we propose PromptKD, a simple yet +effective method that utilizes prompt tuning - for the first time in KD - to +enable generative language models to transfer student-friendly knowledge. +Unlike previous works in classification that require fine-tuning the entire +teacher model for extracting student-friendly knowledge, PromptKD achieves +similar effects by adding a small number of prompt tokens and tuning only the +prompt with student guidance. Extensive experiments on instruction-following +datasets show that PromptKD achieves state-of-the-art performance while adding +only 0.0007% of the teacher's parameters as prompts. Further analysis suggests +that distilling student-friendly knowledge alleviates exposure bias effectively +throughout the entire training process, leading to performance enhancements. + +
+
+ comment: EMNLP 2024 Findings. Our project page: https://promptkd.github.io +
+
+
+
+
+ + ♻ ☆ Large-Scale Multi-omic Biosequence Transformers for Modeling + Peptide-Nucleotide Interactions + + +
+ The transformer architecture has revolutionized bioinformatics and driven +progress in the understanding and prediction of the properties of biomolecules. +Almost all research on large-scale biosequence transformers has focused on one +domain at a time (single-omic), usually nucleotides or peptides. These models +have seen incredible success in downstream tasks in each domain and have +achieved particularly noteworthy breakthroughs in sequences of peptides and +structural modeling. However, these single-omic models are naturally incapable +of modeling multi-omic tasks, one of the most biologically critical being +nucleotide-peptide interactions. + We present our work training the first multi-omic nucleotide-peptide +foundation models. We show that these multi-omic models (MOMs) can learn joint +representations between various single-omic distributions that are emergently +consistent with the Central Dogma of molecular biology, despite only being +trained on unlabeled biosequences. We further demonstrate that MOMs can be +fine-tuned to achieve state-of-the-art results on peptide-nucleotide +interaction tasks, namely predicting the change in Gibbs free energy +({\Delta}G) of the binding interaction between a given oligonucleotide and +peptide, as well as the effect on this binding interaction due to mutations in +the oligonucleotide sequence ({\Delta}{\Delta}G). + Remarkably, we show that multi-omic biosequence transformers emergently learn +useful structural information without any prior structural training, allowing +us to predict which peptide residues are most involved in the +peptide-nucleotide binding interaction. Lastly, we provide evidence that +multi-omic biosequence models are non-inferior to foundation models trained on +single-omics distributions, suggesting a more generalized or foundational +approach to building these models. + +
+
+ comment: 27 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Sensor Response-Time Reduction using Long-Short Term Memory Network + Forecasting + + +
+ The response time of a biosensor is a crucial metric in safety-critical +applications such as medical diagnostics where an earlier diagnosis can +markedly improve patient outcomes. However, the speed at which a biosensor +reaches a final equilibrium state can be limited by poor mass transport and +long molecular diffusion times that increase the time it takes target molecules +to reach the active sensing region of a biosensor. While optimization of system +and sensor design can promote molecules reaching the sensing element faster, a +simpler and complementary approach for response time reduction that is widely +applicable across all sensor platforms is to use time-series forecasting to +predict the ultimate steady-state sensor response. In this work, we show that +ensembles of long short-term memory (LSTM) networks can accurately predict +equilibrium biosensor response from a small quantity of initial time-dependent +biosensor measurements, allowing for significant reduction in response time by +a mean and median factor of improvement of 18.6 and 5.1 respectively. The +ensemble of models simultaneously estimates uncertainty, which is vital for +ensuring confidence in the predictions and subsequent safety-related decisions +that are made. This approach is demonstrated on real-time experimental data +collected by exposing porous silicon biosensors to buffered protein solutions +using a multi-channel fluidic cell that enables the automated measurement of +100 porous silicon biosensors in parallel. The dramatic improvement in sensor +response time achieved using LSTM network ensembles and associated uncertainty +quantification opens the door to trustworthy and faster responding biosensors, +enabling more rapid medical diagnostics for faster clinical decision making +that can lead to improved patient outcomes and healthcare access, as well as +quicker identification of toxins in food and the environment. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Learning Transferable Time Series Classifier with Cross-Domain + Pre-training from Language Model + + +
+ Advancements in self-supervised pre-training (SSL) have significantly +advanced the field of learning transferable time series representations, which +can be very useful in enhancing the downstream task. Despite being effective, +most existing works struggle to achieve cross-domain SSL pre-training, missing +valuable opportunities to integrate patterns and features from different +domains. The main challenge lies in the significant differences in the +characteristics of time-series data across different domains, such as +variations in the number of channels and temporal resolution scales. To address +this challenge, we propose CrossTimeNet, a novel cross-domain SSL learning +framework to learn transferable knowledge from various domains to largely +benefit the target downstream task. One of the key characteristics of +CrossTimeNet is the newly designed time series tokenization module, which could +effectively convert the raw time series into a sequence of discrete tokens +based on a reconstruction optimization process. Besides, we highlight that +predicting a high proportion of corrupted tokens can be very helpful for +extracting informative patterns across different domains during SSL +pre-training, which has been largely overlooked in past years. Furthermore, +unlike previous works, our work treats the pre-training language model (PLM) as +the initialization of the encoder network, investigating the feasibility of +transferring the knowledge learned by the PLM to the time series area. Through +these efforts, the path to cross-domain pre-training of a generic time series +model can be effectively paved. We conduct extensive experiments in a +real-world scenario across various time series classification domains. The +experimental results clearly confirm CrossTimeNet's superior performance. + +
+
+
+
+
+ + ♻ ☆ A Survey of Out-of-distribution Generalization for Graph Machine + Learning from a Causal View + + +
+ Graph machine learning (GML) has been successfully applied across a wide +range of tasks. Nonetheless, GML faces significant challenges in generalizing +over out-of-distribution (OOD) data, which raises concerns about its wider +applicability. Recent advancements have underscored the crucial role of +causality-driven approaches in overcoming these generalization challenges. +Distinct from traditional GML methods that primarily rely on statistical +dependencies, causality-focused strategies delve into the underlying causal +mechanisms of data generation and model prediction, thus significantly +improving the generalization of GML across different environments. This paper +offers a thorough review of recent progress in causality-involved GML +generalization. We elucidate the fundamental concepts of employing causality to +enhance graph model generalization and categorize the various approaches, +providing detailed descriptions of their methodologies and the connections +among them. Furthermore, we explore the incorporation of causality in other +related important areas of trustworthy GML, such as explanation, fairness, and +robustness. Concluding with a discussion on potential future research +directions, this review seeks to articulate the continuing development and +future potential of causality in enhancing the trustworthiness of graph machine +learning. + +
+
+ comment: 15 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Achieving Dimension-Free Communication in Federated Learning via + Zeroth-Order Optimization + + +
+ Federated Learning (FL) offers a promising framework for collaborative and +privacy-preserving machine learning across distributed data sources. However, +the substantial communication costs associated with FL significantly challenge +its efficiency. Specifically, in each communication round, the communication +costs scale linearly with the model's dimension, which presents a formidable +obstacle, especially in large model scenarios. Despite various +communication-efficient strategies, the intrinsic dimension-dependent +communication cost remains a major bottleneck for current FL implementations. +This paper proposes a novel dimension-free communication algorithm -- DeComFL, +which leverages the zeroth-order optimization techniques and reduces the +communication cost from $\mathscr{O}(d)$ to $\mathscr{O}(1)$ by transmitting +only a constant number of scalar values between clients and the server in each +round, regardless of the dimension $d$ of the model parameters. Theoretically, +in non-convex functions, we prove that our algorithm achieves state-of-the-art +rates, which show a linear speedup of the number of clients and local steps +under standard assumptions. With additional low effective rank assumption, we +can further show the convergence rate is independent of the model dimension $d$ +as well. Empirical evaluations, encompassing both classic deep learning +training and large language model fine-tuning, demonstrate significant +reductions in communication overhead. Notably, DeComFL achieves this by +transmitting only around 1MB of data in total between the server and a client +to fine-tune a model with billions of parameters. + +
+
+
+
+
+ + ♻ ☆ Meta Clustering of Neural Bandits + + +
+ The contextual bandit has been identified as a powerful framework to +formulate the recommendation process as a sequential decision-making process, +where each item is regarded as an arm and the objective is to minimize the +regret of $T$ rounds. In this paper, we study a new problem, Clustering of +Neural Bandits, by extending previous work to the arbitrary reward function, to +strike a balance between user heterogeneity and user correlations in the +recommender system. To solve this problem, we propose a novel algorithm called +M-CNB, which utilizes a meta-learner to represent and rapidly adapt to dynamic +clusters, along with an informative Upper Confidence Bound (UCB)-based +exploration strategy. We provide an instance-dependent performance guarantee +for the proposed algorithm that withstands the adversarial context, and we +further prove the guarantee is at least as good as state-of-the-art (SOTA) +approaches under the same assumptions. In extensive experiments conducted in +both recommendation and online classification scenarios, M-CNB outperforms SOTA +baselines. This shows the effectiveness of the proposed approach in improving +online recommendation and online classification performance. + +
+
+ comment: Accepted by KDD 2024 +
+
+
+
+
+ + ♻ ☆ Gradient Flow Based Phase-Field Modeling Using Separable Neural Networks + + +
+ The $L^2$ gradient flow of the Ginzburg-Landau free energy functional leads +to the Allen Cahn equation that is widely used for modeling phase separation. +Machine learning methods for solving the Allen-Cahn equation in its strong form +suffer from inaccuracies in collocation techniques, errors in computing +higher-order spatial derivatives through automatic differentiation, and the +large system size required by the space-time approach. To overcome these +limitations, we propose a separable neural network-based approximation of the +phase field in a minimizing movement scheme to solve the aforementioned +gradient flow problem. At each time step, the separable neural network is used +to approximate the phase field in space through a low-rank tensor decomposition +thereby accelerating the derivative calculations. The minimizing movement +scheme naturally allows for the use of Gauss quadrature technique to compute +the functional. A `$tanh$' transformation is applied on the neural +network-predicted phase field to strictly bounds the solutions within the +values of the two phases. For this transformation, a theoretical guarantee for +energy stability of the minimizing movement scheme is established. Our results +suggest that bounding the solution through this transformation is the key to +effectively model sharp interfaces through separable neural network. The +proposed method outperforms the state-of-the-art machine learning methods for +phase separation problems and is an order of magnitude faster than the finite +element method. + +
+
+
+
+
+ + ♻ ☆ The Star Geometry of Critic-Based Regularizer Learning NeurIPS 2024 + + +
+ Variational regularization is a classical technique to solve statistical +inference tasks and inverse problems, with modern data-driven approaches +parameterizing regularizers via deep neural networks showcasing impressive +empirical performance. Recent works along these lines learn task-dependent +regularizers. This is done by integrating information about the measurements +and ground-truth data in an unsupervised, critic-based loss function, where the +regularizer attributes low values to likely data and high values to unlikely +data. However, there is little theory about the structure of regularizers +learned via this process and how it relates to the two data distributions. To +make progress on this challenge, we initiate a study of optimizing critic-based +loss functions to learn regularizers over a particular family of regularizers: +gauges (or Minkowski functionals) of star-shaped bodies. This family contains +regularizers that are commonly employed in practice and shares properties with +regularizers parameterized by deep neural networks. We specifically investigate +critic-based losses derived from variational representations of statistical +distances between probability measures. By leveraging tools from star geometry +and dual Brunn-Minkowski theory, we illustrate how these losses can be +interpreted as dual mixed volumes that depend on the data distribution. This +allows us to derive exact expressions for the optimal regularizer in certain +cases. Finally, we identify which neural network architectures give rise to +such star body gauges and when do such regularizers have favorable properties +for optimization. More broadly, this work highlights how the tools of star +geometry can aid in understanding the geometry of unsupervised regularizer +learning. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ LongLaMP: A Benchmark for Personalized Long-form Text Generation EMNLP + + +
+ Long-text generation is seemingly ubiquitous in real-world applications of +large language models such as generating an email or writing a review. Despite +the fundamental importance and prevalence of long-text generation in many +practical applications, existing work on personalized generation has focused on +the generation of very short text. To overcome these limitations, we study the +problem of personalized long-text generation, that is, generating long-text +that is personalized for a specific user while being practically useful for the +vast majority of real-world applications that naturally require the generation +of longer text. In this work, we demonstrate the importance of user-specific +personalization for long-text generation tasks and develop the Long-text +Language Model Personalization (LongLaMP) Benchmark. LongLaMP provides a +comprehensive and diverse evaluation framework for personalized long-text +generation. Extensive experiments on LongLaMP for zero-shot and fine-tuned +language tasks demonstrate the effectiveness of the proposed benchmark and its +utility for developing and evaluating techniques for personalized long-text +generation across a wide variety of long-text generation tasks. The results +highlight the importance of personalization across a wide variety of long-text +generation tasks. Finally, we release the benchmark for others to use for this +important problem. + +
+
+ comment: 9 pages, 4 figures, 20 tables(including appendix) submitted to EMNLP +
+
+
+
+
+ + ♻ ☆ Mask-Encoded Sparsification: Mitigating Biased Gradients in + Communication-Efficient Split Learning + + +
+ This paper introduces a novel framework designed to achieve a high +compression ratio in Split Learning (SL) scenarios where resource-constrained +devices are involved in large-scale model training. Our investigations +demonstrate that compressing feature maps within SL leads to biased gradients +that can negatively impact the convergence rates and diminish the +generalization capabilities of the resulting models. Our theoretical analysis +provides insights into how compression errors critically hinder SL performance, +which previous methodologies underestimate. To address these challenges, we +employ a narrow bit-width encoded mask to compensate for the sparsification +error without increasing the order of time complexity. Supported by rigorous +theoretical analysis, our framework significantly reduces compression errors +and accelerates the convergence. Extensive experiments also verify that our +method outperforms existing solutions regarding training efficiency and +communication complexity. + +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations RSS + + +
+ Imitation learning provides an efficient way to teach robots dexterous +skills; however, learning complex skills robustly and generalizablely usually +consumes large amounts of human demonstrations. To tackle this challenging +problem, we present 3D Diffusion Policy (DP3), a novel visual imitation +learning approach that incorporates the power of 3D visual representations into +diffusion policies, a class of conditional action generative models. The core +design of DP3 is the utilization of a compact 3D visual representation, +extracted from sparse point clouds with an efficient point encoder. In our +experiments involving 72 simulation tasks, DP3 successfully handles most tasks +with just 10 demonstrations and surpasses baselines with a 24.2% relative +improvement. In 4 real robot tasks, DP3 demonstrates precise control with a +high success rate of 85%, given only 40 demonstrations of each task, and shows +excellent generalization abilities in diverse aspects, including space, +viewpoint, appearance, and instance. Interestingly, in real robot experiments, +DP3 rarely violates safety requirements, in contrast to baseline methods which +frequently do, necessitating human intervention. Our extensive evaluation +highlights the critical importance of 3D representations in real-world robot +learning. Videos, code, and data are available on +https://3d-diffusion-policy.github.io . + +
+
+ comment: Published at Robotics: Science and Systems (RSS) 2024. Videos, code, + and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+
+
+
+ + Artificial Intelligence 128 + +
+
+
+ + ☆ PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation ECCV 2024 + + +
+ We present PhysGen, a novel image-to-video generation method that converts a +single image and an input condition (e.g., force and torque applied to an +object in the image) to produce a realistic, physically plausible, and +temporally consistent video. Our key insight is to integrate model-based +physical simulation with a data-driven video generation process, enabling +plausible image-space dynamics. At the heart of our system are three core +components: (i) an image understanding module that effectively captures the +geometry, materials, and physical parameters of the image; (ii) an image-space +dynamics simulation model that utilizes rigid-body physics and inferred +parameters to simulate realistic behaviors; and (iii) an image-based rendering +and refinement module that leverages generative video diffusion to produce +realistic video footage featuring the simulated motion. The resulting videos +are realistic in both physics and appearance and are even precisely +controllable, showcasing superior results over existing data-driven +image-to-video generation works through quantitative comparison and +comprehensive user study. PhysGen's resulting videos can be used for various +downstream applications, such as turning an image into a realistic animation or +allowing users to interact with the image and create various dynamics. Project +page: https://stevenlsw.github.io/physgen/ + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://stevenlsw.github.io/physgen/ +
+
+
+
+
+ + ☆ Exploring Token Pruning in Vision State Space Models NeurIPS'24 + + +
+ State Space Models (SSMs) have the advantage of keeping linear computational +complexity compared to attention modules in transformers, and have been applied +to vision tasks as a new type of powerful vision foundation model. Inspired by +the observations that the final prediction in vision transformers (ViTs) is +only based on a subset of most informative tokens, we take the novel step of +enhancing the efficiency of SSM-based vision models through token-based +pruning. However, direct applications of existing token pruning techniques +designed for ViTs fail to deliver good performance, even with extensive +fine-tuning. To address this issue, we revisit the unique computational +characteristics of SSMs and discover that naive application disrupts the +sequential token positions. This insight motivates us to design a novel and +general token pruning method specifically for SSM-based vision models. We first +introduce a pruning-aware hidden state alignment method to stabilize the +neighborhood of remaining tokens for performance enhancement. Besides, based on +our detailed analysis, we propose a token importance evaluation method adapted +for SSM models, to guide the token pruning. With efficient implementation and +practical acceleration methods, our method brings actual speedup. Extensive +experiments demonstrate that our approach can achieve significant computation +reduction with minimal impact on performance across different tasks. Notably, +we achieve 81.7\% accuracy on ImageNet with a 41.6\% reduction in the FLOPs for +pruned PlainMamba-L3. Furthermore, our work provides deeper insights into +understanding the behavior of SSM-based vision models for future research. + +
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ☆ ProMerge: Prompt and Merge for Unsupervised Instance Segmentation ECCV2024 + + +
+ Unsupervised instance segmentation aims to segment distinct object instances +in an image without relying on human-labeled data. This field has recently seen +significant advancements, partly due to the strong local correspondences +afforded by rich visual feature representations from self-supervised models +(e.g., DINO). Recent state-of-the-art approaches use self-supervised features +to represent images as graphs and solve a generalized eigenvalue system (i.e., +normalized-cut) to generate foreground masks. While effective, this strategy is +limited by its attendant computational demands, leading to slow inference +speeds. In this paper, we propose Prompt and Merge (ProMerge), which leverages +self-supervised visual features to obtain initial groupings of patches and +applies a strategic merging to these segments, aided by a sophisticated +background-based mask pruning technique. ProMerge not only yields competitive +results but also offers a significant reduction in inference time compared to +state-of-the-art normalized-cut-based approaches. Furthermore, when training an +object detector using our mask predictions as pseudo-labels, the resulting +detector surpasses the current leading unsupervised model on various +challenging instance segmentation benchmarks. + +
+
+ comment: ECCV2024 camera-ready +
+
+
+
+
+ + ☆ $O(d/T)$ Convergence Theory for Diffusion Probabilistic Models under + Minimal Assumptions + + +
+ Score-based diffusion models, which generate new data by learning to reverse +a diffusion process that perturbs data from the target distribution into noise, +have achieved remarkable success across various generative tasks. Despite their +superior empirical performance, existing theoretical guarantees are often +constrained by stringent assumptions or suboptimal convergence rates. In this +paper, we establish a fast convergence theory for a popular SDE-based sampler +under minimal assumptions. Our analysis shows that, provided +$\ell_{2}$-accurate estimates of the score functions, the total variation +distance between the target and generated distributions is upper bounded by +$O(d/T)$ (ignoring logarithmic factors), where $d$ is the data dimensionality +and $T$ is the number of steps. This result holds for any target distribution +with finite first-order moment. To our knowledge, this improves upon existing +convergence theory for both the SDE-based sampler and another ODE-based +sampler, while imposing minimal assumptions on the target data distribution and +score estimates. This is achieved through a novel set of analytical tools that +provides a fine-grained characterization of how the error propagates at each +step of the reverse process. + +
+
+
+
+
+ + ☆ LML: Language Model Learning a Dataset for Data-Augmented Prediction + + +
+ This paper introduces a new approach to using Large Language Models (LLMs) +for classification tasks, which are typically handled using Machine Learning +(ML) models. Unlike ML models that rely heavily on data cleaning and feature +engineering, this method streamlines the process using LLMs. This paper +proposes a new concept called "Language Model Learning (LML)" powered by a new +method called "Data-Augmented Prediction (DAP)". The classification is +performed by LLMs using a method similar to humans manually exploring and +understanding the data and deciding classifications using data as a reference. +Training data is summarized and evaluated to determine the features that lead +to the classification of each label the most. In the process of DAP, the system +uses the data summary to automatically create a query, which is used to +retrieve relevant rows from the dataset. A classification is generated by the +LLM using data summary and relevant rows, ensuring satisfactory accuracy even +with complex data. Usage of data summary and similar data in DAP ensures +context-aware decision-making. The proposed method uses the words "Act as an +Explainable Machine Learning Model" in the prompt to enhance the +interpretability of the predictions by allowing users to review the logic +behind each prediction. In some test cases, the system scored an accuracy above +90%, proving the effectiveness of the system and its potential to outperform +conventional ML models in various scenarios. The code is available at +https://github.com/Pro-GenAI/LML-DAP + +
+
+ comment: First version +
+
+
+
+
+ + ☆ Unconditional stability of a recurrent neural circuit implementing + divisive normalization + + +
+ Stability in recurrent neural models poses a significant challenge, +particularly in developing biologically plausible neurodynamical models that +can be seamlessly trained. Traditional cortical circuit models are notoriously +difficult to train due to expansive nonlinearities in the dynamical system, +leading to an optimization problem with nonlinear stability constraints that +are difficult to impose. Conversely, recurrent neural networks (RNNs) excel in +tasks involving sequential data but lack biological plausibility and +interpretability. In this work, we address these challenges by linking dynamic +divisive normalization (DN) to the stability of ORGaNICs, a biologically +plausible recurrent cortical circuit model that dynamically achieves DN and has +been shown to simulate a wide range of neurophysiological phenomena. By using +the indirect method of Lyapunov, we prove the remarkable property of +unconditional local stability for an arbitrary-dimensional ORGaNICs circuit +when the recurrent weight matrix is the identity. We thus connect ORGaNICs to a +system of coupled damped harmonic oscillators, which enables us to derive the +circuit's energy function, providing a normative principle of what the circuit, +and individual neurons, aim to accomplish. Further, for a generic recurrent +weight matrix, we prove the stability of the 2D model and demonstrate +empirically that stability holds in higher dimensions. Finally, we show that +ORGaNICs can be trained by backpropagation through time without gradient +clipping/scaling, thanks to its intrinsic stability property and adaptive time +constants, which address the problems of exploding, vanishing, and oscillating +gradients. By evaluating the model's performance on RNN benchmarks, we find +that ORGaNICs outperform alternative neurodynamical models on static image +classification tasks and perform comparably to LSTMs on sequential tasks. + +
+
+
+
+
+ + ☆ Building Trust Through Voice: How Vocal Tone Impacts User Perception of + Attractiveness of Voice Assistants + + +
+ Voice Assistants (VAs) are popular for simple tasks, but users are often +hesitant to use them for complex activities like online shopping. We explored +whether the vocal characteristics like the VA's vocal tone, can make VAs +perceived as more attractive and trustworthy to users for complex tasks. Our +findings show that the tone of the VA voice significantly impacts its perceived +attractiveness and trustworthiness. Participants in our experiment were more +likely to be attracted to VAs with positive or neutral tones and ultimately +trusted the VAs they found more attractive. We conclude that VA's perceived +trustworthiness can be enhanced through thoughtful voice design, incorporating +a variety of vocal tones. + +
+
+ comment: Extended Abstract +
+
+
+
+
+ + ☆ From Seconds to Hours: Reviewing MultiModal Large Language Models on + Comprehensive Long Video Understanding + + +
+ The integration of Large Language Models (LLMs) with visual encoders has +recently shown promising performance in visual understanding tasks, leveraging +their inherent capability to comprehend and generate human-like text for visual +reasoning. Given the diverse nature of visual data, MultiModal Large Language +Models (MM-LLMs) exhibit variations in model designing and training for +understanding images, short videos, and long videos. Our paper focuses on the +substantial differences and unique challenges posed by long video understanding +compared to static image and short video understanding. Unlike static images, +short videos encompass sequential frames with both spatial and within-event +temporal information, while long videos consist of multiple events with +between-event and long-term temporal information. In this survey, we aim to +trace and summarize the advancements of MM-LLMs from image understanding to +long video understanding. We review the differences among various visual +understanding tasks and highlight the challenges in long video understanding, +including more fine-grained spatiotemporal details, dynamic events, and +long-term dependencies. We then provide a detailed summary of the advancements +in MM-LLMs in terms of model design and training methodologies for +understanding long videos. Finally, we compare the performance of existing +MM-LLMs on video understanding benchmarks of various lengths and discuss +potential future directions for MM-LLMs in long video understanding. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ AIPatient: Simulating Patients with EHRs and LLM Powered Agentic + Workflow + + +
+ Simulated patient systems play a crucial role in modern medical education and +research, providing safe, integrative learning environments and enabling +clinical decision-making simulations. Large Language Models (LLM) could advance +simulated patient systems by replicating medical conditions and patient-doctor +interactions with high fidelity and low cost. However, ensuring the +effectiveness and trustworthiness of these systems remains a challenge, as they +require a large, diverse, and precise patient knowledgebase, along with a +robust and stable knowledge diffusion to users. Here, we developed AIPatient, +an advanced simulated patient system with AIPatient Knowledge Graph (AIPatient +KG) as the input and the Reasoning Retrieval-Augmented Generation (Reasoning +RAG) agentic workflow as the generation backbone. AIPatient KG samples data +from Electronic Health Records (EHRs) in the Medical Information Mart for +Intensive Care (MIMIC)-III database, producing a clinically diverse and +relevant cohort of 1,495 patients with high knowledgebase validity (F1 0.89). +Reasoning RAG leverages six LLM powered agents spanning tasks including +retrieval, KG query generation, abstraction, checker, rewrite, and +summarization. This agentic framework reaches an overall accuracy of 94.15% in +EHR-based medical Question Answering (QA), outperforming benchmarks that use +either no agent or only partial agent integration. Our system also presents +high readability (median Flesch Reading Ease 77.23; median Flesch Kincaid Grade +5.6), robustness (ANOVA F-value 0.6126, p<0.1), and stability (ANOVA F-value +0.782, p<0.1). The promising performance of the AIPatient system highlights its +potential to support a wide range of applications, including medical education, +model evaluation, and system integration. + +
+
+ comment: 42 pages, 6 figures, 7 tables +
+
+
+
+
+ + ☆ Soft Measures for Extracting Causal Collective Intelligence EMNLP 2024 + + +
+ Understanding and modeling collective intelligence is essential for +addressing complex social systems. Directed graphs called fuzzy cognitive maps +(FCMs) offer a powerful tool for encoding causal mental models, but extracting +high-integrity FCMs from text is challenging. This study presents an approach +using large language models (LLMs) to automate FCM extraction. We introduce +novel graph-based similarity measures and evaluate them by correlating their +outputs with human judgments through the Elo rating system. Results show +positive correlations with human evaluations, but even the best-performing +measure exhibits limitations in capturing FCM nuances. Fine-tuning LLMs +improves performance, but existing measures still fall short. This study +highlights the need for soft similarity measures tailored to FCM extraction, +advancing collective intelligence modeling with NLP. + +
+
+ comment: Camera-ready version accepted for publication in the EMNLP 2024 + Workshop NLP4Science +
+
+
+
+
+ + ☆ Improving Visual Object Tracking through Visual Prompting + + +
+ Learning a discriminative model to distinguish a target from its surrounding +distractors is essential to generic visual object tracking. Dynamic target +representation adaptation against distractors is challenging due to the limited +discriminative capabilities of prevailing trackers. We present a new visual +Prompting mechanism for generic Visual Object Tracking (PiVOT) to address this +issue. PiVOT proposes a prompt generation network with the pre-trained +foundation model CLIP to automatically generate and refine visual prompts, +enabling the transfer of foundation model knowledge for tracking. While CLIP +offers broad category-level knowledge, the tracker, trained on +instance-specific data, excels at recognizing unique object instances. Thus, +PiVOT first compiles a visual prompt highlighting potential target locations. +To transfer the knowledge of CLIP to the tracker, PiVOT leverages CLIP to +refine the visual prompt based on the similarities between candidate objects +and the reference templates across potential targets. Once the visual prompt is +refined, it can better highlight potential target locations, thereby reducing +irrelevant prompt information. With the proposed prompting mechanism, the +tracker can generate improved instance-aware feature maps through the guidance +of the visual prompt, thus effectively reducing distractors. The proposed +method does not involve CLIP during training, thereby keeping the same training +complexity and preserving the generalization capability of the pretrained +foundation model. Extensive experiments across multiple benchmarks indicate +that PiVOT, using the proposed prompting method can suppress distracting +objects and enhance the tracker. + +
+
+ comment: Accepted and to appear in IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ Multi-Source Hard and Soft Information Fusion Approach for Accurate + Cryptocurrency Price Movement Prediction + + +
+ One of the most important challenges in the financial and cryptocurrency +field is accurately predicting cryptocurrency price trends. Leveraging +artificial intelligence (AI) is beneficial in addressing this challenge. +Cryptocurrency markets, marked by substantial growth and volatility, attract +investors and scholars keen on deciphering and forecasting cryptocurrency price +movements. The vast and diverse array of data available for such predictions +increases the complexity of the task. In our study, we introduce a novel +approach termed hard and soft information fusion (HSIF) to enhance the accuracy +of cryptocurrency price movement forecasts. The hard information component of +our approach encompasses historical price records alongside technical +indicators. Complementing this, the soft data component extracts from X +(formerly Twitter), encompassing news headlines and tweets about the +cryptocurrency. To use this data, we use the Bidirectional Encoder +Representations from Transformers (BERT)-based sentiment analysis method, +financial BERT (FinBERT), which performs best. Finally, our model feeds on the +information set including processed hard and soft data. We employ the +bidirectional long short-term memory (BiLSTM) model because processing +information in both forward and backward directions can capture long-term +dependencies in sequential information. Our empirical findings emphasize the +superiority of the HSIF approach over models dependent on single-source data by +testing on Bitcoin-related data. By fusing hard and soft information on Bitcoin +dataset, our model has about 96.8\% accuracy in predicting price movement. +Incorporating information enables our model to grasp the influence of social +sentiment on price fluctuations, thereby supplementing the technical +analysis-based predictions derived from hard information. + +
+
+
+
+
+ + ☆ Suicide Phenotyping from Clinical Notes in Safety-Net Psychiatric + Hospital Using Multi-Label Classification with Pre-Trained Language Models + + +
+ Accurate identification and categorization of suicidal events can yield +better suicide precautions, reducing operational burden, and improving care +quality in high-acuity psychiatric settings. Pre-trained language models offer +promise for identifying suicidality from unstructured clinical narratives. We +evaluated the performance of four BERT-based models using two fine-tuning +strategies (multiple single-label and single multi-label) for detecting +coexisting suicidal events from 500 annotated psychiatric evaluation notes. The +notes were labeled for suicidal ideation (SI), suicide attempts (SA), exposure +to suicide (ES), and non-suicidal self-injury (NSSI). RoBERTa outperformed +other models using binary relevance (acc=0.86, F1=0.78). MentalBERT (F1=0.74) +also exceeded BioClinicalBERT (F1=0.72). RoBERTa fine-tuned with a single +multi-label classifier further improved performance (acc=0.88, F1=0.81), +highlighting that models pre-trained on domain-relevant data and the single +multi-label classification strategy enhance efficiency and performance. + Keywords: EHR-based Phynotyping; Natural Language Processing; Secondary Use +of EHR Data; Suicide Classification; BERT-based Model; Psychiatry; Mental +Health + +
+
+ comment: submitted to AMIA Informatics Summit 2025 as a conference paper +
+
+
+
+
+ + ☆ UniEmoX: Cross-modal Semantic-Guided Large-Scale Pretraining for + Universal Scene Emotion Perception + + +
+ Visual emotion analysis holds significant research value in both computer +vision and psychology. However, existing methods for visual emotion analysis +suffer from limited generalizability due to the ambiguity of emotion perception +and the diversity of data scenarios. To tackle this issue, we introduce +UniEmoX, a cross-modal semantic-guided large-scale pretraining framework. +Inspired by psychological research emphasizing the inseparability of the +emotional exploration process from the interaction between individuals and +their environment, UniEmoX integrates scene-centric and person-centric +low-level image spatial structural information, aiming to derive more nuanced +and discriminative emotional representations. By exploiting the similarity +between paired and unpaired image-text samples, UniEmoX distills rich semantic +knowledge from the CLIP model to enhance emotional embedding representations +more effectively. To the best of our knowledge, this is the first large-scale +pretraining framework that integrates psychological theories with contemporary +contrastive learning and masked image modeling techniques for emotion analysis +across diverse scenarios. Additionally, we develop a visual emotional dataset +titled Emo8. Emo8 samples cover a range of domains, including cartoon, natural, +realistic, science fiction and advertising cover styles, covering nearly all +common emotional scenes. Comprehensive experiments conducted on six benchmark +datasets across two downstream tasks validate the effectiveness of UniEmoX. The +source code is available at https://github.com/chincharles/u-emo. + +
+
+ comment: Submitted to TIP +
+
+
+
+
+ + ☆ CESNET-TimeSeries24: Time Series Dataset for Network Traffic Anomaly + Detection and Forecasting + + +
+ Anomaly detection in network traffic is crucial for maintaining the security +of computer networks and identifying malicious activities. One of the primary +approaches to anomaly detection are methods based on forecasting. Nevertheless, +extensive real-world network datasets for forecasting and anomaly detection +techniques are missing, potentially causing performance overestimation of +anomaly detection algorithms. This manuscript addresses this gap by introducing +a dataset comprising time series data of network entities' behavior, collected +from the CESNET3 network. The dataset was created from 40 weeks of network +traffic of 275 thousand active IP addresses. The ISP origin of the presented +data ensures a high level of variability among network entities, which forms a +unique and authentic challenge for forecasting and anomaly detection models. It +provides valuable insights into the practical deployment of forecast-based +anomaly detection approaches. + +
+
+
+
+
+ + ☆ Individuation in Neural Models with and without Visual Grounding + + +
+ We show differences between a language-and-vision model CLIP and two +text-only models - FastText and SBERT - when it comes to the encoding of +individuation information. We study latent representations that CLIP provides +for substrates, granular aggregates, and various numbers of objects. We +demonstrate that CLIP embeddings capture quantitative differences in +individuation better than models trained on text-only data. Moreover, the +individuation hierarchy we deduce from the CLIP embeddings agrees with the +hierarchies proposed in linguistics and cognitive science. + +
+
+
+
+
+ + ☆ Positional Encoder Graph Quantile Neural Networks for Geographic Data + + +
+ Positional Encoder Graph Neural Networks (PE-GNNs) are a leading approach for +modeling continuous spatial data. However, they often fail to produce +calibrated predictive distributions, limiting their effectiveness for +uncertainty quantification. We introduce the Positional Encoder Graph Quantile +Neural Network (PE-GQNN), a novel method that integrates PE-GNNs, Quantile +Neural Networks, and recalibration techniques in a fully nonparametric +framework, requiring minimal assumptions about the predictive distributions. We +propose a new network architecture that, when combined with a quantile-based +loss function, yields accurate and reliable probabilistic models without +increasing computational complexity. Our approach provides a flexible, robust +framework for conditional density estimation, applicable beyond spatial data +contexts. We further introduce a structured method for incorporating a KNN +predictor into the model while avoiding data leakage through the GNN layer +operation. Experiments on benchmark datasets demonstrate that PE-GQNN +significantly outperforms existing state-of-the-art methods in both predictive +accuracy and uncertainty quantification. + +
+
+ comment: 17 main text pages, 4 figures +
+
+
+
+
+ + ☆ Mitigating Selection Bias with Node Pruning and Auxiliary Options + + +
+ Large language models (LLMs) often show unwarranted preference for certain +choice options when responding to multiple-choice questions, posing significant +reliability concerns in LLM-automated systems. To mitigate this selection bias +problem, previous solutions utilized debiasing methods to adjust the model's +input and/or output. Our work, in contrast, investigates the model's internal +representation of the selection bias. Specifically, we introduce a novel +debiasing approach, Bias Node Pruning (BNP), which eliminates the linear layer +parameters that contribute to the bias. Furthermore, we present Auxiliary +Option Injection (AOI), a simple yet effective input modification technique for +debiasing, which is compatible even with black-box LLMs. To provide a more +systematic evaluation of selection bias, we review existing metrics and +introduce Choice Kullback-Leibler Divergence (CKLD), which addresses the +insensitivity of the commonly used metrics to label imbalance. Experiments show +that our methods are robust and adaptable across various datasets when applied +to three LLMs. + +
+
+
+
+
+ + ☆ MECG-E: Mamba-based ECG Enhancer for Baseline Wander Removal + + +
+ Electrocardiogram (ECG) is an important non-invasive method for diagnosing +cardiovascular disease. However, ECG signals are susceptible to noise +contamination, such as electrical interference or signal wandering, which +reduces diagnostic accuracy. Various ECG denoising methods have been proposed, +but most existing methods yield suboptimal performance under very noisy +conditions or require several steps during inference, leading to latency during +online processing. In this paper, we propose a novel ECG denoising model, +namely Mamba-based ECG Enhancer (MECG-E), which leverages the Mamba +architecture known for its fast inference and outstanding nonlinear mapping +capabilities. Experimental results indicate that MECG-E surpasses several +well-known existing models across multiple metrics under different noise +conditions. Additionally, MECG-E requires less inference time than +state-of-the-art diffusion-based ECG denoisers, demonstrating the model's +functionality and efficiency. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Early diagnosis of Alzheimer's disease from MRI images with deep + learning model + + +
+ It is acknowledged that the most common cause of dementia worldwide is +Alzheimer's disease (AD). This condition progresses in severity from mild to +severe and interferes with people's everyday routines. Early diagnosis plays a +critical role in patient care and clinical trials. Convolutional neural +networks (CNN) are used to create a framework for identifying specific disease +features from MRI scans Classification of dementia involves approaches such as +medical history review, neuropsychological tests, and magnetic resonance +imaging (MRI). However, the image dataset obtained from Kaggle faces a +significant issue of class imbalance, which requires equal distribution of +samples from each class to address. In this article, to address this imbalance, +the Synthetic Minority Oversampling Technique (SMOTE) is utilized. Furthermore, +a pre-trained convolutional neural network has been applied to the DEMNET +dementia network to extract key features from AD images. The proposed model +achieved an impressive accuracy of 98.67%. + +
+
+ comment: 7 pages, 3 figures, Presented at the 20-th CSI International + Symposium on Artificial Intelligence and Signal Processing (AISP) 21-22 + February, 2024, Mazandaran University of Science and Technology, Babol, Iran +
+
+
+
+
+ + ☆ LLMs4Synthesis: Leveraging Large Language Models for Scientific + Synthesis + + +
+ In response to the growing complexity and volume of scientific literature, +this paper introduces the LLMs4Synthesis framework, designed to enhance the +capabilities of Large Language Models (LLMs) in generating high-quality +scientific syntheses. This framework addresses the need for rapid, coherent, +and contextually rich integration of scientific insights, leveraging both +open-source and proprietary LLMs. It also examines the effectiveness of LLMs in +evaluating the integrity and reliability of these syntheses, alleviating +inadequacies in current quantitative metrics. Our study contributes to this +field by developing a novel methodology for processing scientific papers, +defining new synthesis types, and establishing nine detailed quality criteria +for evaluating syntheses. The integration of LLMs with reinforcement learning +and AI feedback is proposed to optimize synthesis quality, ensuring alignment +with established criteria. The LLMs4Synthesis framework and its components are +made available, promising to enhance both the generation and evaluation +processes in scientific research synthesis. + +
+
+ comment: 12 pages, 3 figures, Accepted to JCDL 2024 Research Track +
+
+
+
+
+ + ☆ Esports Debut as a Medal Event at 2023 Asian Games: Exploring Public + Perceptions with BERTopic and GPT-4 Topic Fine-Tuning + + +
+ This study examined the public opinions of esports at the 2023 Asian Games +and value co-creation during the event using an LLM-enhanced BERTopic modeling +analysis. We identified five major themes representing public perceptions, as +well as how major stakeholders co-created value within and beyond the esports +ecosystem. Key findings highlighted the strategic use of social media marketing +to influence public opinion and promote esports events and brands, emphasizing +the importance of event logistics and infrastructure. Additionally, the study +revealed the co-creation value contributed by stakeholders outside the +traditional esports ecosystem, particularly in promoting national +representation and performance. Our findings supported the ongoing efforts to +legitimize esports as a sport, noting that mainstream recognition remains a +challenge. The inclusion of esports as a medal event showcased broader +acceptance and helped mitigate negative public perceptions. Moreover, +contributions from non-traditional stakeholders underscored the value of +cross-subcultural collaborations in esports. + +
+
+
+
+
+ + ☆ Hierarchical Federated ADMM + + +
+ In this paper, we depart from the widely-used gradient descent-based +hierarchical federated learning (FL) algorithms to develop a novel hierarchical +FL framework based on the alternating direction method of multipliers (ADMM). +Within this framework, we propose two novel FL algorithms, which both use ADMM +in the top layer: one that employs ADMM in the lower layer and another that +uses the conventional gradient descent-based approach. The proposed framework +enhances privacy, and experiments demonstrate the superiority of the proposed +algorithms compared to the conventional algorithms in terms of learning +convergence and accuracy. Additionally, gradient descent on the lower layer +performs well even if the number of local steps is very limited, while ADMM on +both layers lead to better performance otherwise. + +
+
+
+
+
+ + A Survey on the Honesty of Large Language Models + + +
+ Honesty is a fundamental principle for aligning large language models (LLMs) +with human values, requiring these models to recognize what they know and don't +know and be able to faithfully express their knowledge. Despite promising, +current LLMs still exhibit significant dishonest behaviors, such as confidently +presenting wrong answers or failing to express what they know. In addition, +research on the honesty of LLMs also faces challenges, including varying +definitions of honesty, difficulties in distinguishing between known and +unknown knowledge, and a lack of comprehensive understanding of related +research. To address these issues, we provide a survey on the honesty of LLMs, +covering its clarification, evaluation approaches, and strategies for +improvement. Moreover, we offer insights for future research, aiming to inspire +further exploration in this important area. + +
+
+ comment: Project Page: https://github.com/SihengLi99/LLM-Honesty-Survey +
+
+
+
+
+ + ☆ HardCore Generation: Generating Hard UNSAT Problems for Data + Augmentation + + +
+ Efficiently determining the satisfiability of a boolean equation -- known as +the SAT problem for brevity -- is crucial in various industrial problems. +Recently, the advent of deep learning methods has introduced significant +potential for enhancing SAT solving. However, a major barrier to the +advancement of this field has been the scarcity of large, realistic datasets. +The majority of current public datasets are either randomly generated or +extremely limited, containing only a few examples from unrelated problem +families. These datasets are inadequate for meaningful training of deep +learning methods. In light of this, researchers have started exploring +generative techniques to create data that more accurately reflect SAT problems +encountered in practical situations. These methods have so far suffered from +either the inability to produce challenging SAT problems or time-scalability +obstacles. In this paper we address both by identifying and manipulating the +key contributors to a problem's ``hardness'', known as cores. Although some +previous work has addressed cores, the time costs are unacceptably high due to +the expense of traditional heuristic core detection techniques. We introduce a +fast core detection procedure that uses a graph neural network. Our empirical +results demonstrate that we can efficiently generate problems that remain hard +to solve and retain key attributes of the original example problems. We show +via experiment that the generated synthetic SAT problems can be used in a data +augmentation setting to provide improved prediction of solver runtimes. + +
+
+
+
+
+ + ☆ State-of-the-Art Periorbital Distance Prediction and Disease + Classification Using Periorbital Features + + +
+ Periorbital distances and features around the eyes and lids hold valuable +information for disease quantification and monitoring of surgical and medical +intervention. These distances are commonly measured manually, a process that is +both subjective and highly time-consuming. Here, we set out to developed three +deep-learning methods for segmentation and periorbital distance prediction, and +also evaluate the utility of periorbital distances for disease classification. +The MAE of our deep learning predicted distances was less than or very close to +the error observed between trained human annotators. We compared our models to +the current state-of-the-art (SOTA) method for periorbital distance prediction +and found that our methods outperformed SOTA on all of our datasets on all but +one periorbital measurement. We also show that robust segmentation can be +achieved on diseased eyes using models trained on open-source, healthy eyes, +and that periorbital distances have can be used as high-quality features in +downstream classification models. Leveraging segmentation networks as +intermediary steps in classification has broad implications for increasing the +generalizability of classification models in ophthalmic plastic and +craniofacial surgery by avoiding the out-of-distribution problem observed in +traditional convolutional neural networks. + +
+
+ comment: 16 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Learning from Demonstration with Implicit Nonlinear Dynamics Models + + +
+ Learning from Demonstration (LfD) is a useful paradigm for training policies +that solve tasks involving complex motions. In practice, the successful +application of LfD requires overcoming error accumulation during policy +execution, i.e. the problem of drift due to errors compounding over time and +the consequent out-of-distribution behaviours. Existing works seek to address +this problem through scaling data collection, correcting policy errors with a +human-in-the-loop, temporally ensembling policy predictions or through learning +the parameters of a dynamical system model. In this work, we propose and +validate an alternative approach to overcoming this issue. Inspired by +reservoir computing, we develop a novel neural network layer that includes a +fixed nonlinear dynamical system with tunable dynamical properties. We validate +the efficacy of our neural network layer on the task of reproducing human +handwriting motions using the LASA Human Handwriting Dataset. Through empirical +experiments we demonstrate that incorporating our layer into existing neural +network architectures addresses the issue of compounding errors in LfD. +Furthermore, we perform a comparative evaluation against existing approaches +including a temporal ensemble of policy predictions and an Echo State Networks +(ESNs) implementation. We find that our approach yields greater policy +precision and robustness on the handwriting task while also generalising to +multiple dynamics regimes and maintaining competitive latency scores. + +
+
+ comment: 21 pages, 9 figures +
+
+
+
+
+ + ☆ OpenObject-NAV: Open-Vocabulary Object-Oriented Navigation Based on + Dynamic Carrier-Relationship Scene Graph + + +
+ In everyday life, frequently used objects like cups often have unfixed +positions and multiple instances within the same category, and their carriers +frequently change as well. As a result, it becomes challenging for a robot to +efficiently navigate to a specific instance. To tackle this challenge, the +robot must capture and update scene changes and plans continuously. However, +current object navigation approaches primarily focus on semantic-level and lack +the ability to dynamically update scene representation. This paper captures the +relationships between frequently used objects and their static carriers. It +constructs an open-vocabulary Carrier-Relationship Scene Graph (CRSG) and +updates the carrying status during robot navigation to reflect the dynamic +changes of the scene. Based on the CRSG, we further propose an instance +navigation strategy that models the navigation process as a Markov Decision +Process. At each step, decisions are informed by Large Language Model's +commonsense knowledge and visual-language feature similarity. We designed a +series of long-sequence navigation tasks for frequently used everyday items in +the Habitat simulator. The results demonstrate that by updating the CRSG, the +robot can efficiently navigate to moved targets. Additionally, we deployed our +algorithm on a real robot and validated its practical effectiveness. + +
+
+ comment: Project website: https://openobject-nav.github.io/ +
+
+
+
+
+ + ☆ Autoregressive Policy Optimization for Constrained Allocation Tasks NeurIPS 2024 + + +
+ Allocation tasks represent a class of problems where a limited amount of +resources must be allocated to a set of entities at each time step. Prominent +examples of this task include portfolio optimization or distributing +computational workloads across servers. Allocation tasks are typically bound by +linear constraints describing practical requirements that have to be strictly +fulfilled at all times. In portfolio optimization, for example, investors may +be obligated to allocate less than 30\% of the funds into a certain industrial +sector in any investment period. Such constraints restrict the action space of +allowed allocations in intricate ways, which makes learning a policy that +avoids constraint violations difficult. In this paper, we propose a new method +for constrained allocation tasks based on an autoregressive process to +sequentially sample allocations for each entity. In addition, we introduce a +novel de-biasing mechanism to counter the initial bias caused by sequential +sampling. We demonstrate the superior performance of our approach compared to a +variety of Constrained Reinforcement Learning (CRL) methods on three distinct +constrained allocation tasks: portfolio optimization, computational workload +distribution, and a synthetic allocation benchmark. Our code is available at: +https://github.com/niklasdbs/paspo + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ Multi-modal Medical Image Fusion For Non-Small Cell Lung Cancer + Classification + + +
+ The early detection and nuanced subtype classification of non-small cell lung +cancer (NSCLC), a predominant cause of cancer mortality worldwide, is a +critical and complex issue. In this paper, we introduce an innovative +integration of multi-modal data, synthesizing fused medical imaging (CT and PET +scans) with clinical health records and genomic data. This unique fusion +methodology leverages advanced machine learning models, notably MedClip and +BEiT, for sophisticated image feature extraction, setting a new standard in +computational oncology. Our research surpasses existing approaches, as +evidenced by a substantial enhancement in NSCLC detection and classification +precision. The results showcase notable improvements across key performance +metrics, including accuracy, precision, recall, and F1-score. Specifically, our +leading multi-modal classifier model records an impressive accuracy of 94.04%. +We believe that our approach has the potential to transform NSCLC diagnostics, +facilitating earlier detection and more effective treatment planning and, +ultimately, leading to superior patient outcomes in lung cancer care. + +
+
+
+
+
+ + ☆ Read Over the Lines: Attacking LLMs and Toxicity Detection Systems with + ASCII Art to Mask Profanity + + +
+ We introduce a novel family of adversarial attacks that exploit the inability +of language models to interpret ASCII art. To evaluate these attacks, we +propose the ToxASCII benchmark and develop two custom ASCII art fonts: one +leveraging special tokens and another using text-filled letter shapes. Our +attacks achieve a perfect 1.0 Attack Success Rate across ten models, including +OpenAI's o1-preview and LLaMA 3.1. + Warning: this paper contains examples of toxic language used for research +purposes. + +
+
+
+
+
+ + ☆ Speech Boosting: Low-Latency Live Speech Enhancement for TWS Earbuds + + +
+ This paper introduces a speech enhancement solution tailored for true +wireless stereo (TWS) earbuds on-device usage. The solution was specifically +designed to support conversations in noisy environments, with active noise +cancellation (ANC) activated. The primary challenges for speech enhancement +models in this context arise from computational complexity that limits +on-device usage and latency that must be less than 3 ms to preserve a live +conversation. To address these issues, we evaluated several crucial design +elements, including the network architecture and domain, design of loss +functions, pruning method, and hardware-specific optimization. Consequently, we +demonstrated substantial improvements in speech enhancement quality compared +with that in baseline models, while simultaneously reducing the computational +complexity and algorithmic latency. + +
+
+ comment: Accepted by Interspeech 2024 +
+
+
+
+
+ + ☆ Semantic Model Component Implementation for Model-driven Semantic + Communications + + +
+ The key feature of model-driven semantic communication is the propagation of +the model. The semantic model component (SMC) is designed to drive the +intelligent model to transmit in the physical channel, allowing the +intelligence to flow through the networks. According to the characteristics of +neural networks with common and individual model parameters, this paper designs +the cross-source-domain and cross-task semantic component model. Considering +that the basic model is deployed on the edge node, the large server node +updates the edge node by transmitting only the semantic component model to the +edge node so that the edge node can handle different sources and different +tasks. In addition, this paper also discusses how channel noise affects the +performance of the model and proposes methods of injection noise and +regularization to improve the noise resistance of the model. Experiments show +that SMCs use smaller model parameters to achieve cross-source, cross-task +functionality while maintaining performance and improving the model's tolerance +to noise. Finally, a component transfer-based unmanned vehicle tracking +prototype was implemented to verify the feasibility of model components in +practical applications. + +
+
+
+
+
+ + ☆ KALE-LM: Unleash The Power Of AI For Science Via Knowledge And Logic + Enhanced Large Model + + +
+ Artificial intelligence is gradually demonstrating its immense potential, and +increasing attention is being given to how AI can be harnessed to advance +scientific research. In this vision paper, we present our perspectives on how +AI can better assist scientific inquiry and explore corresponding technical +approach. We have proposed and open-sourced a large model of our KALE-LM model +series, Llama3-KALE-LM-Chem-8B, which has achieved outstanding performance in +tasks related to the field of chemistry. We hope that our work serves as a +strong starting point, helping to realize more intelligent AI and promoting the +advancement of human science and technology, as well as societal development. + +
+
+
+
+
+ + ☆ Learning from Pattern Completion: Self-supervised Controllable + Generation + + +
+ The human brain exhibits a strong ability to spontaneously associate +different visual attributes of the same or similar visual scene, such as +associating sketches and graffiti with real-world visual objects, usually +without supervising information. In contrast, in the field of artificial +intelligence, controllable generation methods like ControlNet heavily rely on +annotated training datasets such as depth maps, semantic segmentation maps, and +poses, which limits the method's scalability. Inspired by the neural mechanisms +that may contribute to the brain's associative power, specifically the cortical +modularization and hippocampal pattern completion, here we propose a +self-supervised controllable generation (SCG) framework. Firstly, we introduce +an equivariant constraint to promote inter-module independence and intra-module +correlation in a modular autoencoder network, thereby achieving functional +specialization. Subsequently, based on these specialized modules, we employ a +self-supervised pattern completion approach for controllable generation +training. Experimental results demonstrate that the proposed modular +autoencoder effectively achieves functional specialization, including the +modular processing of color, brightness, and edge detection, and exhibits +brain-like features including orientation selectivity, color antagonism, and +center-surround receptive fields. Through self-supervised training, associative +generation capabilities spontaneously emerge in SCG, demonstrating excellent +generalization ability to various tasks such as associative generation on +painting, sketches, and ancient graffiti. Compared to the previous +representative method ControlNet, our proposed approach not only demonstrates +superior robustness in more challenging high-noise scenarios but also possesses +more promising scalability potential due to its self-supervised manner. + +
+
+
+
+
+ + ☆ MG-Net: Learn to Customize QAOA with Circuit Depth Awareness + + +
+ Quantum Approximate Optimization Algorithm (QAOA) and its variants exhibit +immense potential in tackling combinatorial optimization challenges. However, +their practical realization confronts a dilemma: the requisite circuit depth +for satisfactory performance is problem-specific and often exceeds the maximum +capability of current quantum devices. To address this dilemma, here we first +analyze the convergence behavior of QAOA, uncovering the origins of this +dilemma and elucidating the intricate relationship between the employed mixer +Hamiltonian, the specific problem at hand, and the permissible maximum circuit +depth. Harnessing this understanding, we introduce the Mixer Generator Network +(MG-Net), a unified deep learning framework adept at dynamically formulating +optimal mixer Hamiltonians tailored to distinct tasks and circuit depths. +Systematic simulations, encompassing Ising models and weighted Max-Cut +instances with up to 64 qubits, substantiate our theoretical findings, +highlighting MG-Net's superior performance in terms of both approximation ratio +and efficiency. + +
+
+ comment: 29 pages, 16 figures +
+
+
+
+
+ + ☆ Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large + Language Models EMNLP24 + + +
+ Various audio-LLMs (ALLMs) have been explored recently for tackling different +audio tasks simultaneously using a single, unified model. While existing +evaluations of ALLMs primarily focus on single-audio tasks, real-world +applications often involve processing multiple audio streams simultaneously. To +bridge this gap, we propose the first multi-audio evaluation (MAE) benchmark +that consists of 20 datasets from 11 multi-audio tasks encompassing both speech +and sound scenarios. Comprehensive experiments on MAE demonstrate that the +existing ALLMs, while being powerful in comprehending primary audio elements in +individual audio inputs, struggling to handle multi-audio scenarios. To this +end, we propose a novel multi-audio-LLM (MALLM) to capture audio context among +multiple similar audios using discriminative learning on our proposed synthetic +data. The results demonstrate that the proposed MALLM outperforms all baselines +and achieves high data efficiency using synthetic data without requiring human +annotations. The proposed MALLM opens the door for ALLMs towards multi-audio +processing era and brings us closer to replicating human auditory capabilities +in machines. + +
+
+ comment: EMNLP24 Findings +
+
+
+
+
+ + ☆ Toward Universal and Interpretable World Models for Open-ended Learning + Agents + + +
+ We introduce a generic, compositional and interpretable class of generative +world models that supports open-ended learning agents. This is a sparse class +of Bayesian networks capable of approximating a broad range of stochastic +processes, which provide agents with the ability to learn world models in a +manner that may be both interpretable and computationally scalable. This +approach integrating Bayesian structure learning and intrinsically motivated +(model-based) planning enables agents to actively develop and refine their +world models, which may lead to open-ended learning and more robust, adaptive +behavior. + +
+
+ comment: 4 pages including appendix, 6 including appendix and references; 2 + figures +
+
+
+
+
+ + ☆ Exploiting Motion Prior for Accurate Pose Estimation of Dashboard + Cameras + + +
+ Dashboard cameras (dashcams) record millions of driving videos daily, +offering a valuable potential data source for various applications, including +driving map production and updates. A necessary step for utilizing these +dashcam data involves the estimation of camera poses. However, the low-quality +images captured by dashcams, characterized by motion blurs and dynamic objects, +pose challenges for existing image-matching methods in accurately estimating +camera poses. In this study, we propose a precise pose estimation method for +dashcam images, leveraging the inherent camera motion prior. Typically, image +sequences captured by dash cameras exhibit pronounced motion prior, such as +forward movement or lateral turns, which serve as essential cues for +correspondence estimation. Building upon this observation, we devise a pose +regression module aimed at learning camera motion prior, subsequently +integrating these prior into both correspondences and pose estimation +processes. The experiment shows that, in real dashcams dataset, our method is +22% better than the baseline for pose estimation in AUC5\textdegree, and it can +estimate poses for 19% more images with less reprojection error in Structure +from Motion (SfM). + +
+
+
+
+
+ + ☆ Not the Silver Bullet: LLM-enhanced Programming Error Messages are + Ineffective in Practice + + +
+ The sudden emergence of large language models (LLMs) such as ChatGPT has had +a disruptive impact throughout the computing education community. LLMs have +been shown to excel at producing correct code to CS1 and CS2 problems, and can +even act as friendly assistants to students learning how to code. Recent work +shows that LLMs demonstrate unequivocally superior results in being able to +explain and resolve compiler error messages -- for decades, one of the most +frustrating parts of learning how to code. However, LLM-generated error message +explanations have only been assessed by expert programmers in artificial +conditions. This work sought to understand how novice programmers resolve +programming error messages (PEMs) in a more realistic scenario. We ran a +within-subjects study with $n$ = 106 participants in which students were tasked +to fix six buggy C programs. For each program, participants were randomly +assigned to fix the problem using either a stock compiler error message, an +expert-handwritten error message, or an error message explanation generated by +GPT-4. Despite promising evidence on synthetic benchmarks, we found that GPT-4 +generated error messages outperformed conventional compiler error messages in +only 1 of the 6 tasks, measured by students' time-to-fix each problem. +Handwritten explanations still outperform LLM and conventional error messages, +both on objective and subjective measures. + +
+
+ comment: To appear in the proceedings of the 2024 UK and Ireland Computing + Education Research conference (UKICER '24) +
+
+
+
+
+ + ☆ Effects of AI Feedback on Learning, the Skill Gap, and Intellectual + Diversity + + +
+ Can human decision-makers learn from AI feedback? Using data on 52,000 +decision-makers from a large online chess platform, we investigate how their AI +use affects three interrelated long-term outcomes: Learning, skill gap, and +diversity of decision strategies. First, we show that individuals are far more +likely to seek AI feedback in situations in which they experienced success +rather than failure. This AI feedback seeking strategy turns out to be +detrimental to learning: Feedback on successes decreases future performance, +while feedback on failures increases it. Second, higher-skilled decision-makers +seek AI feedback more often and are far more likely to seek AI feedback after a +failure, and benefit more from AI feedback than lower-skilled individuals. As a +result, access to AI feedback increases, rather than decreases, the skill gap +between high- and low-skilled individuals. Finally, we leverage 42 major +platform updates as natural experiments to show that access to AI feedback +causes a decrease in intellectual diversity of the population as individuals +tend to specialize in the same areas. Together, those results indicate that +learning from AI feedback is not automatic and using AI correctly seems to be a +skill itself. Furthermore, despite its individual-level benefits, access to AI +feedback can have significant population-level downsides including loss of +intellectual diversity and an increasing skill gap. + +
+
+
+
+
+ + ☆ When SAM2 Meets Video Camouflaged Object Segmentation: A Comprehensive + Evaluation and Adaptation + + +
+ This study investigates the application and performance of the Segment +Anything Model 2 (SAM2) in the challenging task of video camouflaged object +segmentation (VCOS). VCOS involves detecting objects that blend seamlessly in +the surroundings for videos, due to similar colors and textures, poor light +conditions, etc. Compared to the objects in normal scenes, camouflaged objects +are much more difficult to detect. SAM2, a video foundation model, has shown +potential in various tasks. But its effectiveness in dynamic camouflaged +scenarios remains under-explored. This study presents a comprehensive study on +SAM2's ability in VCOS. First, we assess SAM2's performance on camouflaged +video datasets using different models and prompts (click, box, and mask). +Second, we explore the integration of SAM2 with existing multimodal large +language models (MLLMs) and VCOS methods. Third, we specifically adapt SAM2 by +fine-tuning it on the video camouflaged dataset. Our comprehensive experiments +demonstrate that SAM2 has excellent zero-shot ability of detecting camouflaged +objects in videos. We also show that this ability could be further improved by +specifically adjusting SAM2's parameters for VCOS. The code will be available +at https://github.com/zhoustan/SAM2-VCOS + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Enhanced Convolution Neural Network with Optimized Pooling and + Hyperparameter Tuning for Network Intrusion Detection + + +
+ Network Intrusion Detection Systems (NIDS) are essential for protecting +computer networks from malicious activities, including Denial of Service (DoS), +Probing, User-to-Root (U2R), and Remote-to-Local (R2L) attacks. Without +effective NIDS, networks are vulnerable to significant security breaches and +data loss. Machine learning techniques provide a promising approach to enhance +NIDS by automating threat detection and improving accuracy. In this research, +we propose an Enhanced Convolutional Neural Network (EnCNN) for NIDS and +evaluate its performance using the KDDCUP'99 dataset. Our methodology includes +comprehensive data preprocessing, exploratory data analysis (EDA), and feature +engineering. We compare EnCNN with various machine learning algorithms, +including Logistic Regression, Decision Trees, Support Vector Machines (SVM), +and ensemble methods like Random Forest, AdaBoost, and Voting Ensemble. The +results show that EnCNN significantly improves detection accuracy, with a +notable 10% increase over state-of-art approaches. This demonstrates the +effectiveness of EnCNN in real-time network intrusion detection, offering a +robust solution for identifying and mitigating security threats, and enhancing +overall network resilience. + +
+
+ comment: 7 Pages , 2 figures , 4 Tables , Conference paper +
+
+
+
+
+ + ☆ Reducing Diversity to Generate Hierarchical Archetypes + + +
+ The Artificial Intelligence field seldom address the development of a +fundamental building piece: a framework, methodology or algorithm to +automatically build hierarchies of abstractions. This is a key requirement in +order to build intelligent behaviour, as recent neuroscience studies clearly +expose. In this paper we present a primitive-based framework to automatically +generate hierarchies of constructive archetypes, as a theory of how to generate +hierarchies of abstractions. We assume the existence of a primitive with very +specific characteristics, and we develop our framework over it. We prove the +effectiveness of our framework through mathematical definitions and proofs. +Finally, we give a few insights about potential uses of our framework and the +expected results. + +
+
+
+
+
+ + ☆ Quantum Algorithms for Drone Mission Planning + + +
+ Mission planning often involves optimising the use of ISR (Intelligence, +Surveillance and Reconnaissance) assets in order to achieve a set of mission +objectives within allowed parameters subject to constraints. The missions of +interest here, involve routing multiple UAVs visiting multiple targets, +utilising sensors to capture data relating to each target. Finding such +solutions is often an NP-Hard problem and cannot be solved efficiently on +classical computers. Furthermore, during the mission new constraints and +objectives may arise, requiring a new solution to be computed within a short +time period. To achieve this we investigate near term quantum algorithms that +have the potential to offer speed-ups against current classical methods. We +demonstrate how a large family of these problems can be formulated as a Mixed +Integer Linear Program (MILP) and then converted to a Quadratic Unconstrained +Binary Optimisation (QUBO). The formulation provided is versatile and can be +adapted for many different constraints with clear qubit scaling provided. We +discuss the results of solving the QUBO formulation using commercial quantum +annealers and compare the solutions to current edge classical solvers. We also +analyse the results from solving the QUBO using Quantum Approximate +Optimisation Algorithms (QAOA) and discuss their results. Finally, we also +provide efficient methods to encode to the problem into the Variational Quantum +Eigensolver (VQE) formalism, where we have tailored the ansatz to the problem +making efficient use of the qubits available. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Entropy, concentration, and learning: a statistical mechanics primer + + +
+ Artificial intelligence models trained through loss minimization have +demonstrated significant success, grounded in principles from fields like +information theory and statistical physics. This work explores these +established connections through the lens of statistical mechanics, starting +from first-principles sample concentration behaviors that underpin AI and +machine learning. Our development of statistical mechanics for modeling +highlights the key role of exponential families, and quantities of statistics, +physics, and information theory. + +
+
+
+
+
+ + ☆ Towards Integrating Epistemic Uncertainty Estimation into the + Radiotherapy Workflow + + +
+ The precision of contouring target structures and organs-at-risk (OAR) in +radiotherapy planning is crucial for ensuring treatment efficacy and patient +safety. Recent advancements in deep learning (DL) have significantly improved +OAR contouring performance, yet the reliability of these models, especially in +the presence of out-of-distribution (OOD) scenarios, remains a concern in +clinical settings. This application study explores the integration of epistemic +uncertainty estimation within the OAR contouring workflow to enable OOD +detection in clinically relevant scenarios, using specifically compiled data. +Furthermore, we introduce an advanced statistical method for OOD detection to +enhance the methodological framework of uncertainty estimation. Our empirical +evaluation demonstrates that epistemic uncertainty estimation is effective in +identifying instances where model predictions are unreliable and may require an +expert review. Notably, our approach achieves an AUC-ROC of 0.95 for OOD +detection, with a specificity of 0.95 and a sensitivity of 0.92 for implant +cases, underscoring its efficacy. This study addresses significant gaps in the +current research landscape, such as the lack of ground truth for uncertainty +estimation and limited empirical evaluations. Additionally, it provides a +clinically relevant application of epistemic uncertainty estimation in an +FDA-approved and widely used clinical solution for OAR segmentation from +Varian, a Siemens Healthineers company, highlighting its practical benefits. + +
+
+ comment: Keywords: Epistemic Uncertainty - Out-of-Distribution Detection - CT + Segmentation - OAR contouring - Radiotherapy +
+
+
+
+
+ + ☆ Refutation of Spectral Graph Theory Conjectures with Search Algorithms) + + +
+ We are interested in the automatic refutation of spectral graph theory +conjectures. Most existing works address this problem either with the +exhaustive generation of graphs with a limited size or with deep reinforcement +learning. Exhaustive generation is limited by the size of the generated graphs +and deep reinforcement learning takes hours or days to refute a conjecture. We +propose to use search algorithms to address these shortcomings to find +potentially large counter-examples to spectral graph theory conjectures in +seconds. We apply a wide range of search algorithms to a selection of +conjectures from Graffiti. Out of 13 already refuted conjectures from Graffiti, +our algorithms are able to refute 12 in seconds. We also refute conjecture 197 +from Graffiti which was open until now. + +
+
+
+
+
+ + ☆ Unsupervised Cognition + + +
+ Unsupervised learning methods have a soft inspiration in cognition models. To +this day, the most successful unsupervised learning methods revolve around +clustering samples in a mathematical space. In this paper we propose a +state-of-the-art primitive-based unsupervised learning approach for +decision-making inspired by novel cognition models. This representation-centric +approach models the input space constructively as a distributed hierarchical +structure in an input-agnostic way. We compared our approach with current +state-of-the-art in unsupervised learning classification, and with current +state-of-the-art in cancer type classification. We show how our proposal +outperforms previous state-of-the-art. We also evaluate some cognition-like +properties of our proposal where it not only outperforms the compared +algorithms (even supervised learning ones), but it also shows a different, more +cognition-like, behaviour. + +
+
+
+
+
+ + ☆ Model-based Preference Optimization in Abstractive Summarization without + Human Feedback EMNLP 2024 + + +
+ In abstractive summarization, the challenge of producing concise and accurate +summaries arises from the vast amount of information contained in the source +document. Consequently, although Large Language Models (LLMs) can generate +fluent text, they often introduce inaccuracies by hallucinating content not +found in the original source. While supervised fine-tuning methods that +maximize likelihood contribute to this issue, they do not consistently enhance +the faithfulness of the summaries. Preference-based optimization methods, such +as Direct Preference Optimization (DPO), can further refine the model to align +with human preferences. However, these methods still heavily depend on costly +human feedback. In this work, we introduce a novel and straightforward approach +called Model-based Preference Optimization (MPO) to fine-tune LLMs for improved +summarization abilities without any human feedback. By leveraging the model's +inherent summarization capabilities, we create a preference dataset that is +fully generated by the model using different decoding strategies. Our +experiments on standard summarization datasets and various metrics demonstrate +that our proposed MPO significantly enhances the quality of generated summaries +without relying on human feedback. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ☆ TemporalPaD: a reinforcement-learning framework for temporal feature + representation and dimension reduction + + +
+ Recent advancements in feature representation and dimension reduction have +highlighted their crucial role in enhancing the efficacy of predictive +modeling. This work introduces TemporalPaD, a novel end-to-end deep learning +framework designed for temporal pattern datasets. TemporalPaD integrates +reinforcement learning (RL) with neural networks to achieve concurrent feature +representation and feature reduction. The framework consists of three +cooperative modules: a Policy Module, a Representation Module, and a +Classification Module, structured based on the Actor-Critic (AC) framework. The +Policy Module, responsible for dimensionality reduction through RL, functions +as the actor, while the Representation Module for feature extraction and the +Classification Module collectively serve as the critic. We comprehensively +evaluate TemporalPaD using 29 UCI datasets, a well-known benchmark for +validating feature reduction algorithms, through 10 independent tests and +10-fold cross-validation. Additionally, given that TemporalPaD is specifically +designed for time series data, we apply it to a real-world DNA classification +problem involving enhancer category and enhancer strength. The results +demonstrate that TemporalPaD is an efficient and effective framework for +achieving feature reduction, applicable to both structured data and sequence +datasets. The source code of the proposed TemporalPaD is freely available as +supplementary material to this article and at +http://www.healthinformaticslab.org/supp/. + +
+
+
+
+
+ + ☆ ASAG2024: A Combined Benchmark for Short Answer Grading + + +
+ Open-ended questions test a more thorough understanding than closed-ended +questions and are often a preferred assessment method. However, open-ended +questions are tedious to grade and subject to personal bias. Therefore, there +have been efforts to speed up the grading process through automation. Short +Answer Grading (SAG) systems aim to automatically score students' answers. +Despite growth in SAG methods and capabilities, there exists no comprehensive +short-answer grading benchmark across different subjects, grading scales, and +distributions. Thus, it is hard to assess the capabilities of current automated +grading methods in terms of their generalizability. In this preliminary work, +we introduce the combined ASAG2024 benchmark to facilitate the comparison of +automated grading systems. Combining seven commonly used short-answer grading +datasets in a common structure and grading scale. For our benchmark, we +evaluate a set of recent SAG methods, revealing that while LLM-based approaches +reach new high scores, they still are far from reaching human performance. This +opens up avenues for future research on human-machine SAG systems. + +
+
+ comment: Accepted at SIGCSE-Virtual 2024 +
+
+
+
+
+ + ☆ "Oh LLM, I'm Asking Thee, Please Give Me a Decision Tree": Zero-Shot + Decision Tree Induction and Embedding with Large Language Models + + +
+ Large language models (LLMs) provide powerful means to leverage prior +knowledge for predictive modeling when data is limited. In this work, we +demonstrate how LLMs can use their compressed world knowledge to generate +intrinsically interpretable machine learning models, i.e., decision trees, +without any training data. We find that these zero-shot decision trees can +surpass data-driven trees on some small-sized tabular datasets and that +embeddings derived from these trees perform on par with data-driven tree-based +embeddings on average. Our knowledge-driven decision tree induction and +embedding approaches therefore serve as strong new baselines for data-driven +machine learning methods in the low-data regime. + +
+
+
+
+
+ + ☆ Analysis of Truncated Singular Value Decomposition for Koopman + Operator-Based Lane Change Model + + +
+ Understanding and modeling complex dynamic systems is crucial for enhancing +vehicle performance and safety, especially in the context of autonomous +driving. Recently, popular methods such as Koopman operators and their +approximators, known as Extended Dynamic Mode Decomposition (EDMD), have +emerged for their effectiveness in transforming strongly nonlinear system +behavior into linear representations. This allows them to be integrated with +conventional linear controllers. To achieve this, Singular Value Decomposition +(SVD), specifically truncated SVD, is employed to approximate Koopman operators +from extensive datasets efficiently. This study evaluates different basis +functions used in EDMD and ranks for truncated SVD for representing lane change +behavior models, aiming to balance computational efficiency with information +loss. The findings, however, suggest that the technique of truncated SVD does +not necessarily achieve substantial reductions in computational training time +and results in significant information loss. + +
+
+ comment: Submitted to the 21st International Conference on Informatics in + Control, Automation and Robotics (ICINCO 2024) +
+
+
+
+
+ + ☆ An Enhanced Federated Prototype Learning Method under Domain Shift + + +
+ Federated Learning (FL) allows collaborative machine learning training +without sharing private data. Numerous studies have shown that one significant +factor affecting the performance of federated learning models is the +heterogeneity of data across different clients, especially when the data is +sampled from various domains. A recent paper introduces variance-aware +dual-level prototype clustering and uses a novel $\alpha$-sparsity prototype +loss, which increases intra-class similarity and reduces inter-class +similarity. To ensure that the features converge within specific clusters, we +introduce an improved algorithm, Federated Prototype Learning with Convergent +Clusters, abbreviated as FedPLCC. To increase inter-class distances, we weight +each prototype with the size of the cluster it represents. To reduce +intra-class distances, considering that prototypes with larger distances might +come from different domains, we select only a certain proportion of prototypes +for the loss function calculation. Evaluations on the Digit-5, Office-10, and +DomainNet datasets show that our method performs better than existing +approaches. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Experimental Evaluation of Machine Learning Models for Goal-oriented + Customer Service Chatbot with Pipeline Architecture + + +
+ Integrating machine learning (ML) into customer service chatbots enhances +their ability to understand and respond to user queries, ultimately improving +service performance. However, they may appear artificial to some users and +affecting customer experience. Hence, meticulous evaluation of ML models for +each pipeline component is crucial for optimizing performance, though +differences in functionalities can lead to unfair comparisons. In this paper, +we present a tailored experimental evaluation approach for goal-oriented +customer service chatbots with pipeline architecture, focusing on three key +components: Natural Language Understanding (NLU), dialogue management (DM), and +Natural Language Generation (NLG). Our methodology emphasizes individual +assessment to determine optimal ML models. Specifically, we focus on optimizing +hyperparameters and evaluating candidate models for NLU (utilizing BERT and +LSTM), DM (employing DQN and DDQN), and NLG (leveraging GPT-2 and DialoGPT). +The results show that for the NLU component, BERT excelled in intent detection +whereas LSTM was superior for slot filling. For the DM component, the DDQN +model outperformed DQN by achieving fewer turns, higher rewards, as well as +greater success rates. For NLG, the large language model GPT-2 surpassed +DialoGPT in BLEU, METEOR, and ROUGE metrics. These findings aim to provide a +benchmark for future research in developing and optimizing customer service +chatbots, offering valuable insights into model performance and optimal +hyperparameters. + +
+
+
+
+
+ + ☆ Efficient Noise Mitigation for Enhancing Inference Accuracy in DNNs on + Mixed-Signal Accelerators + + +
+ In this paper, we propose a framework to enhance the robustness of the neural +models by mitigating the effects of process-induced and aging-related +variations of analog computing components on the accuracy of the analog neural +networks. We model these variations as the noise affecting the precision of the +activations and introduce a denoising block inserted between selected layers of +a pre-trained model. We demonstrate that training the denoising block +significantly increases the model's robustness against various noise levels. To +minimize the overhead associated with adding these blocks, we present an +exploration algorithm to identify optimal insertion points for the denoising +blocks. Additionally, we propose a specialized architecture to efficiently +execute the denoising blocks, which can be integrated into mixed-signal +accelerators. We evaluate the effectiveness of our approach using Deep Neural +Network (DNN) models trained on the ImageNet and CIFAR-10 datasets. The results +show that on average, by accepting 2.03% parameter count overhead, the accuracy +drop due to the variations reduces from 31.7% to 1.15%. + +
+
+
+
+
+ + ☆ Research on Predicting Public Opinion Event Heat Levels Based on Large + Language Models + + +
+ In recent years, with the rapid development of large language models, serval +models such as GPT-4o have demonstrated extraordinary capabilities, surpassing +human performance in various language tasks. As a result, many researchers have +begun exploring their potential applications in the field of public opinion +analysis. This study proposes a novel large-language-models-based method for +public opinion event heat level prediction. First, we preprocessed and +classified 62,836 Chinese hot event data collected between July 2022 and +December 2023. Then, based on each event's online dissemination heat index, we +used the MiniBatchKMeans algorithm to automatically cluster the events and +categorize them into four heat levels (ranging from low heat to very high +heat). Next, we randomly selected 250 events from each heat level, totalling +1,000 events, to build the evaluation dataset. During the evaluation process, +we employed various large language models to assess their accuracy in +predicting event heat levels in two scenarios: without reference cases and with +similar case references. The results showed that GPT-4o and DeepseekV2 +performed the best in the latter case, achieving prediction accuracies of 41.4% +and 41.5%, respectively. Although the overall prediction accuracy remains +relatively low, it is worth noting that for low-heat (Level 1) events, the +prediction accuracies of these two models reached 73.6% and 70.4%, +respectively. Additionally, the prediction accuracy showed a downward trend +from Level 1 to Level 4, which correlates with the uneven distribution of data +across the heat levels in the actual dataset. This suggests that with the more +robust dataset, public opinion event heat level prediction based on large +language models will have significant research potential for the future. + +
+
+ comment: conference +
+
+
+
+
+ + ☆ An Epistemic Human-Aware Task Planner which Anticipates Human Beliefs + and Decisions + + +
+ We present a substantial extension of our Human-Aware Task Planning +framework, tailored for scenarios with intermittent shared execution +experiences and significant belief divergence between humans and robots, +particularly due to the uncontrollable nature of humans. Our objective is to +build a robot policy that accounts for uncontrollable human behaviors, thus +enabling the anticipation of possible advancements achieved by the robot when +the execution is not shared, e.g. when humans are briefly absent from the +shared environment to complete a subtask. But, this anticipation is considered +from the perspective of humans who have access to an estimated model for the +robot. To this end, we propose a novel planning framework and build a solver +based on AND-OR search, which integrates knowledge reasoning, including +situation assessment by perspective taking. Our approach dynamically models and +manages the expansion and contraction of potential advances while precisely +keeping track of when (and when not) agents share the task execution +experience. The planner systematically assesses the situation and ignores +worlds that it has reason to think are impossible for humans. Overall, our new +solver can estimate the distinct beliefs of the human and the robot along +potential courses of action, enabling the synthesis of plans where the robot +selects the right moment for communication, i.e. informing, or replying to an +inquiry, or defers ontic actions until the execution experiences can be shared. +Preliminary experiments in two domains, one novel and one adapted, demonstrate +the effectiveness of the framework. + +
+
+ comment: 15 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ MIMII-Gen: Generative Modeling Approach for Simulated Evaluation of + Anomalous Sound Detection System + + +
+ Insufficient recordings and the scarcity of anomalies present significant +challenges in developing and validating robust anomaly detection systems for +machine sounds. To address these limitations, we propose a novel approach for +generating diverse anomalies in machine sound using a latent diffusion-based +model that integrates an encoder-decoder framework. Our method utilizes the +Flan-T5 model to encode captions derived from audio file metadata, enabling +conditional generation through a carefully designed U-Net architecture. This +approach aids our model in generating audio signals within the EnCodec latent +space, ensuring high contextual relevance and quality. We objectively evaluated +the quality of our generated sounds using the Fr\'echet Audio Distance (FAD) +score and other metrics, demonstrating that our approach surpasses existing +models in generating reliable machine audio that closely resembles actual +abnormal conditions. The evaluation of the anomaly detection system using our +generated data revealed a strong correlation, with the area under the curve +(AUC) score differing by 4.8\% from the original, validating the effectiveness +of our generated data. These results demonstrate the potential of our approach +to enhance the evaluation and robustness of anomaly detection systems across +varied and previously unseen conditions. Audio samples can be found at +\url{https://hpworkhub.github.io/MIMII-Gen.github.io/}. + +
+
+
+
+
+ + ☆ Align$^2$LLaVA: Cascaded Human and Large Language Model Preference + Alignment for Multi-modal Instruction Curation + + +
+ Recent advances in Multi-modal Large Language Models (MLLMs), such as +LLaVA-series models, are driven by massive machine-generated +instruction-following data tuning. Such automatic instruction collection +pipelines, however, inadvertently introduce significant variability in data +quality. This paper introduces a novel instruction curation algorithm, derived +from two unique perspectives, human and LLM preference alignment, to compress +this vast corpus of machine-generated multimodal instructions to a compact and +high-quality form: (i) For human preference alignment, we have collected a +machine-generated multimodal instruction dataset and established a +comprehensive set of both subjective and objective criteria to guide the data +quality assessment critically from human experts. By doing so, a reward model +was trained on the annotated dataset to internalize the nuanced human +understanding of instruction alignment. (ii) For LLM preference alignment, +given the instruction selected by the reward model, we propose leveraging the +inner LLM used in MLLM to align the writing style of visual instructions with +that of the inner LLM itself, resulting in LLM-aligned instruction improvement. +Extensive experiments demonstrate that we can maintain or even improve model +performance by compressing synthetic multimodal instructions by up to 90%. +Impressively, by aggressively reducing the total training sample size from 158k +to 14k (9$\times$ smaller), our model consistently outperforms its full-size +dataset counterpart across various MLLM benchmarks. Our project is available at +https://github.com/DCDmllm/Align2LLaVA. + +
+
+
+
+
+ + ☆ EmoPro: A Prompt Selection Strategy for Emotional Expression in LM-based + Speech Synthesis + + +
+ Recent advancements in speech synthesis models, trained on extensive +datasets, have demonstrated remarkable zero-shot capabilities. These models can +control content, timbre, and emotion in generated speech based on prompt +inputs. Despite these advancements, the choice of prompts significantly impacts +the output quality, yet most existing selection schemes do not adequately +address the control of emotional intensity. To address this question, this +paper proposes a two-stage prompt selection strategy EmoPro, which is +specifically designed for emotionally controllable speech synthesis. This +strategy focuses on selecting highly expressive and high-quality prompts by +evaluating them from four perspectives: emotional expression strength, speech +quality, text-emotion consistency, and model generation performance. +Experimental results show that prompts selected using the proposed method +result in more emotionally expressive and engaging synthesized speech compared +to those obtained through baseline. Audio samples and codes will be available +at https://whyrrrrun.github.io/EmoPro/. + +
+
+
+
+
+ + ☆ Fairness-aware Multiobjective Evolutionary Learning + + +
+ Multiobjective evolutionary learning (MOEL) has demonstrated its advantages +of training fairer machine learning models considering a predefined set of +conflicting objectives, including accuracy and different fairness measures. +Recent works propose to construct a representative subset of fairness measures +as optimisation objectives of MOEL throughout model training. However, the +determination of a representative measure set relies on dataset, prior +knowledge and requires substantial computational costs. What's more, those +representative measures may differ across different model training processes. +Instead of using a static predefined set determined before model training, this +paper proposes to dynamically and adaptively determine a representative measure +set online during model training. The dynamically determined representative set +is then used as optimising objectives of the MOEL framework and can vary with +time. Extensive experimental results on 12 well-known benchmark datasets +demonstrate that our proposed framework achieves outstanding performance +compared to state-of-the-art approaches for mitigating unfairness in terms of +accuracy as well as 25 fairness measures although only a few of them were +dynamically selected and used as optimisation objectives. The results indicate +the importance of setting optimisation objectives dynamically during training. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Data Analysis in the Era of Generative AI + + +
+ This paper explores the potential of AI-powered tools to reshape data +analysis, focusing on design considerations and challenges. We explore how the +emergence of large language and multimodal models offers new opportunities to +enhance various stages of data analysis workflow by translating high-level user +intentions into executable code, charts, and insights. We then examine +human-centered design principles that facilitate intuitive interactions, build +user trust, and streamline the AI-assisted analysis workflow across multiple +apps. Finally, we discuss the research challenges that impede the development +of these AI-based systems such as enhancing model capabilities, evaluating and +benchmarking, and understanding end-user needs. + +
+
+
+
+
+ + ☆ Towards Diverse Device Heterogeneous Federated Learning via Task + Arithmetic Knowledge Integration NeurIPS 2024 + + +
+ Federated Learning has emerged as a promising paradigm for collaborative +machine learning, while preserving user data privacy. Despite its potential, +standard FL lacks support for diverse heterogeneous device prototypes, which +vary significantly in model and dataset sizes -- from small IoT devices to +large workstations. This limitation is only partially addressed by existing +knowledge distillation techniques, which often fail to transfer knowledge +effectively across a broad spectrum of device prototypes with varied +capabilities. This failure primarily stems from two issues: the dilution of +informative logits from more capable devices by those from less capable ones, +and the use of a single integrated logits as the distillation target across all +devices, which neglects their individual learning capacities and and the unique +contributions of each. To address these challenges, we introduce TAKFL, a novel +KD-based framework that treats the knowledge transfer from each device +prototype's ensemble as a separate task, independently distilling each to +preserve its unique contributions and avoid dilution. TAKFL also incorporates a +KD-based self-regularization technique to mitigate the issues related to the +noisy and unsupervised ensemble distillation process. To integrate the +separately distilled knowledge, we introduce an adaptive task arithmetic +knowledge integration process, allowing each student model to customize the +knowledge integration for optimal performance. Additionally, we present +theoretical results demonstrating the effectiveness of task arithmetic in +transferring knowledge across heterogeneous devices with varying capacities. +Comprehensive evaluations of our method across both CV and NLP tasks +demonstrate that TAKFL achieves SOTA results in a variety of datasets and +settings, significantly outperforming existing KD-based methods. Code is +released at https://github.com/MMorafah/TAKFL + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Review of Digital Asset Development with Graph Neural Network Unlearning + + +
+ In the rapidly evolving landscape of digital assets, the imperative for +robust data privacy and compliance with regulatory frameworks has intensified. +This paper investigates the critical role of Graph Neural Networks (GNNs) in +the management of digital assets and introduces innovative unlearning +techniques specifically tailored to GNN architectures. We categorize unlearning +strategies into two primary classes: data-driven approximation, which +manipulates the graph structure to isolate and remove the influence of specific +nodes, and model-driven approximation, which modifies the internal parameters +and architecture of the GNN itself. By examining recent advancements in these +unlearning methodologies, we highlight their applicability in various use +cases, including fraud detection, risk assessment, token relationship +prediction, and decentralized governance. We discuss the challenges inherent in +balancing model performance with the requirements for data unlearning, +particularly in the context of real-time financial applications. Furthermore, +we propose a hybrid approach that combines the strengths of both unlearning +strategies to enhance the efficiency and effectiveness of GNNs in digital asset +ecosystems. Ultimately, this paper aims to provide a comprehensive framework +for understanding and implementing GNN unlearning techniques, paving the way +for secure and compliant deployment of machine learning in the digital asset +domain. + +
+
+
+
+
+ + ☆ Leveraging Long-Context Large Language Models for Multi-Document + Understanding and Summarization in Enterprise Applications + + +
+ The rapid increase in unstructured data across various fields has made +multi-document comprehension and summarization a critical task. Traditional +approaches often fail to capture relevant context, maintain logical +consistency, and extract essential information from lengthy documents. This +paper explores the use of Long-context Large Language Models (LLMs) for +multi-document summarization, demonstrating their exceptional capacity to grasp +extensive connections, provide cohesive summaries, and adapt to various +industry domains and integration with enterprise applications/systems. The +paper discusses the workflow of multi-document summarization for effectively +deploying long-context LLMs, supported by case studies in legal applications, +enterprise functions such as HR, finance, and sourcing, as well as in the +medical and news domains. These case studies show notable enhancements in both +efficiency and accuracy. Technical obstacles, such as dataset diversity, model +scalability, and ethical considerations like bias mitigation and factual +accuracy, are carefully analyzed. Prospective research avenues are suggested to +augment the functionalities and applications of long-context LLMs, establishing +them as pivotal tools for transforming information processing across diverse +sectors and enterprise applications. + +
+
+
+
+
+ + ☆ Cost-Aware Dynamic Cloud Workflow Scheduling using Self-Attention and + Evolutionary Reinforcement Learning + + +
+ The Cost-aware Dynamic Multi-Workflow Scheduling (CDMWS) in the cloud is a +kind of cloud workflow management problem, which aims to assign virtual machine +(VM) instances to execute tasks in workflows so as to minimize the total costs, +including both the penalties for violating Service Level Agreement (SLA) and +the VM rental fees. Powered by deep neural networks, Reinforcement Learning +(RL) methods can construct effective scheduling policies for solving CDMWS +problems. Traditional policy networks in RL often use basic feedforward +architectures to separately determine the suitability of assigning any VM +instances, without considering all VMs simultaneously to learn their global +information. This paper proposes a novel self-attention policy network for +cloud workflow scheduling (SPN-CWS) that captures global information from all +VMs. We also develop an Evolution Strategy-based RL (ERL) system to train +SPN-CWS reliably and effectively. The trained SPN-CWS can effectively process +all candidate VM instances simultaneously to identify the most suitable VM +instance to execute every workflow task. Comprehensive experiments show that +our method can noticeably outperform several state-of-the-art algorithms on +multiple benchmark CDMWS problems. + +
+
+ comment: This paper has been accepted by ICSOC (International Conference on + Service-Oriented Computing) 2024 +
+
+
+
+
+ + ☆ State-free Reinforcement Learning + + +
+ In this work, we study the \textit{state-free RL} problem, where the +algorithm does not have the states information before interacting with the +environment. Specifically, denote the reachable state set by ${S}^\Pi := \{ +s|\max_{\pi\in \Pi}q^{P, \pi}(s)>0 \}$, we design an algorithm which requires +no information on the state space $S$ while having a regret that is completely +independent of ${S}$ and only depend on ${S}^\Pi$. We view this as a concrete +first step towards \textit{parameter-free RL}, with the goal of designing RL +algorithms that require no hyper-parameter tuning. + +
+
+
+
+
+ + ☆ Physics Augmented Tuple Transformer for Autism Severity Level Detection + + +
+ Early diagnosis of Autism Spectrum Disorder (ASD) is an effective and +favorable step towards enhancing the health and well-being of children with +ASD. Manual ASD diagnosis testing is labor-intensive, complex, and prone to +human error due to several factors contaminating the results. This paper +proposes a novel framework that exploits the laws of physics for ASD severity +recognition. The proposed physics-informed neural network architecture encodes +the behaviour of the subject extracted by observing a part of the +skeleton-based motion trajectory in a higher dimensional latent space. Two +decoders, namely physics-based and non-physics-based decoder, use this latent +embedding and predict the future motion patterns. The physics branch leverages +the laws of physics that apply to a skeleton sequence in the prediction process +while the non-physics-based branch is optimised to minimise the difference +between the predicted and actual motion of the subject. A classifier also +leverages the same latent space embeddings to recognise the ASD severity. This +dual generative objective explicitly forces the network to compare the actual +behaviour of the subject with the general normal behaviour of children that are +governed by the laws of physics, aiding the ASD recognition task. The proposed +method attains state-of-the-art performance on multiple ASD diagnosis +benchmarks. To illustrate the utility of the proposed framework beyond the task +ASD diagnosis, we conduct a third experiment using a publicly available +benchmark for the task of fall prediction and demonstrate the superiority of +our model. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Multi-agent Reinforcement Learning for Dynamic Dispatching in Material + Handling Systems + + +
+ This paper proposes a multi-agent reinforcement learning (MARL) approach to +learn dynamic dispatching strategies, which is crucial for optimizing +throughput in material handling systems across diverse industries. To benchmark +our method, we developed a material handling environment that reflects the +complexities of an actual system, such as various activities at different +locations, physical constraints, and inherent uncertainties. To enhance +exploration during learning, we propose a method to integrate domain knowledge +in the form of existing dynamic dispatching heuristics. Our experimental +results show that our method can outperform heuristics by up to 7.4 percent in +terms of median throughput. Additionally, we analyze the effect of different +architectures on MARL performance when training multiple agents with different +functions. We also demonstrate that the MARL agents performance can be further +improved by using the first iteration of MARL agents as heuristics to train a +second iteration of MARL agents. This work demonstrates the potential of +applying MARL to learn effective dynamic dispatching strategies that may be +deployed in real-world systems to improve business outcomes. + +
+
+
+
+
+ + ☆ Easy2Hard-Bench: Standardized Difficulty Labels for Profiling LLM + Performance and Generalization NeurIPS 2024 + + +
+ While generalization over tasks from easy to hard is crucial to profile +language models (LLMs), the datasets with fine-grained difficulty annotations +for each problem across a broad range of complexity are still blank. Aiming to +address this limitation, we present Easy2Hard-Bench, a consistently formatted +collection of 6 benchmark datasets spanning various domains, such as +mathematics and programming problems, chess puzzles, and reasoning questions. +Each problem within these datasets is annotated with numerical difficulty +scores. To systematically estimate problem difficulties, we collect abundant +performance data on attempts to each problem by humans in the real world or +LLMs on the prominent leaderboard. Leveraging the rich performance data, we +apply well-established difficulty ranking systems, such as Item Response Theory +(IRT) and Glicko-2 models, to uniformly assign numerical difficulty scores to +problems. Moreover, datasets in Easy2Hard-Bench distinguish themselves from +previous collections by a higher proportion of challenging problems. Through +extensive experiments with six state-of-the-art LLMs, we provide a +comprehensive analysis of their performance and generalization capabilities +across varying levels of difficulty, with the aim of inspiring future research +in LLM generalization. The datasets are available at +https://huggingface.co/datasets/furonghuang-lab/Easy2Hard-Bench. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ A3: Active Adversarial Alignment for Source-Free Domain Adaptation ICML + + +
+ Unsupervised domain adaptation (UDA) aims to transfer knowledge from a +labeled source domain to an unlabeled target domain. Recent works have focused +on source-free UDA, where only target data is available. This is challenging as +models rely on noisy pseudo-labels and struggle with distribution shifts. We +propose Active Adversarial Alignment (A3), a novel framework combining +self-supervised learning, adversarial training, and active learning for robust +source-free UDA. A3 actively samples informative and diverse data using an +acquisition function for training. It adapts models via adversarial losses and +consistency regularization, aligning distributions without source data access. +A3 advances source-free UDA through its synergistic integration of active and +adversarial learning for effective domain alignment and noise reduction. + +
+
+ comment: Accepted at ICMLA 2024 +
+
+
+
+
+ + ☆ VickreyFeedback: Cost-efficient Data Construction for Reinforcement + Learning from Human Feedback + + +
+ This paper addresses the cost-efficiency aspect of Reinforcement Learning +from Human Feedback (RLHF). RLHF leverages datasets of human preferences over +outputs of large language models (LLM) to instill human expectations into LLMs. +While preference annotation comes with a monetized cost, the economic utility +of a preference dataset has not been considered by far. What exacerbates this +situation is that given complex intransitive or cyclic relationships in +preference datasets, existing algorithms for fine-tuning LLMs are still far +from capturing comprehensive preferences. This raises severe cost-efficiency +concerns in production environments, where preference data accumulate over +time. In this paper, we see the fine-tuning of LLMs as a monetized economy and +introduce an auction mechanism to improve the efficiency of the preference data +collection in dollar terms. We show that introducing an auction mechanism can +play an essential role in enhancing the cost-efficiency of RLHF while +maintaining satisfactory model performance. Experimental results demonstrate +that our proposed auction-based protocol is cost-efficient for fine-tuning LLMs +by concentrating on high-quality feedback. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ SciDFM: A Large Language Model with Mixture-of-Experts for Science + + +
+ Recently, there has been a significant upsurge of interest in leveraging +large language models (LLMs) to assist scientific discovery. However, most LLMs +only focus on general science, while they lack domain-specific knowledge, such +as chemical molecules and amino acid sequences. To bridge these gaps, we +introduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and +is able to conduct college-level scientific reasoning and understand molecules +and amino acid sequences. We collect a large-scale training corpus containing +numerous scientific papers and books from different disciplines as well as data +from domain-specific databases. We further fine-tune the pre-trained model on +lots of instruction data to improve performances on downstream benchmarks. From +experiment results, we show that SciDFM achieves strong performance on general +scientific benchmarks such as SciEval and SciQ, and it reaches a SOTA +performance on domain-specific benchmarks among models of similar size. We +further analyze the expert layers and show that the results of expert selection +vary with data from different disciplines. To benefit the broader research +community, we open-source SciDFM at +https://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0. + +
+
+ comment: 12 pages, 1 figure, 9 tables. Technical Report, Under Review +
+
+
+
+
+ + ☆ BoT-Drive: Hierarchical Behavior and Trajectory Planning for Autonomous + Driving using POMDPs + + +
+ Uncertainties in dynamic road environments pose significant challenges for +behavior and trajectory planning in autonomous driving. This paper introduces +BoT-Drive, a planning algorithm that addresses uncertainties at both behavior +and trajectory levels within a Partially Observable Markov Decision Process +(POMDP) framework. BoT-Drive employs driver models to characterize unknown +behavioral intentions and utilizes their model parameters to infer hidden +driving styles. By also treating driver models as decision-making actions for +the autonomous vehicle, BoT-Drive effectively tackles the exponential +complexity inherent in POMDPs. To enhance safety and robustness, the planner +further applies importance sampling to refine the driving trajectory +conditioned on the planned high-level behavior. Evaluation on real-world data +shows that BoT-Drive consistently outperforms both existing planning methods +and learning-based methods in regular and complex urban driving scenes, +demonstrating significant improvements in driving safety and reliability. + +
+
+
+
+
+ + ☆ GenesisTex2: Stable, Consistent and High-Quality Text-to-Texture + Generation + + +
+ Large-scale text-guided image diffusion models have shown astonishing results +in text-to-image (T2I) generation. However, applying these models to synthesize +textures for 3D geometries remains challenging due to the domain gap between 2D +images and textures on a 3D surface. Early works that used a +projecting-and-inpainting approach managed to preserve generation diversity but +often resulted in noticeable artifacts and style inconsistencies. While recent +methods have attempted to address these inconsistencies, they often introduce +other issues, such as blurring, over-saturation, or over-smoothing. To overcome +these challenges, we propose a novel text-to-texture synthesis framework that +leverages pretrained diffusion models. We first introduce a local attention +reweighing mechanism in the self-attention layers to guide the model in +concentrating on spatial-correlated patches across different views, thereby +enhancing local details while preserving cross-view consistency. Additionally, +we propose a novel latent space merge pipeline, which further ensures +consistency across different viewpoints without sacrificing too much diversity. +Our method significantly outperforms existing state-of-the-art techniques +regarding texture consistency and visual quality, while delivering results much +faster than distillation-based methods. Importantly, our framework does not +require additional training or fine-tuning, making it highly adaptable to a +wide range of models available on public platforms. + +
+
+
+
+
+ + ☆ Multimodal Trajectory Prediction for Autonomous Driving on Unstructured + Roads using Deep Convolutional Network + + +
+ Recently, the application of autonomous driving in open-pit mining has +garnered increasing attention for achieving safe and efficient mineral +transportation. Compared to urban structured roads, unstructured roads in +mining sites have uneven boundaries and lack clearly defined lane markings. +This leads to a lack of sufficient constraint information for predicting the +trajectories of other human-driven vehicles, resulting in higher uncertainty in +trajectory prediction problems. A method is proposed to predict multiple +possible trajectories and their probabilities of the target vehicle. The +surrounding environment and historical trajectories of the target vehicle are +encoded as a rasterized image, which is used as input to our deep convolutional +network to predict the target vehicle's multiple possible trajectories. The +method underwent offline testing on a dataset specifically designed for +autonomous driving scenarios in open-pit mining and was compared and evaluated +against physics-based method. The open-source code and data are available at +https://github.com/LLsxyc/mine_motion_prediction.git + +
+
+ comment: 11 pages,6 figures +
+
+
+
+
+ + ☆ Code Vulnerability Repair with Large Language Model using Context-Aware + Prompt Tuning + + +
+ Large Language Models (LLMs) have shown significant challenges in detecting +and repairing vulnerable code, particularly when dealing with vulnerabilities +involving multiple aspects, such as variables, code flows, and code structures. +In this study, we utilize GitHub Copilot as the LLM and focus on buffer +overflow vulnerabilities. Our experiments reveal a notable gap in Copilot's +abilities when dealing with buffer overflow vulnerabilities, with a 76% +vulnerability detection rate but only a 15% vulnerability repair rate. To +address this issue, we propose context-aware prompt tuning techniques designed +to enhance LLM performance in repairing buffer overflow. By injecting a +sequence of domain knowledge about the vulnerability, including various +security and code contexts, we demonstrate that Copilot's successful repair +rate increases to 63%, representing more than four times the improvement +compared to repairs without domain knowledge. + +
+
+
+
+
+ + ☆ Speech to Reality: On-Demand Production using Natural Language, 3D + Generative AI, and Discrete Robotic Assembly + + +
+ We present a system that transforms speech into physical objects by combining +3D generative Artificial Intelligence with robotic assembly. The system +leverages natural language input to make design and manufacturing more +accessible, enabling individuals without expertise in 3D modeling or robotic +programming to create physical objects. We propose utilizing discrete robotic +assembly of lattice-based voxel components to address the challenges of using +generative AI outputs in physical production, such as design variability, +fabrication speed, structural integrity, and material waste. The system +interprets speech to generate 3D objects, discretizes them into voxel +components, computes an optimized assembly sequence, and generates a robotic +toolpath. The results are demonstrated through the assembly of various objects, +ranging from chairs to shelves, which are prompted via speech and realized +within 5 minutes using a 6-axis robotic arm. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible. An updated version will replace this version +
+
+
+
+
+ + ☆ Robo-CSK-Organizer: Commonsense Knowledge to Organize Detected Objects + for Multipurpose Robots + + +
+ This paper presents a system called Robo-CSK-Organizer that infuses +commonsense knowledge from a classical knowledge based to enhance the context +recognition capabilities of robots so as to facilitate the organization of +detected objects by classifying them in a task-relevant manner. It is +particularly useful in multipurpose robotics. Unlike systems relying solely on +deep learning tools such as ChatGPT, the Robo-CSK-Organizer system stands out +in multiple avenues as follows. It resolves ambiguities well, and maintains +consistency in object placement. Moreover, it adapts to diverse task-based +classifications. Furthermore, it contributes to explainable AI, hence helping +to improve trust and human-robot collaboration. Controlled experiments +performed in our work, simulating domestic robotics settings, make +Robo-CSK-Organizer demonstrate superior performance while placing objects in +contextually relevant locations. This work highlights the capacity of an +AI-based system to conduct commonsense-guided decision-making in robotics +closer to the thresholds of human cognition. Hence, Robo-CSK-Organizer makes +positive impacts on AI and robotics. + +
+
+
+
+
+ + ☆ Adaptive Learning of the Latent Space of Wasserstein Generative + Adversarial Networks + + +
+ Generative models based on latent variables, such as generative adversarial +networks (GANs) and variational auto-encoders (VAEs), have gained lots of +interests due to their impressive performance in many fields. However, many +data such as natural images usually do not populate the ambient Euclidean space +but instead reside in a lower-dimensional manifold. Thus an inappropriate +choice of the latent dimension fails to uncover the structure of the data, +possibly resulting in mismatch of latent representations and poor generative +qualities. Towards addressing these problems, we propose a novel framework +called the latent Wasserstein GAN (LWGAN) that fuses the Wasserstein +auto-encoder and the Wasserstein GAN so that the intrinsic dimension of the +data manifold can be adaptively learned by a modified informative latent +distribution. We prove that there exist an encoder network and a generator +network in such a way that the intrinsic dimension of the learned encoding +distribution is equal to the dimension of the data manifold. We theoretically +establish that our estimated intrinsic dimension is a consistent estimate of +the true dimension of the data manifold. Meanwhile, we provide an upper bound +on the generalization error of LWGAN, implying that we force the synthetic data +distribution to be similar to the real data distribution from a population +perspective. Comprehensive empirical experiments verify our framework and show +that LWGAN is able to identify the correct intrinsic dimension under several +scenarios, and simultaneously generate high-quality synthetic data by sampling +from the learned latent distribution. + +
+
+
+
+
+ + ☆ Multi-hypotheses Conditioned Point Cloud Diffusion for 3D Human + Reconstruction from Occluded Images NeurIPS 2024 + + +
+ 3D human shape reconstruction under severe occlusion due to human-object or +human-human interaction is a challenging problem. Parametric models i.e., +SMPL(-X), which are based on the statistics across human shapes, can represent +whole human body shapes but are limited to minimally-clothed human shapes. +Implicit-function-based methods extract features from the parametric models to +employ prior knowledge of human bodies and can capture geometric details such +as clothing and hair. However, they often struggle to handle misaligned +parametric models and inpaint occluded regions given a single RGB image. In +this work, we propose a novel pipeline, MHCDIFF, Multi-hypotheses Conditioned +Point Cloud Diffusion, composed of point cloud diffusion conditioned on +probabilistic distributions for pixel-aligned detailed 3D human reconstruction +under occlusion. Compared to previous implicit-function-based methods, the +point cloud diffusion model can capture the global consistent features to +generate the occluded regions, and the denoising process corrects the +misaligned SMPL meshes. The core of MHCDIFF is extracting local features from +multiple hypothesized SMPL(-X) meshes and aggregating the set of features to +condition the diffusion model. In the experiments on CAPE and MultiHuman +datasets, the proposed method outperforms various SOTA methods based on SMPL, +implicit functions, point cloud diffusion, and their combined, under synthetic +and real occlusions. + +
+
+ comment: 17 pages, 7 figures, accepted NeurIPS 2024 +
+
+
+
+
+ + ☆ Tracking Software Security Topics + + +
+ Software security incidents occur everyday and thousands of software security +reports are announced each month. Thus, it is difficult for software security +researchers, engineers, and other stakeholders to follow software security +topics of their interests in real-time. In this paper, we propose, SOSK, a +novel tool for this problem. SOSK allows a user to import a collection of +software security reports. It pre-processes and extracts the most important +keywords from the textual description of the reports. Based on the similarity +of embedding vectors of keywords, SOSK can expand and/or refine a keyword set +from a much smaller set of user-provided keywords. Thus, SOSK allows users to +define any topic of their interests and retrieve security reports relevant to +that topic effectively. Our preliminary evaluation shows that SOSK can expand +keywords and retrieve reports relevant to user requests. + +
+
+
+
+
+ + ♻ ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems -- which account for almost all current +AI -- can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborate on a search task assigned by a human. + +
+
+
+
+
+ + ♻ ☆ CRoP: Context-wise Robust Static Human-Sensing Personalization + + +
+ The advancement in deep learning and internet-of-things have led to diverse +human sensing applications. However, distinct patterns in human sensing, +influenced by various factors or contexts, challenge generic neural network +model's performance due to natural distribution shifts. To address this, +personalization tailors models to individual users. Yet most personalization +studies overlook intra-user heterogeneity across contexts in sensory data, +limiting intra-user generalizability. This limitation is especially critical in +clinical applications, where limited data availability hampers both +generalizability and personalization. Notably, intra-user sensing attributes +are expected to change due to external factors such as treatment progression, +further complicating the challenges. This work introduces CRoP, a novel static +personalization approach using an off-the-shelf pre-trained model and pruning +to optimize personalization and generalization. CRoP shows superior +personalization effectiveness and intra-user robustness across four +human-sensing datasets, including two from real-world health domains, +highlighting its practical and social impact. Additionally, to support CRoP's +generalization ability and design choices, we provide empirical justification +through gradient inner product analysis, ablation studies, and comparisons +against state-of-the-art baselines. + +
+
+ comment: 31 pages, 10 figues and 13 tables +
+
+
+
+
+ + ♻ ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50% of papers do not +assess performance variability at all. Moreover, only one (0.5%) paper reported +confidence intervals (CIs) for model performance. (2) To address the reporting +bottleneck, we show that the unreported standard deviation (SD) in segmentation +papers can be approximated by a second-order polynomial function of the mean +Dice similarity coefficient (DSC). Based on external validation data from 56 +previous MICCAI challenges, we demonstrate that this approximation can +accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ♻ ☆ Summarizing Radiology Reports Findings into Impressions + + +
+ Patient hand-off and triage are two fundamental problems in health care. +Often doctors must painstakingly summarize complex findings to efficiently +communicate with specialists and quickly make decisions on which patients have +the most urgent cases. In pursuit of these challenges, we present (1) a model +with state-of-art radiology report summarization performance using (2) a novel +method for augmenting medical data, and (3) an analysis of the model +limitations and radiology knowledge gain. We also provide a data processing +pipeline for future models developed on the the MIMIC CXR dataset. Our best +performing model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100 +ROUGE-L F1, which outperformed specialized checkpoints with more sophisticated +attention mechanisms. We investigate these aspects in this work. + +
+
+ comment: This version reverts to the original preprint, following the advice + from the Artificial Intelligence in Health editorial office. The published + version is peer-reviewed and available in the journal (see external DOI). The + preprint remains unchanged to maintain version transparency, as noted in the + further disclosure section of the published article +
+
+
+
+
+ + ♻ ☆ Modulated Intervention Preference Optimization (MIPO): Keep the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Proprioception Is All You Need: Terrain Classification for Boreal + Forests IROS 2024 + + +
+ Recent works in field robotics highlighted the importance of resiliency +against different types of terrains. Boreal forests, in particular, are home to +many mobility-impeding terrains that should be considered for off-road +autonomous navigation. Also, being one of the largest land biomes on Earth, +boreal forests are an area where autonomous vehicles are expected to become +increasingly common. In this paper, we address this issue by introducing +BorealTC, a publicly available dataset for proprioceptive-based terrain +classification (TC). Recorded with a Husky A200, our dataset contains 116 min +of Inertial Measurement Unit (IMU), motor current, and wheel odometry data, +focusing on typical boreal forest terrains, notably snow, ice, and silty loam. +Combining our dataset with another dataset from the state-of-the-art, we +evaluate both a Convolutional Neural Network (CNN) and the novel state space +model (SSM)-based Mamba architecture on a TC task. Interestingly, we show that +while CNN outperforms Mamba on each separate dataset, Mamba achieves greater +accuracy when trained on a combination of both. In addition, we demonstrate +that Mamba's learning capacity is greater than a CNN for increasing amounts of +data. We show that the combination of two TC datasets yields a latent space +that can be interpreted with the properties of the terrains. We also discuss +the implications of merging datasets on classification. Our source code and +dataset are publicly available online: +https://github.com/norlab-ulaval/BorealTC. + +
+
+ comment: Accepted to the 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ M$^2$PT: Multimodal Prompt Tuning for Zero-shot Instruction Learning EMNLP 2024 + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable performance +across a wide range of domains, with increasing emphasis on enhancing their +zero-shot generalization capabilities for unseen tasks across various +modalities. Instruction tuning has emerged as an effective strategy for +achieving zero-shot generalization by finetuning pretrained models on diverse +multimodal tasks. As the scale of MLLMs continues to grow, parameter-efficient +finetuning becomes increasingly critical. However, most existing +parameter-efficient approaches focus only on single modalities and often +overlook the multimodal characteristics during finetuning. In this work, we +introduce a novel Multimodal Prompt Tuning (M$^2$PT) approach for efficient +instruction tuning of MLLMs. M$^2$PT effectively integrates visual and textual +prompts into the vision encoder and language processor respectively during +finetuning, facilitating the extraction and alignment of features across +modalities. Empirical results on various multimodal evaluation datasets +demonstrate the superior performance of our approach compared to several +state-of-the-art baselines. A comprehensive set of ablation studies validates +the effectiveness of our prompt design and the efficiency of our approach. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ AnySkin: Plug-and-play Skin Sensing for Robotic Touch + + +
+ While tactile sensing is widely accepted as an important and useful sensing +modality, its use pales in comparison to other sensory modalities like vision +and proprioception. AnySkin addresses the critical challenges that impede the +use of tactile sensing -- versatility, replaceability, and data reusability. +Building on the simplistic design of ReSkin, and decoupling the sensing +electronics from the sensing interface, AnySkin simplifies integration making +it as straightforward as putting on a phone case and connecting a charger. +Furthermore, AnySkin is the first uncalibrated tactile-sensor with +cross-instance generalizability of learned manipulation policies. To summarize, +this work makes three key contributions: first, we introduce a streamlined +fabrication process and a design tool for creating an adhesive-free, durable +and easily replaceable magnetic tactile sensor; second, we characterize slip +detection and policy learning with the AnySkin sensor; and third, we +demonstrate zero-shot generalization of models trained on one instance of +AnySkin to new instances, and compare it with popular existing tactile +solutions like DIGIT and ReSkin. Videos of experiments, fabrication details and +design files can be found on https://any-skin.github.io/ + +
+
+
+
+
+ + ♻ ☆ LLM Detectors Still Fall Short of Real World: Case of LLM-Generated + Short News-Like Posts EMNLP + + +
+ With the emergence of widely available powerful LLMs, disinformation +generated by large Language Models (LLMs) has become a major concern. +Historically, LLM detectors have been touted as a solution, but their +effectiveness in the real world is still to be proven. In this paper, we focus +on an important setting in information operations -- short news-like posts +generated by moderately sophisticated attackers. + We demonstrate that existing LLM detectors, whether zero-shot or +purpose-trained, are not ready for real-world use in that setting. All tested +zero-shot detectors perform inconsistently with prior benchmarks and are highly +vulnerable to sampling temperature increase, a trivial attack absent from +recent benchmarks. A purpose-trained detector generalizing across LLMs and +unseen attacks can be developed, but it fails to generalize to new +human-written texts. + We argue that the former indicates domain-specific benchmarking is needed, +while the latter suggests a trade-off between the adversarial evasion +resilience and overfitting to the reference human text, with both needing +evaluation in benchmarks and currently absent. We believe this suggests a +re-consideration of current LLM detector benchmarking approaches and provides a +dynamically extensible benchmark to allow it +(https://github.com/Reliable-Information-Lab-HEVS/benchmark_llm_texts_detection). + +
+
+ comment: 20 pages, 7 tables, 13 figures, under consideration for EMNLP +
+
+
+
+
+ + ♻ ☆ A Learning-based Declarative Privacy-Preserving Framework for Federated + Data Management + + +
+ It is challenging to select the right privacy-preserving mechanism for +federated query processing over multiple private data silos. There exist +numerous privacy-preserving mechanisms, such as secure multi-party computing +(SMC), approximate query processing with differential privacy (DP), combined +SMC and DP, DP-based data obfuscation, and federated learning. These mechanisms +make different trade-offs among accuracy, privacy, execution efficiency, and +storage efficiency. In this work, we first introduce a new privacy-preserving +technique that uses a deep learning model trained using the +Differentially-Private Stochastic Gradient Descent (DP-SGD) algorithm to +replace portions of actual data to answer a query. We then demonstrate a novel +declarative privacy-preserving workflow that allows users to specify "what +private information to protect" rather than "how to protect". Under the hood, +the system relies on a cost model to automatically choose privacy-preserving +mechanisms as well as hyper-parameters. At the same time, the proposed workflow +also allows human experts to review and tune the selected privacy-preserving +mechanism for audit/compliance, and optimization purposes. + +
+
+
+
+
+ + ♻ ☆ PIM-Opt: Demystifying Distributed Optimization Algorithms on a + Real-World Processing-In-Memory System + + +
+ Modern Machine Learning (ML) training on large-scale datasets is a very +time-consuming workload. It relies on the optimization algorithm Stochastic +Gradient Descent (SGD) due to its effectiveness, simplicity, and generalization +performance. Processor-centric architectures (e.g., CPUs, GPUs) commonly used +for modern ML training workloads based on SGD are bottlenecked by data movement +between the processor and memory units due to the poor data locality in +accessing large datasets. As a result, processor-centric architectures suffer +from low performance and high energy consumption while executing ML training +workloads. Processing-In-Memory (PIM) is a promising solution to alleviate the +data movement bottleneck by placing the computation mechanisms inside or near +memory. + Our goal is to understand the capabilities of popular distributed SGD +algorithms on real-world PIM systems to accelerate data-intensive ML training +workloads. To this end, we 1) implement several representative centralized +parallel SGD algorithms on the real-world UPMEM PIM system, 2) rigorously +evaluate these algorithms for ML training on large-scale datasets in terms of +performance, accuracy, and scalability, 3) compare to conventional CPU and GPU +baselines, and 4) discuss implications for future PIM hardware and highlight +the need for a shift to an algorithm-hardware codesign. + Our results demonstrate three major findings: 1) The UPMEM PIM system can be +a viable alternative to state-of-the-art CPUs and GPUs for many memory-bound ML +training workloads, especially when operations and datatypes are natively +supported by PIM hardware, 2) it is important to carefully choose the +optimization algorithms that best fit PIM, and 3) the UPMEM PIM system does not +scale approximately linearly with the number of nodes for many data-intensive +ML training workloads. We open source all our code to facilitate future +research. + +
+
+ comment: "PIM-Opt: Demystifying Distributed Optimization Algorithms on a + Real-World Processing-In-Memory System" in Proceedings of the 33rd + International Conference on Parallel Architectures and Compilation Techniques + (PACT), Long Beach, CA, USA, October 2024 +
+
+
+
+
+ + ♻ ☆ Analyzing Probabilistic Methods for Evaluating Agent Capabilities + + +
+ To mitigate risks from AI systems, we need to assess their capabilities +accurately. This is especially difficult in cases where capabilities are only +rarely displayed. Phuong et al. propose two methods that aim to obtain better +estimates of the probability of an AI agent successfully completing a given +task. The milestone method decomposes tasks into subtasks, aiming to improve +overall success rate estimation, while the expert best-of-N method leverages +human guidance as a proxy for the model's independent performance. + Our analysis of these methods as Monte Carlo estimators reveals that while +both effectively reduce variance compared to naive Monte Carlo sampling, they +also introduce bias. Experimental results demonstrate that the milestone method +underestimates true solve rates for many real-world tasks due to its +constraining assumptions. The expert best-of-N method exhibits even more severe +underestimation across all tasks, attributed to an inherently flawed +re-weighting factor. To enhance the accuracy of capability estimates of AI +agents on difficult tasks, we suggest future work should leverage the rich +literature on Monte Carlo Estimators. + +
+
+ comment: Updated wording in Figure 1 and 2 +
+
+
+
+
+ + ♻ ☆ A Chatbot for Asylum-Seeking Migrants in Europe ICTAI + + +
+ We present ACME: A Chatbot for asylum-seeking Migrants in Europe. ACME relies +on computational argumentation and aims to help migrants identify the highest +level of protection they can apply for. This would contribute to a more +sustainable migration by reducing the load on territorial commissions, Courts, +and humanitarian organizations supporting asylum applicants. We describe the +background context, system architecture, underlying technologies, and a case +study used to validate the tool with domain experts. + +
+
+ comment: Accepted for publication at IEEE International Conference on Tools + with Artificial Intelligence (ICTAI) @IEEE +
+
+
+
+
+ + ♻ ☆ I2EBench: A Comprehensive Benchmark for Instruction-based Image Editing NeurIPS2024 + + +
+ Significant progress has been made in the field of Instruction-based Image +Editing (IIE). However, evaluating these models poses a significant challenge. +A crucial requirement in this field is the establishment of a comprehensive +evaluation benchmark for accurately assessing editing results and providing +valuable insights for its further development. In response to this need, we +propose I2EBench, a comprehensive benchmark designed to automatically evaluate +the quality of edited images produced by IIE models from multiple dimensions. +I2EBench consists of 2,000+ images for editing, along with 4,000+ corresponding +original and diverse instructions. It offers three distinctive characteristics: +1) Comprehensive Evaluation Dimensions: I2EBench comprises 16 evaluation +dimensions that cover both high-level and low-level aspects, providing a +comprehensive assessment of each IIE model. 2) Human Perception Alignment: To +ensure the alignment of our benchmark with human perception, we conducted an +extensive user study for each evaluation dimension. 3) Valuable Research +Insights: By analyzing the advantages and disadvantages of existing IIE models +across the 16 dimensions, we offer valuable research insights to guide future +development in the field. We will open-source I2EBench, including all +instructions, input images, human annotations, edited images from all evaluated +methods, and a simple script for evaluating the results from new IIE models. +The code, dataset and generated images from all IIE models are provided in +github: https://github.com/cocoshe/I2EBench. + +
+
+ comment: NeurIPS2024, 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Dual-Layer Training and Decoding of Large Language Model with + Simultaneously Thinking and Speaking + + +
+ Large Language Model can reasonably understand and generate human expressions +but may lack of thorough thinking and reasoning mechanisms. Recently there have +been several studies which enhance the thinking ability of language models but +most of them are not data-driven or training-based. In this paper, we are +motivated by the cognitive mechanism in the natural world, and design a novel +model architecture called TaS which allows it to first consider the thoughts +and then express the response based upon the query. We design several pipelines +to annotate or generate the thought contents from prompt-response samples, then +add language heads in a middle layer which behaves as the thinking layer. We +train the language model by the thoughts-augmented data and successfully let +the thinking layer automatically generate reasonable thoughts and finally +output more reasonable responses. Both qualitative examples and quantitative +results validate the effectiveness and performance of TaS. Our code is +available at https://anonymous.4open.science/r/TadE. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ EasyRec: Simple yet Effective Language Models for Recommendation + + +
+ Deep neural networks have become a powerful technique for learning +representations from user-item interaction data in collaborative filtering (CF) +for recommender systems. However, many existing methods heavily rely on unique +user and item IDs, which limits their ability to perform well in practical +zero-shot learning scenarios where sufficient training data may be unavailable. +Inspired by the success of language models (LMs) and their strong +generalization capabilities, a crucial question arises: How can we harness the +potential of language models to empower recommender systems and elevate its +generalization capabilities to new heights? In this study, we propose EasyRec - +an effective and easy-to-use approach that seamlessly integrates text-based +semantic understanding with collaborative signals. EasyRec employs a +text-behavior alignment framework, which combines contrastive learning with +collaborative language model tuning, to ensure a strong alignment between the +text-enhanced semantic space and the collaborative behavior information. +Extensive empirical evaluations across diverse real-world datasets demonstrate +the superior performance of EasyRec compared to state-of-the-art alternative +models, particularly in the challenging text-based zero-shot recommendation +scenarios. Furthermore, the study highlights the potential of seamlessly +integrating EasyRec as a plug-and-play component into text-enhanced +collaborative filtering frameworks, thereby empowering existing recommender +systems to elevate their recommendation performance and adapt to the evolving +user preferences in dynamic environments. For better result reproducibility of +our EasyRec framework, the model implementation details, source code, and +datasets are available at the link: https://github.com/HKUDS/EasyRec. + +
+
+
+
+
+ + ♻ ☆ Reward-Robust RLHF in LLMs + + +
+ As Large Language Models (LLMs) continue to progress toward more advanced +forms of intelligence, Reinforcement Learning from Human Feedback (RLHF) is +increasingly seen as a key pathway toward achieving Artificial General +Intelligence (AGI). However, the reliance on reward-model-based (RM-based) +alignment methods introduces significant challenges due to the inherent +instability and imperfections of Reward Models (RMs), which can lead to +critical issues such as reward hacking and misalignment with human intentions. +In this paper, we introduce a reward-robust RLHF framework aimed at addressing +these fundamental challenges, paving the way for more reliable and resilient +learning in LLMs. Our approach introduces a novel optimization objective that +carefully balances performance and robustness by incorporating Bayesian Reward +Model Ensembles (BRME) to model the uncertainty set of reward functions. This +allows the framework to integrate both nominal performance and minimum reward +signals, ensuring more stable learning even with imperfect RMs. Empirical +results demonstrate that our framework consistently outperforms baselines +across diverse benchmarks, showing improved accuracy and long-term stability. +We also provide a theoretical analysis, demonstrating that reward-robust RLHF +approaches the stability of constant reward settings, which proves to be +acceptable even in a stochastic-case analysis. Together, these contributions +highlight the framework potential to enhance both the performance and stability +of LLM alignment. + +
+
+
+
+
+ + ♻ ☆ Automating Data Annotation under Strategic Human Agents: Risks and + Potential Solutions + + +
+ As machine learning (ML) models are increasingly used in social domains to +make consequential decisions about humans, they often have the power to reshape +data distributions. Humans, as strategic agents, continuously adapt their +behaviors in response to the learning system. As populations change +dynamically, ML systems may need frequent updates to ensure high performance. +However, acquiring high-quality human-annotated samples can be highly +challenging and even infeasible in social domains. A common practice to address +this issue is using the model itself to annotate unlabeled data samples. This +paper investigates the long-term impacts when ML models are retrained with +model-annotated samples when they incorporate human strategic responses. We +first formalize the interactions between strategic agents and the model and +then analyze how they evolve under such dynamic interactions. We find that +agents are increasingly likely to receive positive decisions as the model gets +retrained, whereas the proportion of agents with positive labels may decrease +over time. We thus propose a refined retraining process to stabilize the +dynamics. Last, we examine how algorithmic fairness can be affected by these +retraining processes and find that enforcing common fairness constraints at +every round may not benefit the disadvantaged group in the long run. +Experiments on (semi-)synthetic and real data validate the theoretical +findings. + +
+
+
+
+
+ + ♻ ☆ QPaug: Question and Passage Augmentation for Open-Domain Question + Answering of LLMs EMNLP + + +
+ Retrieval-augmented generation (RAG) has received much attention for +Open-domain question-answering (ODQA) tasks as a means to compensate for the +parametric knowledge of large language models (LLMs). While previous approaches +focused on processing retrieved passages to remove irrelevant context, they +still rely heavily on the quality of retrieved passages which can degrade if +the question is ambiguous or complex. In this paper, we propose a simple yet +efficient method called question and passage augmentation (QPaug) via LLMs for +open-domain QA. QPaug first decomposes the original questions into +multiple-step sub-questions. By augmenting the original question with detailed +sub-questions and planning, we are able to make the query more specific on what +needs to be retrieved, improving the retrieval performance. In addition, to +compensate for the case where the retrieved passages contain distracting +information or divided opinions, we augment the retrieved passages with +self-generated passages by LLMs to guide the answer extraction. Experimental +results show that QPaug outperforms the previous state-of-the-art and achieves +significant performance gain over existing RAG methods. The source code is +available at \url{https://github.com/kmswin1/QPaug}. + +
+
+ comment: The 2024 Conference on Empirical Methods in Natural Language + Processing (EMNLP), Findings +
+
+
+
+
+ + ♻ ☆ Plurals: A System for Guiding LLMs Via Simulated Social Ensembles + + +
+ Recent debates raised concerns that language models may favor certain +viewpoints. But what if the solution is not to aim for a 'view from nowhere' +but rather to leverage different viewpoints? We introduce Plurals, a system and +Python library for pluralistic AI deliberation. Plurals consists of Agents +(LLMs, optionally with personas) which deliberate within customizable +Structures, with Moderators overseeing deliberation. Plurals is a generator of +simulated social ensembles. Plurals integrates with government datasets to +create nationally representative personas, includes deliberation templates +inspired by democratic deliberation theory, and allows users to customize both +information-sharing structures and deliberation behavior within Structures. Six +case studies demonstrate fidelity to theoretical constructs and efficacy. Three +randomized experiments show simulated focus groups produced output resonant +with an online sample of the relevant audiences (chosen over zero-shot +generation in 75% of trials). Plurals is both a paradigm and a concrete system +for pluralistic AI. The Plurals library is available at +https://github.com/josh-ashkinaze/plurals and will be continually updated. + +
+
+
+
+
+ + ♻ ☆ ELiSe: Efficient Learning of Sequences in Structured Recurrent Networks + + +
+ Behavior can be described as a temporal sequence of actions driven by neural +activity. To learn complex sequential patterns in neural networks, memories of +past activities need to persist on significantly longer timescales than the +relaxation times of single-neuron activity. While recurrent networks can +produce such long transients, training these networks is a challenge. Learning +via error propagation confers models such as FORCE, RTRL or BPTT a significant +functional advantage, but at the expense of biological plausibility. While +reservoir computing circumvents this issue by learning only the readout +weights, it does not scale well with problem complexity. We propose that two +prominent structural features of cortical networks can alleviate these issues: +the presence of a certain network scaffold at the onset of learning and the +existence of dendritic compartments for enhancing neuronal information storage +and computation. Our resulting model for Efficient Learning of Sequences +(ELiSe) builds on these features to acquire and replay complex non-Markovian +spatio-temporal patterns using only local, always-on and phase-free synaptic +plasticity. We showcase the capabilities of ELiSe in a mock-up of birdsong +learning, and demonstrate its flexibility with respect to parametrization, as +well as its robustness to external disturbances. + +
+
+ comment: 15 pages, 7 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Cognitive State Classification from Speech with + Multi-View Pseudo-Labeling + + +
+ The lack of labeled data is a common challenge in speech classification +tasks, particularly those requiring extensive subjective assessment, such as +cognitive state classification. In this work, we propose a Semi-Supervised +Learning (SSL) framework, introducing a novel multi-view pseudo-labeling method +that leverages both acoustic and linguistic characteristics to select the most +confident data for training the classification model. Acoustically, unlabeled +data are compared to labeled data using the Frechet audio distance, calculated +from embeddings generated by multiple audio encoders. Linguistically, large +language models are prompted to revise automatic speech recognition +transcriptions and predict labels based on our proposed task-specific +knowledge. High-confidence data are identified when pseudo-labels from both +sources align, while mismatches are treated as low-confidence data. A bimodal +classifier is then trained to iteratively label the low-confidence data until a +predefined criterion is met. We evaluate our SSL framework on emotion +recognition and dementia detection tasks. Experimental results demonstrate that +our method achieves competitive performance compared to fully supervised +learning using only 30% of the labeled data and significantly outperforms two +selected baselines. + +
+
+
+
+
+ + ♻ ☆ Trio-ViT: Post-Training Quantization and Acceleration for Softmax-Free + Efficient Vision Transformer + + +
+ Motivated by the huge success of Transformers in the field of natural +language processing (NLP), Vision Transformers (ViTs) have been rapidly +developed and achieved remarkable performance in various computer vision tasks. +However, their huge model sizes and intensive computations hinder ViTs' +deployment on embedded devices, calling for effective model compression +methods, such as quantization. Unfortunately, due to the existence of +hardware-unfriendly and quantization-sensitive non-linear operations, +particularly {Softmax}, it is non-trivial to completely quantize all operations +in ViTs, yielding either significant accuracy drops or non-negligible hardware +costs. In response to challenges associated with \textit{standard ViTs}, we +focus our attention towards the quantization and acceleration for +\textit{efficient ViTs}, which not only eliminate the troublesome Softmax but +also integrate linear attention with low computational complexity, and propose +Trio-ViT accordingly. Specifically, at the algorithm level, we develop a +{tailored post-training quantization engine} taking the unique activation +distributions of Softmax-free efficient ViTs into full consideration, aiming to +boost quantization accuracy. Furthermore, at the hardware level, we build an +accelerator dedicated to the specific Convolution-Transformer hybrid +architecture of efficient ViTs, thereby enhancing hardware efficiency. +Extensive experimental results consistently prove the effectiveness of our +Trio-ViT framework. {Particularly, we can gain up to +$\uparrow$$\mathbf{3.6}\times$, $\uparrow$$\mathbf{5.0}\times$, and +$\uparrow$$\mathbf{7.3}\times$ FPS under comparable accuracy over +state-of-the-art ViT accelerators, as well as $\uparrow$$\mathbf{6.0}\times$, +$\uparrow$$\mathbf{1.5}\times$, and $\uparrow$$\mathbf{2.1}\times$ DSP +efficiency.} Codes are available at +\url{https://github.com/shihuihong214/Trio-ViT}. + +
+
+
+
+
+ + ♻ ☆ Comparing Apples to Oranges: LLM-powered Multimodal Intention Prediction + in an Object Categorization Task + + +
+ Human intention-based systems enable robots to perceive and interpret user +actions to interact with humans and adapt to their behavior proactively. +Therefore, intention prediction is pivotal in creating a natural interaction +with social robots in human-designed environments. In this paper, we examine +using Large Language Models (LLMs) to infer human intention in a collaborative +object categorization task with a physical robot. We propose a novel multimodal +approach that integrates user non-verbal cues, like hand gestures, body poses, +and facial expressions, with environment states and user verbal cues to predict +user intentions in a hierarchical architecture. Our evaluation of five LLMs +shows the potential for reasoning about verbal and non-verbal user cues, +leveraging their context-understanding and real-world knowledge to support +intention prediction while collaborating on a task with a social robot. + +
+
+ comment: Accepted at ICSR 2024,14 pages,5 figures,2 tables; work was co-funded + by Horizon Europe project TERAIS under Grant agreement number 101079338 +
+
+
+
+
+ + ♻ ☆ Sparse Low-Ranked Self-Attention Transformer for Remaining Useful + Lifetime Prediction of Optical Fiber Amplifiers + + +
+ Optical fiber amplifiers are key elements in present optical networks. +Failures of these components result in high financial loss of income of the +network operator as the communication traffic over an affected link is +interrupted. Applying Remaining useful lifetime (RUL) prediction in the context +of Predictive Maintenance (PdM) to optical fiber amplifiers to predict upcoming +system failures at an early stage, so that network outages can be minimized +through planning of targeted maintenance actions, ensures reliability and +safety. Optical fiber amplifier are complex systems, that work under various +operating conditions, which makes correct forecasting a difficult task. +Increased monitoring capabilities of systems results in datasets that +facilitate the application of data-driven RUL prediction methods. Deep learning +models in particular have shown good performance, but generalization based on +comparatively small datasets for RUL prediction is difficult. In this paper, we +propose Sparse Low-ranked self-Attention Transformer (SLAT) as a novel RUL +prediction method. SLAT is based on an encoder-decoder architecture, wherein +two parallel working encoders extract features for sensors and time steps. By +utilizing the self-attention mechanism, long-term dependencies can be learned +from long sequences. The implementation of sparsity in the attention matrix and +a low-rank parametrization reduce overfitting and increase generalization. +Experimental application to optical fiber amplifiers exemplified on EDFA, as +well as a reference dataset from turbofan engines, shows that SLAT outperforms +the state-of-the-art methods. + +
+
+ comment: 9 pages, 7 figures, submitted to IEEE Transactions on Machine + Learning in Communications and Networking (TMLCN) +
+
+
+
+
+ + ♻ ☆ Generalisation to unseen topologies: Towards control of biological + neural network activity + + +
+ Novel imaging and neurostimulation techniques open doors for advancements in +closed-loop control of activity in biological neural networks. This would allow +for applications in the investigation of activity propagation, and for +diagnosis and treatment of pathological behaviour. Due to the partially +observable characteristics of activity propagation, through networks in which +edges can not be observed, and the dynamic nature of neuronal systems, there is +a need for adaptive, generalisable control. In this paper, we introduce an +environment that procedurally generates neuronal networks with different +topologies to investigate this generalisation problem. Additionally, an +existing transformer-based architecture is adjusted to evaluate the +generalisation performance of a deep RL agent in the presented partially +observable environment. The agent demonstrates the capability to generalise +control from a limited number of training networks to unseen test networks. + +
+
+
+
+
+ + ♻ ☆ HW-TSC's Submission to the CCMT 2024 Machine Translation Tasks + + +
+ This paper presents the submission of Huawei Translation Services Center +(HW-TSC) to machine translation tasks of the 20th China Conference on Machine +Translation (CCMT 2024). We participate in the bilingual machine translation +task and multi-domain machine translation task. For these two translation +tasks, we use training strategies such as regularized dropout, bidirectional +training, data diversification, forward translation, back translation, +alternated training, curriculum learning, and transductive ensemble learning to +train neural machine translation (NMT) models based on the deep Transformer-big +architecture. Furthermore, to explore whether large language model (LLM) can +help improve the translation quality of NMT systems, we use supervised +fine-tuning to train llama2-13b as an Automatic post-editing (APE) model to +improve the translation results of the NMT model on the multi-domain machine +translation task. By using these plyometric strategies, our submission achieves +a competitive result in the final evaluation. + +
+
+ comment: 14 pages, 2 figures, 6 Tables, CCMT2024. arXiv admin note: + substantial text overlap with arXiv:2409.14800 +
+
+
+
+
+ + ♻ ☆ An Empirical Study of AI Techniques in Mobile Applications + + +
+ The integration of artificial intelligence (AI) into mobile applications has +significantly transformed various domains, enhancing user experiences and +providing personalized services through advanced machine learning (ML) and deep +learning (DL) technologies. AI-driven mobile apps typically refer to +applications that leverage ML/DL technologies to perform key tasks such as +image recognition and natural language processing. In this paper, we conducted +the most extensive empirical study on AI applications, exploring on-device ML +apps, on-device DL apps, and AI service-supported (cloud-based) apps. Our study +encompasses 56,682 real-world AI applications, focusing on three crucial +perspectives: 1) Application analysis, where we analyze the popularity of AI +apps and investigate the update states of AI apps; 2) Framework and model +analysis, where we analyze AI framework usage and AI model protection; 3) User +analysis, where we examine user privacy protection and user review attitudes. +Our study has strong implications for AI app developers, users, and AI R\&D. On +one hand, our findings highlight the growing trend of AI integration in mobile +applications, demonstrating the widespread adoption of various AI frameworks +and models. On the other hand, our findings emphasize the need for robust model +protection to enhance app security. Additionally, our study highlights the +importance of user privacy and presents user attitudes towards the AI +technologies utilized in current AI apps. We provide our AI app dataset +(currently the most extensive AI app dataset) as an open-source resource for +future research on AI technologies utilized in mobile applications. + +
+
+ comment: This paper is accepted by the Journal of Systems and Software (JSS) + 2024 +
+
+
+
+
+ + ♻ ☆ Can-SAVE: Mass Cancer Risk Prediction via Survival Analysis Variables + and EHR + + +
+ Specific medical cancer screening methods are often costly, time-consuming, +and weakly applicable on a large scale. Advanced Artificial Intelligence (AI) +methods greatly help cancer detection but require specific or deep medical +data. These aspects prevent the mass implementation of cancer screening +methods. For this reason, it is a disruptive change for healthcare to apply AI +methods for mass personalized assessment of the cancer risk among patients +based on the existing Electronic Health Records (EHR) volume. This paper +presents a novel Can-SAVE cancer risk assessment method combining a survival +analysis approach with a gradient-boosting algorithm. It is highly accessible +and resource-efficient, utilizing only a sequence of high-level medical events. +We tested the proposed method in a long-term retrospective experiment covering +more than 1.1 million people and four regions of Russia. The Can-SAVE method +significantly exceeds the baselines by the Average Precision metric of +22.8%$\pm$2.7% vs 15.1%$\pm$2.6%. The extensive ablation study also confirmed +the proposed method's dominant performance. The experiment supervised by +oncologists shows a reliable cancer patient detection rate of up to 84 out of +1000 selected. Such results surpass the medical screening strategies estimates; +the typical age-specific Number Needed to Screen is only 9 out of 1000 (for +colorectal cancer). Overall, our experiments show a 4.7-6.4 times improvement +in cancer detection rate (TOP@1k) compared to the traditional healthcare risk +estimation approach. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Efficient Exploration of Image Classifier Failures with Bayesian + Optimization and Text-to-Image Models + + +
+ Image classifiers should be used with caution in the real world. Performance +evaluated on a validation set may not reflect performance in the real world. In +particular, classifiers may perform well for conditions that are frequently +encountered during training, but poorly for other infrequent conditions. In +this study, we hypothesize that recent advances in text-to-image generative +models make them valuable for benchmarking computer vision models such as image +classifiers: they can generate images conditioned by textual prompts that cause +classifier failures, allowing failure conditions to be described with textual +attributes. However, their generation cost becomes an issue when a large number +of synthetic images need to be generated, which is the case when many different +attribute combinations need to be tested. We propose an image classifier +benchmarking method as an iterative process that alternates image generation, +classifier evaluation, and attribute selection. This method efficiently +explores the attributes that ultimately lead to poor behavior detection. + +
+
+
+
+
+ + ♻ ☆ AlphaMath Almost Zero: Process Supervision without Process NeurIPS 2024 + + +
+ Although recent advancements in large language models (LLMs) have +significantly improved their performance on various tasks, they still face +challenges with complex and symbolic multi-step reasoning, particularly in +mathematical reasoning. To bolster the mathematical reasoning capabilities of +LLMs, most existing efforts concentrate on seeking assistance from either +domain experts or GPT-4 for high-quality process-supervised data, which is not +only expensive but also labor-intensive. In our study, we propose an innovative +framework, AlphaMath, that bypasses the need for process annotations (from +humans or GPTs) by leveraging Monte Carlo Tree Search (MCTS). This framework +focuses on unleashing the potential of a well-pretrained LLM to autonomously +enhance its mathematical reasoning. Specifically, we integrate a value model +with the LLM, automatically generating both process supervision and step-level +evaluation signals in MCTS. Furthermore, we propose an efficient inference +strategy, step-level beam search, where the value model is crafted to assist +the policy model (i.e., LLM) in navigating more effective reasoning paths, +rather than solely relying on prior probabilities. The experimental results on +both in-domain and out-of-domain datasets demonstrate that even without GPT-4 +or human-annotated process supervision, our AlphaMath framework achieves +comparable or superior results to previous state-of-the-art methods. + +
+
+ comment: Camera ready version for NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Step-level Value Preference Optimization for Mathematical Reasoning EMNLP2024 + + +
+ Direct Preference Optimization (DPO) using an implicit reward model has +proven to be an effective alternative to reinforcement learning from human +feedback (RLHF) for fine-tuning preference aligned large language models +(LLMs). However, the overall preference annotations of responses do not fully +capture the fine-grained quality of model outputs in complex multi-step +reasoning tasks, such as mathematical reasoning. To address this limitation, we +introduce a novel algorithm called Step-level Value Preference Optimization +(SVPO). Our approach employs Monte Carlo Tree Search (MCTS) to automatically +annotate step-level preferences for multi-step reasoning. Furthermore, from the +perspective of learning-to-rank, we train an explicit value model to replicate +the behavior of the implicit reward model, complementing standard preference +optimization. This value model enables the LLM to generate higher reward +responses with minimal cost during inference. Experimental results demonstrate +that our method achieves state-of-the-art performance on both in-domain and +out-of-domain mathematical reasoning benchmarks. Our code is available at +\url{https://github.com/MARIO-Math-Reasoning/Super_MARIO}. + +
+
+ comment: Camera ready version for EMNLP2024-Findings +
+
+
+
+
+ + ♻ ☆ CyberForce: A Federated Reinforcement Learning Framework for Malware + Mitigation + + +
+ Recent research has shown that the integration of Reinforcement Learning (RL) +with Moving Target Defense (MTD) can enhance cybersecurity in +Internet-of-Things (IoT) devices. Nevertheless, the practicality of existing +work is hindered by data privacy concerns associated with centralized data +processing in RL, and the unsatisfactory time needed to learn right MTD +techniques that are effective against a rising number of heterogeneous zero-day +attacks. Thus, this work presents CyberForce, a framework that combines +Federated and Reinforcement Learning (FRL) to collaboratively and privately +learn suitable MTD techniques for mitigating zero-day attacks. CyberForce +integrates device fingerprinting and anomaly detection to reward or penalize +MTD mechanisms chosen by an FRL-based agent. The framework has been deployed +and evaluated in a scenario consisting of ten physical devices of a real IoT +platform affected by heterogeneous malware samples. A pool of experiments has +demonstrated that CyberForce learns the MTD technique mitigating each attack +faster than existing RL-based centralized approaches. In addition, when various +devices are exposed to different attacks, CyberForce benefits from knowledge +transfer, leading to enhanced performance and reduced learning time in +comparison to recent works. Finally, different aggregation algorithms used +during the agent learning process provide CyberForce with notable robustness to +malicious attacks. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and + Proprioception Estimation CoRL 2024 + + +
+ Legged navigation is typically examined within open-world, off-road, and +challenging environments. In these scenarios, estimating external disturbances +requires a complex synthesis of multi-modal information. This underlines a +major limitation in existing works that primarily focus on avoiding obstacles. +In this work, we propose TOP-Nav, a novel legged navigation framework that +integrates a comprehensive path planner with Terrain awareness, Obstacle +avoidance and close-loop Proprioception. TOP-Nav underscores the synergies +between vision and proprioception in both path and motion planning. Within the +path planner, we present and integrate a terrain estimator that enables the +robot to select waypoints on terrains with higher traversability while +effectively avoiding obstacles. In the motion planning level, we not only +implement a locomotion controller to track the navigation commands, but also +construct a proprioception advisor to provide motion evaluations for the path +planner. Based on the close-loop motion feedback, we make online corrections +for the vision-based terrain and obstacle estimations. Consequently, TOP-Nav +achieves open-world navigation that the robot can handle terrains or +disturbances beyond the distribution of prior knowledge and overcomes +constraints imposed by visual conditions. Building upon extensive experiments +conducted in both simulation and real-world environments, TOP-Nav demonstrates +superior performance in open-world navigation compared to existing methods. + +
+
+ comment: Published on CoRL 2024 +
+
+
+
+
+ + ♻ ☆ SustainDC -- Benchmarking for Sustainable Data Center Control NeurIPS 2024 + + +
+ Machine learning has driven an exponential increase in computational demand, +leading to massive data centers that consume significant amounts of energy and +contribute to climate change. This makes sustainable data center control a +priority. In this paper, we introduce SustainDC, a set of Python environments +for benchmarking multi-agent reinforcement learning (MARL) algorithms for data +centers (DC). SustainDC supports custom DC configurations and tasks such as +workload scheduling, cooling optimization, and auxiliary battery management, +with multiple agents managing these operations while accounting for the effects +of each other. We evaluate various MARL algorithms on SustainDC, showing their +performance across diverse DC designs, locations, weather conditions, grid +carbon intensity, and workload requirements. Our results highlight significant +opportunities for improvement of data center operations using MARL algorithms. +Given the increasing use of DC due to AI, SustainDC provides a crucial platform +for the development and benchmarking of advanced algorithms essential for +achieving sustainable computing and addressing other heterogeneous real-world +challenges. + +
+
+ comment: Under review at Advances in Neural Information Processing Systems + 2024 (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Overcoming Growth-Induced Forgetting in Task-Agnostic Continual Learning + + +
+ In continual learning (CL), model growth enhances adaptability over new data, +improving knowledge retention for more tasks. However, improper model growth +can lead to severe degradation of previously learned knowledge, an issue we +name as growth-induced forgetting (GIFt), especially in task-agnostic CL using +entire grown model for inference. Existing works, despite adopting model growth +and random initialization for better adaptability, often fail to recognize the +presence of GIFt caused by improper model growth. This oversight limits +comprehensive control of forgetting and hinders full utilization of model +growth. We are the first in CL to identify this issue and conduct an in-depth +study on root cause of GIFt, where layer expansion stands out among model +growth strategies, widening layers without affecting model functionality. Yet, +direct adoption of layer expansion presents challenges. It lacks data-driven +control and initialization of expanded parameters to balance adaptability and +knowledge retention. This paper presents a novel SparseGrow approach to +overcome the issue of GIFt while enhancing adaptability over new data. +SparseGrow employs data-driven sparse layer expansion to control efficient +parameter usage during growth, reducing GIFt from excessive growth and +functionality changes. It also combines sparse growth with on-data +initialization at training late-stage to create partially 0-valued expansions +that fit learned distribution, enhancing retention and adaptability. To further +minimize forgetting, freezing is applied by calculating the sparse mask, +allowing data-driven preservation of important parameters. Through experiments +across datasets with various settings, cases, and task numbers, we demonstrate +the necessity of layer expansion and showcase the effectiveness of SparseGrow +in overcoming GIFt, highlighting its adaptability and knowledge retention for +incremental tasks. + +
+
+
+
+
+ + ♻ ☆ PromptKD: Distilling Student-Friendly Knowledge for Generative Language + Models via Prompt Tuning EMNLP 2024 + + +
+ Recent advancements in large language models (LLMs) have raised concerns +about inference costs, increasing the need for research into model compression. +While knowledge distillation (KD) is a prominent method for this, research on +KD for generative language models like LLMs is relatively sparse, and the +approach of distilling student-friendly knowledge, which has shown promising +performance in KD for classification models, remains unexplored in generative +language models. To explore this approach, we propose PromptKD, a simple yet +effective method that utilizes prompt tuning - for the first time in KD - to +enable generative language models to transfer student-friendly knowledge. +Unlike previous works in classification that require fine-tuning the entire +teacher model for extracting student-friendly knowledge, PromptKD achieves +similar effects by adding a small number of prompt tokens and tuning only the +prompt with student guidance. Extensive experiments on instruction-following +datasets show that PromptKD achieves state-of-the-art performance while adding +only 0.0007% of the teacher's parameters as prompts. Further analysis suggests +that distilling student-friendly knowledge alleviates exposure bias effectively +throughout the entire training process, leading to performance enhancements. + +
+
+ comment: EMNLP 2024 Findings. Our project page: https://promptkd.github.io +
+
+
+
+
+ + ♻ ☆ A Survey of Out-of-distribution Generalization for Graph Machine + Learning from a Causal View + + +
+ Graph machine learning (GML) has been successfully applied across a wide +range of tasks. Nonetheless, GML faces significant challenges in generalizing +over out-of-distribution (OOD) data, which raises concerns about its wider +applicability. Recent advancements have underscored the crucial role of +causality-driven approaches in overcoming these generalization challenges. +Distinct from traditional GML methods that primarily rely on statistical +dependencies, causality-focused strategies delve into the underlying causal +mechanisms of data generation and model prediction, thus significantly +improving the generalization of GML across different environments. This paper +offers a thorough review of recent progress in causality-involved GML +generalization. We elucidate the fundamental concepts of employing causality to +enhance graph model generalization and categorize the various approaches, +providing detailed descriptions of their methodologies and the connections +among them. Furthermore, we explore the incorporation of causality in other +related important areas of trustworthy GML, such as explanation, fairness, and +robustness. Concluding with a discussion on potential future research +directions, this review seeks to articulate the continuing development and +future potential of causality in enhancing the trustworthiness of graph machine +learning. + +
+
+ comment: 15 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models NIPS 2024 + + +
+ Diffusion models have revolutionized customized text-to-image generation, +allowing for efficient synthesis of photos from personal data with textual +descriptions. However, these advancements bring forth risks including privacy +breaches and unauthorized replication of artworks. Previous researches +primarily center around using prompt-specific methods to generate adversarial +examples to protect personal images, yet the effectiveness of existing methods +is hindered by constrained adaptability to different prompts. In this paper, we +introduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for +customized diffusion models. PAP first models the prompt distribution using a +Laplace Approximation, and then produces prompt-agnostic perturbations by +maximizing a disturbance expectation based on the modeled distribution. This +approach effectively tackles the prompt-agnostic attacks, leading to improved +defense stability. Extensive experiments in face privacy and artistic style +protection, demonstrate the superior generalization of PAP in comparison to +existing techniques. Our project page is available at +https://github.com/vancyland/Prompt-Agnostic-Adversarial-Perturbation-for-Customized-Diffusion-Models.github.io. + +
+
+ comment: Accepted by NIPS 2024 +
+
+
+
+
+ + ♻ ☆ Hierarchical Decoupling Capacitor Optimization for Power Distribution + Network of 2.5D ICs with Co-Analysis of Frequency and Time Domains Based on + Deep Reinforcement Learning + + +
+ With the growing need for higher memory bandwidth and computation density, +2.5D design, which involves integrating multiple chiplets onto an interposer, +emerges as a promising solution. However, this integration introduces +significant challenges due to increasing data rates and a large number of I/Os, +necessitating advanced optimization of the power distribution networks (PDNs) +both on-chip and on-interposer to mitigate the small signal noise and +simultaneous switching noise (SSN). Traditional PDN optimization strategies in +2.5D systems primarily focus on reducing impedance by integrating decoupling +capacitors (decaps) to lessen small signal noises. Unfortunately, relying +solely on frequency-domain analysis has been proven inadequate for addressing +coupled SSN, as indicated by our experimental results. In this work, we +introduce a novel two-phase optimization flow using deep reinforcement learning +to tackle both the on-chip small signal noise and SSN. Initially, we optimize +the impedance in the frequency domain to maintain the small signal noise within +acceptable limits while avoiding over-design. Subsequently, in the time domain, +we refine the PDN to minimize the voltage violation integral (VVI), a more +accurate measure of SSN severity. To the best of our knowledge, this is the +first dual-domain optimization strategy that simultaneously addresses both the +small signal noise and SSN propagation through strategic decap placement in +on-chip and on-interposer PDNs, offering a significant step forward in the +design of robust PDNs for 2.5D integrated systems. + +
+
+ comment: The data needs to be experimentally revalidated, and the experimental + details require further optimization +
+
+
+
+
+ + ♻ ☆ A Survey on In-context Learning + + +
+ With the increasing capabilities of large language models (LLMs), in-context +learning (ICL) has emerged as a new paradigm for natural language processing +(NLP), where LLMs make predictions based on contexts augmented with a few +examples. It has been a significant trend to explore ICL to evaluate and +extrapolate the ability of LLMs. In this paper, we aim to survey and summarize +the progress and challenges of ICL. We first present a formal definition of ICL +and clarify its correlation to related studies. Then, we organize and discuss +advanced techniques, including training strategies, prompt designing +strategies, and related analysis. Additionally, we explore various ICL +application scenarios, such as data engineering and knowledge updating. +Finally, we address the challenges of ICL and suggest potential directions for +further research. We hope that our work can encourage more research on +uncovering how ICL works and improving ICL. + +
+
+ comment: Update +
+
+
+
+
+ + ♻ ☆ Simple Drop-in LoRA Conditioning on Attention Layers Will Improve Your + Diffusion Model + + +
+ Current state-of-the-art diffusion models employ U-Net architectures +containing convolutional and (qkv) self-attention layers. The U-Net processes +images while being conditioned on the time embedding input for each sampling +step and the class or caption embedding input corresponding to the desired +conditional generation. Such conditioning involves scale-and-shift operations +to the convolutional layers but does not directly affect the attention layers. +While these standard architectural choices are certainly effective, not +conditioning the attention layers feels arbitrary and potentially suboptimal. +In this work, we show that simply adding LoRA conditioning to the attention +layers without changing or tuning the other parts of the U-Net architecture +improves the image generation quality. For example, a drop-in addition of LoRA +conditioning to EDM diffusion model yields FID scores of 1.91/1.75 for +unconditional and class-conditional CIFAR-10 generation, improving upon the +baseline of 1.97/1.79. + +
+
+
+
+
+ + ♻ ☆ RoCOCO: Robustness Benchmark of MS-COCO to Stress-test Image-Text + Matching Models ECCV + + +
+ With the extensive use of vision-language models in various downstream tasks, +evaluating their robustness is crucial. In this paper, we propose a benchmark +for assessing the robustness of vision-language models. We believe that a +robust model should properly understand both linguistic and visual semantics +and be resilient to explicit variations. In pursuit of this goal, we create new +variants of texts and images in the MS-COCO test set and re-evaluate the +state-of-the-art (SOTA) models with the new data. Specifically, we alter the +meaning of text by replacing a word, and generate visually altered images that +maintain some visual context while introducing noticeable pixel changes through +image mixing techniques.Our evaluations on the proposed benchmark reveal +substantial performance degradation in many SOTA models (e.g., Image-to-Text +Recall@1: 81.9\% $\rightarrow$ 48.4\% in BLIP, 66.1\% $\rightarrow$ 37.6\% in +VSE$\infty$), with the models often favoring the altered texts/images over the +original ones. This indicates the current vision-language models struggle with +subtle changes and often fail to understand the overall context of texts and +images. Based on these findings, we propose semantic contrastive loss and +visual contrastive loss to learn more robust embedding. Datasets and code are +available at {\url{https://github.com/pseulki/rococo}}. + +
+
+ comment: Accepted to ECCV Synthetic Data for Computer Vision Workshop (Oral) +
+
+
+
+
+
+
+
+ + Computation and Language 89 + +
+
+
+ + ☆ LML: Language Model Learning a Dataset for Data-Augmented Prediction + + +
+ This paper introduces a new approach to using Large Language Models (LLMs) +for classification tasks, which are typically handled using Machine Learning +(ML) models. Unlike ML models that rely heavily on data cleaning and feature +engineering, this method streamlines the process using LLMs. This paper +proposes a new concept called "Language Model Learning (LML)" powered by a new +method called "Data-Augmented Prediction (DAP)". The classification is +performed by LLMs using a method similar to humans manually exploring and +understanding the data and deciding classifications using data as a reference. +Training data is summarized and evaluated to determine the features that lead +to the classification of each label the most. In the process of DAP, the system +uses the data summary to automatically create a query, which is used to +retrieve relevant rows from the dataset. A classification is generated by the +LLM using data summary and relevant rows, ensuring satisfactory accuracy even +with complex data. Usage of data summary and similar data in DAP ensures +context-aware decision-making. The proposed method uses the words "Act as an +Explainable Machine Learning Model" in the prompt to enhance the +interpretability of the predictions by allowing users to review the logic +behind each prediction. In some test cases, the system scored an accuracy above +90%, proving the effectiveness of the system and its potential to outperform +conventional ML models in various scenarios. The code is available at +https://github.com/Pro-GenAI/LML-DAP + +
+
+ comment: First version +
+
+
+
+
+ + ☆ Ruler: A Model-Agnostic Method to Control Generated Length for Large + Language Models + + +
+ The instruction-following ability of large language models enables humans to +interact with AI agents in a natural way. However, when required to generate +responses of a specific length, large language models often struggle to meet +users' needs due to their inherent difficulty in accurately perceiving +numerical constraints. To explore the ability of large language models to +control the length of generated responses, we propose the Target Length +Generation Task (TLG) and design two metrics, Precise Match (PM) and Flexible +Match (FM) to evaluate the model's performance in adhering to specified +response lengths. Furthermore, we introduce a novel, model-agnostic approach +called Ruler, which employs Meta Length Tokens (MLTs) to enhance the +instruction-following ability of large language models under length-constrained +instructions. Specifically, Ruler equips LLMs with the ability to generate +responses of a specified length based on length constraints within the +instructions. Moreover, Ruler can automatically generate appropriate MLT when +length constraints are not explicitly provided, demonstrating excellent +versatility and generalization. Comprehensive experiments show the +effectiveness of Ruler across different LLMs on Target Length Generation Task, +e.g., at All Level 27.97 average gain on PM, 29.57 average gain on FM. In +addition, we conduct extensive ablation experiments to further substantiate the +efficacy and generalization of Ruler. Our code and data is available at +https://github.com/Geaming2002/Ruler. + +
+
+
+
+
+ + ☆ AIPatient: Simulating Patients with EHRs and LLM Powered Agentic + Workflow + + +
+ Simulated patient systems play a crucial role in modern medical education and +research, providing safe, integrative learning environments and enabling +clinical decision-making simulations. Large Language Models (LLM) could advance +simulated patient systems by replicating medical conditions and patient-doctor +interactions with high fidelity and low cost. However, ensuring the +effectiveness and trustworthiness of these systems remains a challenge, as they +require a large, diverse, and precise patient knowledgebase, along with a +robust and stable knowledge diffusion to users. Here, we developed AIPatient, +an advanced simulated patient system with AIPatient Knowledge Graph (AIPatient +KG) as the input and the Reasoning Retrieval-Augmented Generation (Reasoning +RAG) agentic workflow as the generation backbone. AIPatient KG samples data +from Electronic Health Records (EHRs) in the Medical Information Mart for +Intensive Care (MIMIC)-III database, producing a clinically diverse and +relevant cohort of 1,495 patients with high knowledgebase validity (F1 0.89). +Reasoning RAG leverages six LLM powered agents spanning tasks including +retrieval, KG query generation, abstraction, checker, rewrite, and +summarization. This agentic framework reaches an overall accuracy of 94.15% in +EHR-based medical Question Answering (QA), outperforming benchmarks that use +either no agent or only partial agent integration. Our system also presents +high readability (median Flesch Reading Ease 77.23; median Flesch Kincaid Grade +5.6), robustness (ANOVA F-value 0.6126, p<0.1), and stability (ANOVA F-value +0.782, p<0.1). The promising performance of the AIPatient system highlights its +potential to support a wide range of applications, including medical education, +model evaluation, and system integration. + +
+
+ comment: 42 pages, 6 figures, 7 tables +
+
+
+
+
+ + ☆ Soft Measures for Extracting Causal Collective Intelligence EMNLP 2024 + + +
+ Understanding and modeling collective intelligence is essential for +addressing complex social systems. Directed graphs called fuzzy cognitive maps +(FCMs) offer a powerful tool for encoding causal mental models, but extracting +high-integrity FCMs from text is challenging. This study presents an approach +using large language models (LLMs) to automate FCM extraction. We introduce +novel graph-based similarity measures and evaluate them by correlating their +outputs with human judgments through the Elo rating system. Results show +positive correlations with human evaluations, but even the best-performing +measure exhibits limitations in capturing FCM nuances. Fine-tuning LLMs +improves performance, but existing measures still fall short. This study +highlights the need for soft similarity measures tailored to FCM extraction, +advancing collective intelligence modeling with NLP. + +
+
+ comment: Camera-ready version accepted for publication in the EMNLP 2024 + Workshop NLP4Science +
+
+
+
+
+ + ☆ IDGen: Item Discrimination Induced Prompt Generation for LLM Evaluation NeurIPS 2024 + + +
+ As Large Language Models (LLMs) grow increasingly adept at managing complex +tasks, the evaluation set must keep pace with these advancements to ensure it +remains sufficiently discriminative. Item Discrimination (ID) theory, which is +widely used in educational assessment, measures the ability of individual test +items to differentiate between high and low performers. Inspired by this +theory, we propose an ID-induced prompt synthesis framework for evaluating LLMs +to ensure the evaluation set can continually update and refine according to +model abilities. Our data synthesis framework prioritizes both breadth and +specificity. It can generate prompts that comprehensively evaluate the +capabilities of LLMs while revealing meaningful performance differences between +models, allowing for effective discrimination of their relative strengths and +weaknesses across various tasks and domains. To produce high-quality data, we +incorporate a self-correct mechanism into our generalization framework, and +develop two models to predict prompt discrimination and difficulty score to +facilitate our data synthesis framework, contributing valuable tools to +evaluation data synthesis research. We apply our generated data to evaluate +five SOTA models. Our data achieves an average score of 51.92, accompanied by a +variance of 10.06. By contrast, previous works (i.e., SELF-INSTRUCT and +WizardLM) obtain an average score exceeding 67, with a variance below 3.2. The +results demonstrate that the data generated by our framework is more +challenging and discriminative compared to previous works. We will release a +dataset of over 3,000 carefully crafted prompts to facilitate evaluation +research of LLMs. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Suicide Phenotyping from Clinical Notes in Safety-Net Psychiatric + Hospital Using Multi-Label Classification with Pre-Trained Language Models + + +
+ Accurate identification and categorization of suicidal events can yield +better suicide precautions, reducing operational burden, and improving care +quality in high-acuity psychiatric settings. Pre-trained language models offer +promise for identifying suicidality from unstructured clinical narratives. We +evaluated the performance of four BERT-based models using two fine-tuning +strategies (multiple single-label and single multi-label) for detecting +coexisting suicidal events from 500 annotated psychiatric evaluation notes. The +notes were labeled for suicidal ideation (SI), suicide attempts (SA), exposure +to suicide (ES), and non-suicidal self-injury (NSSI). RoBERTa outperformed +other models using binary relevance (acc=0.86, F1=0.78). MentalBERT (F1=0.74) +also exceeded BioClinicalBERT (F1=0.72). RoBERTa fine-tuned with a single +multi-label classifier further improved performance (acc=0.88, F1=0.81), +highlighting that models pre-trained on domain-relevant data and the single +multi-label classification strategy enhance efficiency and performance. + Keywords: EHR-based Phynotyping; Natural Language Processing; Secondary Use +of EHR Data; Suicide Classification; BERT-based Model; Psychiatry; Mental +Health + +
+
+ comment: submitted to AMIA Informatics Summit 2025 as a conference paper +
+
+
+
+
+ + ☆ Individuation in Neural Models with and without Visual Grounding + + +
+ We show differences between a language-and-vision model CLIP and two +text-only models - FastText and SBERT - when it comes to the encoding of +individuation information. We study latent representations that CLIP provides +for substrates, granular aggregates, and various numbers of objects. We +demonstrate that CLIP embeddings capture quantitative differences in +individuation better than models trained on text-only data. Moreover, the +individuation hierarchy we deduce from the CLIP embeddings agrees with the +hierarchies proposed in linguistics and cognitive science. + +
+
+
+
+
+ + ☆ Local Transcription Models in Home Care Nursing in Switzerland: an + Interdisciplinary Case Study + + +
+ Latest advances in the field of natural language processing (NLP) enable new +use cases for different domains, including the medical sector. In particular, +transcription can be used to support automation in the nursing documentation +process and give nurses more time to interact with the patients. However, +different challenges including (a) data privacy, (b) local languages and +dialects, and (c) domain-specific vocabulary need to be addressed. In this case +study, we investigate the case of home care nursing documentation in +Switzerland. We assessed different transcription tools and models, and +conducted several experiments with OpenAI Whisper, involving different +variations of German (i.e., dialects, foreign accent) and manually curated +example texts by a domain expert of home care nursing. Our results indicate +that even the used out-of-the-box model performs sufficiently well to be a good +starting point for future research in the field. + +
+
+
+
+
+ + ☆ LLMs4Synthesis: Leveraging Large Language Models for Scientific + Synthesis + + +
+ In response to the growing complexity and volume of scientific literature, +this paper introduces the LLMs4Synthesis framework, designed to enhance the +capabilities of Large Language Models (LLMs) in generating high-quality +scientific syntheses. This framework addresses the need for rapid, coherent, +and contextually rich integration of scientific insights, leveraging both +open-source and proprietary LLMs. It also examines the effectiveness of LLMs in +evaluating the integrity and reliability of these syntheses, alleviating +inadequacies in current quantitative metrics. Our study contributes to this +field by developing a novel methodology for processing scientific papers, +defining new synthesis types, and establishing nine detailed quality criteria +for evaluating syntheses. The integration of LLMs with reinforcement learning +and AI feedback is proposed to optimize synthesis quality, ensuring alignment +with established criteria. The LLMs4Synthesis framework and its components are +made available, promising to enhance both the generation and evaluation +processes in scientific research synthesis. + +
+
+ comment: 12 pages, 3 figures, Accepted to JCDL 2024 Research Track +
+
+
+
+
+ + A Survey on the Honesty of Large Language Models + + +
+ Honesty is a fundamental principle for aligning large language models (LLMs) +with human values, requiring these models to recognize what they know and don't +know and be able to faithfully express their knowledge. Despite promising, +current LLMs still exhibit significant dishonest behaviors, such as confidently +presenting wrong answers or failing to express what they know. In addition, +research on the honesty of LLMs also faces challenges, including varying +definitions of honesty, difficulties in distinguishing between known and +unknown knowledge, and a lack of comprehensive understanding of related +research. To address these issues, we provide a survey on the honesty of LLMs, +covering its clarification, evaluation approaches, and strategies for +improvement. Moreover, we offer insights for future research, aiming to inspire +further exploration in this important area. + +
+
+ comment: Project Page: https://github.com/SihengLi99/LLM-Honesty-Survey +
+
+
+
+
+ + ☆ Charting the Future: Using Chart Question-Answering for Scalable + Evaluation of LLM-Driven Data Visualizations + + +
+ We propose a novel framework that leverages Visual Question Answering (VQA) +models to automate the evaluation of LLM-generated data visualizations. +Traditional evaluation methods often rely on human judgment, which is costly +and unscalable, or focus solely on data accuracy, neglecting the effectiveness +of visual communication. By employing VQA models, we assess data representation +quality and the general communicative clarity of charts. Experiments were +conducted using two leading VQA benchmark datasets, ChartQA and PlotQA, with +visualizations generated by OpenAI's GPT-3.5 Turbo and Meta's Llama 3.1 +70B-Instruct models. Our results indicate that LLM-generated charts do not +match the accuracy of the original non-LLM-generated charts based on VQA +performance measures. Moreover, while our results demonstrate that few-shot +prompting significantly boosts the accuracy of chart generation, considerable +progress remains to be made before LLMs can fully match the precision of +human-generated graphs. This underscores the importance of our work, which +expedites the research process by enabling rapid iteration without the need for +human annotation, thus accelerating advancements in this field. + +
+
+
+
+
+ + ☆ Cross-Domain Keyword Extraction with Keyness Patterns + + +
+ Domain dependence and annotation subjectivity pose challenges for supervised +keyword extraction. Based on the premises that second-order keyness patterns +are existent at the community level and learnable from annotated keyword +extraction datasets, this paper proposes a supervised ranking approach to +keyword extraction that ranks keywords with keyness patterns consisting of +independent features (such as sublanguage domain and term length) and three +categories of dependent features -- heuristic features, specificity features, +and representavity features. The approach uses two convolutional-neural-network +based models to learn keyness patterns from keyword datasets and overcomes +annotation subjectivity by training the two models with bootstrap sampling +strategy. Experiments demonstrate that the approach not only achieves +state-of-the-art performance on ten keyword datasets in general supervised +keyword extraction with an average top-10-F-measure of 0.316 , but also robust +cross-domain performance with an average top-10-F-measure of 0.346 on four +datasets that are excluded in the training process. Such cross-domain +robustness is attributed to the fact that community-level keyness patterns are +limited in number and temperately independent of language domains, the +distinction between independent features and dependent features, and the +sampling training strategy that balances excess risk and lack of negative +training data. + +
+
+ comment: 26 pages, 14 figures +
+
+
+
+
+ + ☆ Read Over the Lines: Attacking LLMs and Toxicity Detection Systems with + ASCII Art to Mask Profanity + + +
+ We introduce a novel family of adversarial attacks that exploit the inability +of language models to interpret ASCII art. To evaluate these attacks, we +propose the ToxASCII benchmark and develop two custom ASCII art fonts: one +leveraging special tokens and another using text-filled letter shapes. Our +attacks achieve a perfect 1.0 Attack Success Rate across ten models, including +OpenAI's o1-preview and LLaMA 3.1. + Warning: this paper contains examples of toxic language used for research +purposes. + +
+
+
+
+
+ + ☆ KALE-LM: Unleash The Power Of AI For Science Via Knowledge And Logic + Enhanced Large Model + + +
+ Artificial intelligence is gradually demonstrating its immense potential, and +increasing attention is being given to how AI can be harnessed to advance +scientific research. In this vision paper, we present our perspectives on how +AI can better assist scientific inquiry and explore corresponding technical +approach. We have proposed and open-sourced a large model of our KALE-LM model +series, Llama3-KALE-LM-Chem-8B, which has achieved outstanding performance in +tasks related to the field of chemistry. We hope that our work serves as a +strong starting point, helping to realize more intelligent AI and promoting the +advancement of human science and technology, as well as societal development. + +
+
+
+
+
+ + ☆ Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large + Language Models EMNLP24 + + +
+ Various audio-LLMs (ALLMs) have been explored recently for tackling different +audio tasks simultaneously using a single, unified model. While existing +evaluations of ALLMs primarily focus on single-audio tasks, real-world +applications often involve processing multiple audio streams simultaneously. To +bridge this gap, we propose the first multi-audio evaluation (MAE) benchmark +that consists of 20 datasets from 11 multi-audio tasks encompassing both speech +and sound scenarios. Comprehensive experiments on MAE demonstrate that the +existing ALLMs, while being powerful in comprehending primary audio elements in +individual audio inputs, struggling to handle multi-audio scenarios. To this +end, we propose a novel multi-audio-LLM (MALLM) to capture audio context among +multiple similar audios using discriminative learning on our proposed synthetic +data. The results demonstrate that the proposed MALLM outperforms all baselines +and achieves high data efficiency using synthetic data without requiring human +annotations. The proposed MALLM opens the door for ALLMs towards multi-audio +processing era and brings us closer to replicating human auditory capabilities +in machines. + +
+
+ comment: EMNLP24 Findings +
+
+
+
+
+ + ☆ "Why" Has the Least Side Effect on Model Editing + + +
+ Training large language models (LLMs) from scratch is an expensive endeavor, +particularly as world knowledge continually evolves. To maintain relevance and +accuracy of LLMs, model editing has emerged as a pivotal research area. While +these methods hold promise, they can also produce unintended side effects. +Their underlying factors and causes remain largely unexplored. This paper +delves into a critical factor-question type-by categorizing model editing +questions. Our findings reveal that the extent of performance degradation +varies significantly across different question types, providing new insights +for experimental design in knowledge editing. Furthermore, we investigate +whether insights from smaller models can be extrapolated to larger models. Our +results indicate discrepancies in findings between models of different sizes, +suggesting that insights from smaller models may not necessarily apply to +larger models. Additionally, we examine the impact of batch size on side +effects, discovering that increasing the batch size can mitigate performance +drops. + +
+
+
+
+
+ + ☆ Rehearsing Answers to Probable Questions with Perspective-Taking + + +
+ Question answering (QA) has been a long-standing focus in the NLP field, +predominantly addressing reading comprehension and common sense QA. However, +scenarios involving the preparation of answers to probable questions during +professional oral presentations remain underexplored. In this paper, we pioneer +the examination of this crucial yet overlooked topic by utilizing real-world QA +conversation transcripts between company managers and professional analysts. We +explore the proposed task using three causal knowledge graphs (KGs) and three +large language models (LLMs). This work provides foundational insights into the +application of LLMs in professional QA scenarios, highlighting the importance +of causal KGs and perspective-taking in generating effective responses. + +
+
+
+
+
+ + ☆ Co-Trained Retriever-Generator Framework for Question Generation in + Earnings Calls + + +
+ In diverse professional environments, ranging from academic conferences to +corporate earnings calls, the ability to anticipate audience questions stands +paramount. Traditional methods, which rely on manual assessment of an +audience's background, interests, and subject knowledge, often fall short - +particularly when facing large or heterogeneous groups, leading to imprecision +and inefficiency. While NLP has made strides in text-based question generation, +its primary focus remains on academic settings, leaving the intricate +challenges of professional domains, especially earnings call conferences, +underserved. Addressing this gap, our paper pioneers the multi-question +generation (MQG) task specifically designed for earnings call contexts. Our +methodology involves an exhaustive collection of earnings call transcripts and +a novel annotation technique to classify potential questions. Furthermore, we +introduce a retriever-enhanced strategy to extract relevant information. With a +core aim of generating a spectrum of potential questions that analysts might +pose, we derive these directly from earnings call content. Empirical +evaluations underscore our approach's edge, revealing notable excellence in the +accuracy, consistency, and perplexity of the questions generated. + +
+
+
+
+
+ + ☆ HiCuLR: Hierarchical Curriculum Learning for Rhetorical Role Labeling of + Legal Documents EMNLP 2024 + + +
+ Rhetorical Role Labeling (RRL) of legal documents is pivotal for various +downstream tasks such as summarization, semantic case search and argument +mining. Existing approaches often overlook the varying difficulty levels +inherent in legal document discourse styles and rhetorical roles. In this work, +we propose HiCuLR, a hierarchical curriculum learning framework for RRL. It +nests two curricula: Rhetorical Role-level Curriculum (RC) on the outer layer +and Document-level Curriculum (DC) on the inner layer. DC categorizes documents +based on their difficulty, utilizing metrics like deviation from a standard +discourse structure and exposes the model to them in an easy-to-difficult +fashion. RC progressively strengthens the model to discern +coarse-to-fine-grained distinctions between rhetorical roles. Our experiments +on four RRL datasets demonstrate the efficacy of HiCuLR, highlighting the +complementary nature of DC and RC. + +
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
+ + ☆ The Craft of Selective Prediction: Towards Reliable Case Outcome + Classification -- An Empirical Study on European Court of Human Rights Cases EMNLP + + +
+ In high-stakes decision-making tasks within legal NLP, such as Case Outcome +Classification (COC), quantifying a model's predictive confidence is crucial. +Confidence estimation enables humans to make more informed decisions, +particularly when the model's certainty is low, or where the consequences of a +mistake are significant. However, most existing COC works prioritize high task +performance over model reliability. This paper conducts an empirical +investigation into how various design choices including pre-training corpus, +confidence estimator and fine-tuning loss affect the reliability of COC models +within the framework of selective prediction. Our experiments on the +multi-label COC task, focusing on European Court of Human Rights (ECtHR) cases, +highlight the importance of a diverse yet domain-specific pre-training corpus +for better calibration. Additionally, we demonstrate that larger models tend to +exhibit overconfidence, Monte Carlo dropout methods produce reliable confidence +estimates, and confident error regularization effectively mitigates +overconfidence. To our knowledge, this is the first systematic exploration of +selective prediction in legal NLP. Our findings underscore the need for further +research on enhancing confidence measurement and improving the trustworthiness +of models in the legal domain. + +
+
+ comment: Accepted to EMNLP Findings +
+
+
+
+
+ + ☆ Incorporating Precedents for Legal Judgement Prediction on European + Court of Human Rights Cases EMNLP + + +
+ Inspired by the legal doctrine of stare decisis, which leverages precedents +(prior cases) for informed decision-making, we explore methods to integrate +them into LJP models. To facilitate precedent retrieval, we train a retriever +with a fine-grained relevance signal based on the overlap ratio of alleged +articles between cases. We investigate two strategies to integrate precedents: +direct incorporation at inference via label interpolation based on case +proximity and during training via a precedent fusion module using a +stacked-cross attention model. We employ joint training of the retriever and +LJP models to address latent space divergence between them. Our experiments on +LJP tasks from the ECHR jurisdiction reveal that integrating precedents during +training coupled with joint training of the retriever and LJP model, +outperforms models without precedents or with precedents incorporated only at +inference, particularly benefiting sparser articles. + +
+
+ comment: Accepted to EMNLP Findings +
+
+
+
+
+ + ☆ Model-based Preference Optimization in Abstractive Summarization without + Human Feedback EMNLP 2024 + + +
+ In abstractive summarization, the challenge of producing concise and accurate +summaries arises from the vast amount of information contained in the source +document. Consequently, although Large Language Models (LLMs) can generate +fluent text, they often introduce inaccuracies by hallucinating content not +found in the original source. While supervised fine-tuning methods that +maximize likelihood contribute to this issue, they do not consistently enhance +the faithfulness of the summaries. Preference-based optimization methods, such +as Direct Preference Optimization (DPO), can further refine the model to align +with human preferences. However, these methods still heavily depend on costly +human feedback. In this work, we introduce a novel and straightforward approach +called Model-based Preference Optimization (MPO) to fine-tune LLMs for improved +summarization abilities without any human feedback. By leveraging the model's +inherent summarization capabilities, we create a preference dataset that is +fully generated by the model using different decoding strategies. Our +experiments on standard summarization datasets and various metrics demonstrate +that our proposed MPO significantly enhances the quality of generated summaries +without relying on human feedback. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ☆ Do LLMs suffer from Multi-Party Hangover? A Diagnostic Approach to + Addressee Recognition and Response Selection in Conversations EMNLP 2024 + + +
+ Assessing the performance of systems to classify Multi-Party Conversations +(MPC) is challenging due to the interconnection between linguistic and +structural characteristics of conversations. Conventional evaluation methods +often overlook variances in model behavior across different levels of +structural complexity on interaction graphs. In this work, we propose a +methodological pipeline to investigate model performance across specific +structural attributes of conversations. As a proof of concept we focus on +Response Selection and Addressee Recognition tasks, to diagnose model +weaknesses. To this end, we extract representative diagnostic subdatasets with +a fixed number of users and a good structural variety from a large and open +corpus of online MPCs. We further frame our work in terms of data minimization, +avoiding the use of original usernames to preserve privacy, and propose +alternatives to using original text messages. Results show that response +selection relies more on the textual content of conversations, while addressee +recognition requires capturing their structural dimension. Using an LLM in a +zero-shot setting, we further highlight how sensitivity to prompt variations is +task-dependent. + +
+
+ comment: Accepted to EMNLP 2024 main conference +
+
+
+
+
+ + ☆ ASAG2024: A Combined Benchmark for Short Answer Grading + + +
+ Open-ended questions test a more thorough understanding than closed-ended +questions and are often a preferred assessment method. However, open-ended +questions are tedious to grade and subject to personal bias. Therefore, there +have been efforts to speed up the grading process through automation. Short +Answer Grading (SAG) systems aim to automatically score students' answers. +Despite growth in SAG methods and capabilities, there exists no comprehensive +short-answer grading benchmark across different subjects, grading scales, and +distributions. Thus, it is hard to assess the capabilities of current automated +grading methods in terms of their generalizability. In this preliminary work, +we introduce the combined ASAG2024 benchmark to facilitate the comparison of +automated grading systems. Combining seven commonly used short-answer grading +datasets in a common structure and grading scale. For our benchmark, we +evaluate a set of recent SAG methods, revealing that while LLM-based approaches +reach new high scores, they still are far from reaching human performance. This +opens up avenues for future research on human-machine SAG systems. + +
+
+ comment: Accepted at SIGCSE-Virtual 2024 +
+
+
+
+
+ + ☆ "Oh LLM, I'm Asking Thee, Please Give Me a Decision Tree": Zero-Shot + Decision Tree Induction and Embedding with Large Language Models + + +
+ Large language models (LLMs) provide powerful means to leverage prior +knowledge for predictive modeling when data is limited. In this work, we +demonstrate how LLMs can use their compressed world knowledge to generate +intrinsically interpretable machine learning models, i.e., decision trees, +without any training data. We find that these zero-shot decision trees can +surpass data-driven trees on some small-sized tabular datasets and that +embeddings derived from these trees perform on par with data-driven tree-based +embeddings on average. Our knowledge-driven decision tree induction and +embedding approaches therefore serve as strong new baselines for data-driven +machine learning methods in the low-data regime. + +
+
+
+
+
+ + ☆ Hit the Sweet Spot! Span-Level Ensemble for Large Language Models + + +
+ Ensembling various LLMs to unlock their complementary potential and leverage +their individual strengths is highly valuable. Previous studies typically focus +on two main paradigms: sample-level and token-level ensembles. Sample-level +ensemble methods either select or blend fully generated outputs, which hinders +dynamic correction and enhancement of outputs during the generation process. On +the other hand, token-level ensemble methods enable real-time correction +through fine-grained ensemble at each generation step. However, the information +carried by an individual token is quite limited, leading to suboptimal +decisions at each step. To address these issues, we propose SweetSpan, a +span-level ensemble method that effectively balances the need for real-time +adjustments and the information required for accurate ensemble decisions. Our +approach involves two key steps: First, we have each candidate model +independently generate candidate spans based on the shared prefix. Second, we +calculate perplexity scores to facilitate mutual evaluation among the candidate +models and achieve robust span selection by filtering out unfaithful scores. To +comprehensively evaluate ensemble methods, we propose a new challenging setting +(ensemble models with significant performance gaps) in addition to the standard +setting (ensemble the best-performing models) to assess the performance of +model ensembles in more realistic scenarios. Experimental results in both +standard and challenging settings across various language generation tasks +demonstrate the effectiveness, robustness, and versatility of our approach +compared with previous ensemble methods. + +
+
+
+
+
+ + ☆ Research on Predicting Public Opinion Event Heat Levels Based on Large + Language Models + + +
+ In recent years, with the rapid development of large language models, serval +models such as GPT-4o have demonstrated extraordinary capabilities, surpassing +human performance in various language tasks. As a result, many researchers have +begun exploring their potential applications in the field of public opinion +analysis. This study proposes a novel large-language-models-based method for +public opinion event heat level prediction. First, we preprocessed and +classified 62,836 Chinese hot event data collected between July 2022 and +December 2023. Then, based on each event's online dissemination heat index, we +used the MiniBatchKMeans algorithm to automatically cluster the events and +categorize them into four heat levels (ranging from low heat to very high +heat). Next, we randomly selected 250 events from each heat level, totalling +1,000 events, to build the evaluation dataset. During the evaluation process, +we employed various large language models to assess their accuracy in +predicting event heat levels in two scenarios: without reference cases and with +similar case references. The results showed that GPT-4o and DeepseekV2 +performed the best in the latter case, achieving prediction accuracies of 41.4% +and 41.5%, respectively. Although the overall prediction accuracy remains +relatively low, it is worth noting that for low-heat (Level 1) events, the +prediction accuracies of these two models reached 73.6% and 70.4%, +respectively. Additionally, the prediction accuracy showed a downward trend +from Level 1 to Level 4, which correlates with the uneven distribution of data +across the heat levels in the actual dataset. This suggests that with the more +robust dataset, public opinion event heat level prediction based on large +language models will have significant research potential for the future. + +
+
+ comment: conference +
+
+
+
+
+ + ☆ A Survey on Complex Tasks for Goal-Directed Interactive Agents + + +
+ Goal-directed interactive agents, which autonomously complete tasks through +interactions with their environment, can assist humans in various domains of +their daily lives. Recent advances in large language models (LLMs) led to a +surge of new, more and more challenging tasks to evaluate such agents. To +properly contextualize performance across these tasks, it is imperative to +understand the different challenges they pose to agents. To this end, this +survey compiles relevant tasks and environments for evaluating goal-directed +interactive agents, structuring them along dimensions relevant for +understanding current obstacles. An up-to-date compilation of relevant +resources can be found on our project website: +https://coli-saar.github.io/interactive-agents. + +
+
+
+
+
+ + ☆ EmoPro: A Prompt Selection Strategy for Emotional Expression in LM-based + Speech Synthesis + + +
+ Recent advancements in speech synthesis models, trained on extensive +datasets, have demonstrated remarkable zero-shot capabilities. These models can +control content, timbre, and emotion in generated speech based on prompt +inputs. Despite these advancements, the choice of prompts significantly impacts +the output quality, yet most existing selection schemes do not adequately +address the control of emotional intensity. To address this question, this +paper proposes a two-stage prompt selection strategy EmoPro, which is +specifically designed for emotionally controllable speech synthesis. This +strategy focuses on selecting highly expressive and high-quality prompts by +evaluating them from four perspectives: emotional expression strength, speech +quality, text-emotion consistency, and model generation performance. +Experimental results show that prompts selected using the proposed method +result in more emotionally expressive and engaging synthesized speech compared +to those obtained through baseline. Audio samples and codes will be available +at https://whyrrrrun.github.io/EmoPro/. + +
+
+
+
+
+ + ☆ Do We Need Domain-Specific Embedding Models? An Empirical Investigation + + +
+ Embedding models play a crucial role in representing and retrieving +information across various NLP applications. Recent advancements in Large +Language Models (LLMs) have further enhanced the performance of embedding +models, which are trained on massive amounts of text covering almost every +domain. These models are often benchmarked on general-purpose datasets like +Massive Text Embedding Benchmark (MTEB), where they demonstrate superior +performance. However, a critical question arises: Is the development of +domain-specific embedding models necessary when general-purpose models are +trained on vast corpora that already include specialized domain texts? In this +paper, we empirically investigate this question, choosing the finance domain as +an example. We introduce the Finance Massive Text Embedding Benchmark +(FinMTEB), a counterpart to MTEB that consists of financial domain-specific +text datasets. We evaluate the performance of seven state-of-the-art embedding +models on FinMTEB and observe a significant performance drop compared to their +performance on MTEB. To account for the possibility that this drop is driven by +FinMTEB's higher complexity, we propose four measures to quantify dataset +complexity and control for this factor in our analysis. Our analysis provides +compelling evidence that state-of-the-art embedding models struggle to capture +domain-specific linguistic and semantic patterns, even when trained on large +general-purpose corpora. This study sheds light on the necessity of developing +domain-specific embedding models in the LLM era, offering valuable insights for +researchers and practitioners. + +
+
+ comment: https://github.com/yixuantt/FinMTEB +
+
+
+
+
+ + ☆ Evaluation of OpenAI o1: Opportunities and Challenges of AGI + + +
+ This comprehensive study evaluates the performance of OpenAI's o1-preview +large language model across a diverse array of complex reasoning tasks, +spanning multiple domains, including computer science, mathematics, natural +sciences, medicine, linguistics, and social sciences. Through rigorous testing, +o1-preview demonstrated remarkable capabilities, often achieving human-level or +superior performance in areas ranging from coding challenges to scientific +reasoning and from language processing to creative problem-solving. Key +findings include: + -83.3% success rate in solving complex competitive programming problems, +surpassing many human experts. + -Superior ability in generating coherent and accurate radiology reports, +outperforming other evaluated models. + -100% accuracy in high school-level mathematical reasoning tasks, providing +detailed step-by-step solutions. + -Advanced natural language inference capabilities across general and +specialized domains like medicine. + -Impressive performance in chip design tasks, outperforming specialized +models in areas such as EDA script generation and bug analysis. + -Remarkable proficiency in anthropology and geology, demonstrating deep +understanding and reasoning in these specialized fields. + -Strong capabilities in quantitative investing. O1 has comprehensive +financial knowledge and statistical modeling skills. + -Effective performance in social media analysis, including sentiment analysis +and emotion recognition. + The model excelled particularly in tasks requiring intricate reasoning and +knowledge integration across various fields. While some limitations were +observed, including occasional errors on simpler problems and challenges with +certain highly specialized concepts, the overall results indicate significant +progress towards artificial general intelligence. + +
+
+
+
+
+ + ☆ URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological + and Multilingual Knowledge Base + + +
+ URIEL is a knowledge base offering geographical, phylogenetic, and +typological vector representations for 7970 languages. It includes distance +measures between these vectors for 4005 languages, which are accessible via the +lang2vec tool. Despite being frequently cited, URIEL is limited in terms of +linguistic inclusion and overall usability. To tackle these challenges, we +introduce URIEL+, an enhanced version of URIEL and lang2vec addressing these +limitations. In addition to expanding typological feature coverage for 2898 +languages, URIEL+ improves user experience with robust, customizable distance +calculations to better suit the needs of the users. These upgrades also offer +competitive performance on downstream tasks and provide distances that better +align with linguistic distance studies. + +
+
+
+
+
+ + ☆ Leveraging Long-Context Large Language Models for Multi-Document + Understanding and Summarization in Enterprise Applications + + +
+ The rapid increase in unstructured data across various fields has made +multi-document comprehension and summarization a critical task. Traditional +approaches often fail to capture relevant context, maintain logical +consistency, and extract essential information from lengthy documents. This +paper explores the use of Long-context Large Language Models (LLMs) for +multi-document summarization, demonstrating their exceptional capacity to grasp +extensive connections, provide cohesive summaries, and adapt to various +industry domains and integration with enterprise applications/systems. The +paper discusses the workflow of multi-document summarization for effectively +deploying long-context LLMs, supported by case studies in legal applications, +enterprise functions such as HR, finance, and sourcing, as well as in the +medical and news domains. These case studies show notable enhancements in both +efficiency and accuracy. Technical obstacles, such as dataset diversity, model +scalability, and ethical considerations like bias mitigation and factual +accuracy, are carefully analyzed. Prospective research avenues are suggested to +augment the functionalities and applications of long-context LLMs, establishing +them as pivotal tools for transforming information processing across diverse +sectors and enterprise applications. + +
+
+
+
+
+ + Exploring Language Model Generalization in Low-Resource Extractive QA + + +
+ In this paper, we investigate Extractive Question Answering (EQA) with Large +Language Models (LLMs) under domain drift, i.e., can LLMs generalize well to +closed-domains that require specific knowledge such as medicine and law in a +zero-shot fashion without additional in-domain training? To this end, we devise +a series of experiments to empirically explain the performance gap. Our +findings suggest that: a) LLMs struggle with dataset demands of closed-domains +such as retrieving long answer-spans; b) Certain LLMs, despite showing strong +overall performance, display weaknesses in meeting basic requirements as +discriminating between domain-specific senses of words which we link to +pre-processing decisions; c) Scaling model parameters is not always effective +for cross-domain generalization; and d) Closed-domain datasets are +quantitatively much different than open-domain EQA datasets and current LLMs +struggle to deal with them. Our findings point out important directions for +improving existing LLMs. + +
+
+
+
+
+ + ☆ Easy2Hard-Bench: Standardized Difficulty Labels for Profiling LLM + Performance and Generalization NeurIPS 2024 + + +
+ While generalization over tasks from easy to hard is crucial to profile +language models (LLMs), the datasets with fine-grained difficulty annotations +for each problem across a broad range of complexity are still blank. Aiming to +address this limitation, we present Easy2Hard-Bench, a consistently formatted +collection of 6 benchmark datasets spanning various domains, such as +mathematics and programming problems, chess puzzles, and reasoning questions. +Each problem within these datasets is annotated with numerical difficulty +scores. To systematically estimate problem difficulties, we collect abundant +performance data on attempts to each problem by humans in the real world or +LLMs on the prominent leaderboard. Leveraging the rich performance data, we +apply well-established difficulty ranking systems, such as Item Response Theory +(IRT) and Glicko-2 models, to uniformly assign numerical difficulty scores to +problems. Moreover, datasets in Easy2Hard-Bench distinguish themselves from +previous collections by a higher proportion of challenging problems. Through +extensive experiments with six state-of-the-art LLMs, we provide a +comprehensive analysis of their performance and generalization capabilities +across varying levels of difficulty, with the aim of inspiring future research +in LLM generalization. The datasets are available at +https://huggingface.co/datasets/furonghuang-lab/Easy2Hard-Bench. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Improving Multilingual ASR in the Wild Using Simple N-best Re-ranking + + +
+ Multilingual Automatic Speech Recognition (ASR) models are typically +evaluated in a setting where the ground-truth language of the speech utterance +is known, however, this is often not the case for most practical settings. +Automatic Spoken Language Identification (SLID) models are not perfect and +misclassifications have a substantial impact on the final ASR accuracy. In this +paper, we present a simple and effective N-best re-ranking approach to improve +multilingual ASR accuracy for several prominent acoustic models by employing +external features such as language models and text-based language +identification models. Our results on FLEURS using the MMS and Whisper models +show spoken language identification accuracy improvements of 8.7% and 6.1%, +respectively and word error rates which are 3.3% and 2.0% lower on these +benchmarks. + +
+
+
+
+
+ + ☆ VickreyFeedback: Cost-efficient Data Construction for Reinforcement + Learning from Human Feedback + + +
+ This paper addresses the cost-efficiency aspect of Reinforcement Learning +from Human Feedback (RLHF). RLHF leverages datasets of human preferences over +outputs of large language models (LLM) to instill human expectations into LLMs. +While preference annotation comes with a monetized cost, the economic utility +of a preference dataset has not been considered by far. What exacerbates this +situation is that given complex intransitive or cyclic relationships in +preference datasets, existing algorithms for fine-tuning LLMs are still far +from capturing comprehensive preferences. This raises severe cost-efficiency +concerns in production environments, where preference data accumulate over +time. In this paper, we see the fine-tuning of LLMs as a monetized economy and +introduce an auction mechanism to improve the efficiency of the preference data +collection in dollar terms. We show that introducing an auction mechanism can +play an essential role in enhancing the cost-efficiency of RLHF while +maintaining satisfactory model performance. Experimental results demonstrate +that our proposed auction-based protocol is cost-efficient for fine-tuning LLMs +by concentrating on high-quality feedback. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ SciDFM: A Large Language Model with Mixture-of-Experts for Science + + +
+ Recently, there has been a significant upsurge of interest in leveraging +large language models (LLMs) to assist scientific discovery. However, most LLMs +only focus on general science, while they lack domain-specific knowledge, such +as chemical molecules and amino acid sequences. To bridge these gaps, we +introduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and +is able to conduct college-level scientific reasoning and understand molecules +and amino acid sequences. We collect a large-scale training corpus containing +numerous scientific papers and books from different disciplines as well as data +from domain-specific databases. We further fine-tune the pre-trained model on +lots of instruction data to improve performances on downstream benchmarks. From +experiment results, we show that SciDFM achieves strong performance on general +scientific benchmarks such as SciEval and SciQ, and it reaches a SOTA +performance on domain-specific benchmarks among models of similar size. We +further analyze the expert layers and show that the results of expert selection +vary with data from different disciplines. To benefit the broader research +community, we open-source SciDFM at +https://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0. + +
+
+ comment: 12 pages, 1 figure, 9 tables. Technical Report, Under Review +
+
+
+
+
+ + ☆ Defect Prediction with Content-based Features + + +
+ Traditional defect prediction approaches often use metrics that measure the +complexity of the design or implementing code of a software system, such as the +number of lines of code in a source file. In this paper, we explore a different +approach based on content of source code. Our key assumption is that source +code of a software system contains information about its technical aspects and +those aspects might have different levels of defect-proneness. Thus, +content-based features such as words, topics, data types, and package names +extracted from a source code file could be used to predict its defects. We have +performed an extensive empirical evaluation and found that: i) such +content-based features have higher predictive power than code complexity +metrics and ii) the use of feature selection, reduction, and combination +further improves the prediction performance. + +
+
+
+
+
+ + ♻ ☆ BeanCounter: A low-toxicity, large-scale, and open dataset of + business-oriented text + + +
+ Many of the recent breakthroughs in language modeling have resulted from +scaling effectively the same model architecture to larger datasets. In this +vein, recent work has highlighted performance gains from increasing training +dataset size and quality, suggesting a need for novel sources of large-scale +datasets. In this work, we introduce BeanCounter, a public dataset consisting +of more than 159B tokens extracted from businesses' disclosures. We show that +this data is indeed novel: less than 0.1% of BeanCounter appears in Common +Crawl-based datasets and it is an order of magnitude larger than datasets +relying on similar sources. Given the data's provenance, we hypothesize that +BeanCounter is comparatively more factual and less toxic than web-based +datasets. Exploring this hypothesis, we find that many demographic identities +occur with similar prevalence in BeanCounter but with significantly less toxic +context relative to other datasets. To demonstrate the utility of BeanCounter, +we evaluate and compare two LLMs continually pre-trained on BeanCounter with +their base models. We find an 18-33% reduction in toxic generation and improved +performance within the finance domain for the continually pretrained models. +Collectively, our work suggests that BeanCounter is a novel source of +low-toxicity and high-quality domain-specific data with sufficient scale to +train multi-billion parameter LLMs. + +
+
+
+
+
+ + ♻ ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Fine Tuning vs. Retrieval Augmented Generation for Less Popular + Knowledge + + +
+ Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting +strong performance across diverse tasks and domains. However, it has been +observed that the performance diminishes when dealing with less-popular or +low-frequency concepts and entities, for example in domain specific +applications. The two prominent approaches to enhance the performance of LMs on +low-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning +(FT) over synthetic data. This paper explores and evaluates the impact of RAG +and FT on customizing LMs in handling low-frequency entities on question +answering tasks. We conduct extensive experiments on twelve LMs of varying size +and type and different fine tuning, data augmentation, and retrieval models. +Our findings indicate that while FT boosts the performance across entities of +varying popularity, RAG surpasses FT by a large margin particularly for least +popular factual knowledge. Additionally, the success of both RAG and FT +approaches is amplified by improving retrieval and data augmentation +techniques. Fine tuning, while beneficial for small LMs, requires extensive +resources. To address this issue, we propose the new Stimulus RAG approach that +surpasses the effectiveness of fine tuning based approaches, thereby +eliminating the need for the costly data augmentation and fine tuning step for +enriching LMs with less popular factual knowledge. + +
+
+
+
+
+ + ♻ ☆ Summarizing Radiology Reports Findings into Impressions + + +
+ Patient hand-off and triage are two fundamental problems in health care. +Often doctors must painstakingly summarize complex findings to efficiently +communicate with specialists and quickly make decisions on which patients have +the most urgent cases. In pursuit of these challenges, we present (1) a model +with state-of-art radiology report summarization performance using (2) a novel +method for augmenting medical data, and (3) an analysis of the model +limitations and radiology knowledge gain. We also provide a data processing +pipeline for future models developed on the the MIMIC CXR dataset. Our best +performing model was a fine-tuned BERT-to-BERT encoder-decoder with 58.75/100 +ROUGE-L F1, which outperformed specialized checkpoints with more sophisticated +attention mechanisms. We investigate these aspects in this work. + +
+
+ comment: This version reverts to the original preprint, following the advice + from the Artificial Intelligence in Health editorial office. The published + version is peer-reviewed and available in the journal (see external DOI). The + preprint remains unchanged to maintain version transparency, as noted in the + further disclosure section of the published article +
+
+
+
+
+ + ♻ ☆ Modulated Intervention Preference Optimization (MIPO): Keep the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Scaling Behavior for Large Language Models regarding Numeral Systems: An + Example using Pythia EMNLP 2024 + + +
+ Though Large Language Models (LLMs) have shown remarkable abilities in +mathematics reasoning, they are still struggling with performing numeric +operations accurately, such as addition and multiplication. Numbers can be +tokenized into tokens in various ways by different LLMs and affect the numeric +operations performance. Currently, there are two representatives: 1) Tokenize +into $1$-digit, and 2) Tokenize into $1\sim 3$ digit. The difference is roughly +equivalent to using different numeral systems (namely base $10$ or base +$10^{3}$). In light of this, we study the scaling behavior of different numeral +systems in the context of transformer-based large language models. We +empirically show that a base $10$ system is consistently more data-efficient +than a base $10^{2}$ or $10^{3}$ system across training data scale, model sizes +under from-scratch training settings, while different number systems have very +similar fine-tuning performances. We attribute this to higher token frequencies +of a base $10$ system. Additionally, we reveal extrapolation behavior patterns +on addition and multiplication. We identify that base $100$ and base $1000$ +systems struggle on token-level discernment and token-level operations. We also +sheds light on the mechanism learnt by the models. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Internalizing ASR with Implicit Chain of Thought for Efficient + Speech-to-Speech Conversational LLM + + +
+ Current speech-based LLMs are predominantly trained on extensive ASR and TTS +datasets, excelling in tasks related to these domains. However, their ability +to handle direct speech-to-speech conversations remains notably constrained. +These models often rely on an ASR-to-TTS chain-of-thought pipeline, converting +speech into text for processing before generating audio responses, which +introduces latency and loses audio features. We propose a method that +implicitly internalizes ASR chain of thought into a speech LLM, enhancing its +native speech understanding capabilities. Our approach reduces latency and +improves the model's native understanding of speech, paving the way for more +efficient and natural real-time audio interactions. We also release a +large-scale synthetic conversational dataset to facilitate further research. + +
+
+ comment: Corrected style from final to preprint +
+
+
+
+
+ + ♻ ☆ Plurals: A System for Guiding LLMs Via Simulated Social Ensembles + + +
+ Recent debates raised concerns that language models may favor certain +viewpoints. But what if the solution is not to aim for a 'view from nowhere' +but rather to leverage different viewpoints? We introduce Plurals, a system and +Python library for pluralistic AI deliberation. Plurals consists of Agents +(LLMs, optionally with personas) which deliberate within customizable +Structures, with Moderators overseeing deliberation. Plurals is a generator of +simulated social ensembles. Plurals integrates with government datasets to +create nationally representative personas, includes deliberation templates +inspired by democratic deliberation theory, and allows users to customize both +information-sharing structures and deliberation behavior within Structures. Six +case studies demonstrate fidelity to theoretical constructs and efficacy. Three +randomized experiments show simulated focus groups produced output resonant +with an online sample of the relevant audiences (chosen over zero-shot +generation in 75% of trials). Plurals is both a paradigm and a concrete system +for pluralistic AI. The Plurals library is available at +https://github.com/josh-ashkinaze/plurals and will be continually updated. + +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Cognitive State Classification from Speech with + Multi-View Pseudo-Labeling + + +
+ The lack of labeled data is a common challenge in speech classification +tasks, particularly those requiring extensive subjective assessment, such as +cognitive state classification. In this work, we propose a Semi-Supervised +Learning (SSL) framework, introducing a novel multi-view pseudo-labeling method +that leverages both acoustic and linguistic characteristics to select the most +confident data for training the classification model. Acoustically, unlabeled +data are compared to labeled data using the Frechet audio distance, calculated +from embeddings generated by multiple audio encoders. Linguistically, large +language models are prompted to revise automatic speech recognition +transcriptions and predict labels based on our proposed task-specific +knowledge. High-confidence data are identified when pseudo-labels from both +sources align, while mismatches are treated as low-confidence data. A bimodal +classifier is then trained to iteratively label the low-confidence data until a +predefined criterion is met. We evaluate our SSL framework on emotion +recognition and dementia detection tasks. Experimental results demonstrate that +our method achieves competitive performance compared to fully supervised +learning using only 30% of the labeled data and significantly outperforms two +selected baselines. + +
+
+
+
+
+ + ♻ ☆ M$^2$PT: Multimodal Prompt Tuning for Zero-shot Instruction Learning EMNLP 2024 + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable performance +across a wide range of domains, with increasing emphasis on enhancing their +zero-shot generalization capabilities for unseen tasks across various +modalities. Instruction tuning has emerged as an effective strategy for +achieving zero-shot generalization by finetuning pretrained models on diverse +multimodal tasks. As the scale of MLLMs continues to grow, parameter-efficient +finetuning becomes increasingly critical. However, most existing +parameter-efficient approaches focus only on single modalities and often +overlook the multimodal characteristics during finetuning. In this work, we +introduce a novel Multimodal Prompt Tuning (M$^2$PT) approach for efficient +instruction tuning of MLLMs. M$^2$PT effectively integrates visual and textual +prompts into the vision encoder and language processor respectively during +finetuning, facilitating the extraction and alignment of features across +modalities. Empirical results on various multimodal evaluation datasets +demonstrate the superior performance of our approach compared to several +state-of-the-art baselines. A comprehensive set of ablation studies validates +the effectiveness of our prompt design and the efficiency of our approach. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ LLM Detectors Still Fall Short of Real World: Case of LLM-Generated + Short News-Like Posts EMNLP + + +
+ With the emergence of widely available powerful LLMs, disinformation +generated by large Language Models (LLMs) has become a major concern. +Historically, LLM detectors have been touted as a solution, but their +effectiveness in the real world is still to be proven. In this paper, we focus +on an important setting in information operations -- short news-like posts +generated by moderately sophisticated attackers. + We demonstrate that existing LLM detectors, whether zero-shot or +purpose-trained, are not ready for real-world use in that setting. All tested +zero-shot detectors perform inconsistently with prior benchmarks and are highly +vulnerable to sampling temperature increase, a trivial attack absent from +recent benchmarks. A purpose-trained detector generalizing across LLMs and +unseen attacks can be developed, but it fails to generalize to new +human-written texts. + We argue that the former indicates domain-specific benchmarking is needed, +while the latter suggests a trade-off between the adversarial evasion +resilience and overfitting to the reference human text, with both needing +evaluation in benchmarks and currently absent. We believe this suggests a +re-consideration of current LLM detector benchmarking approaches and provides a +dynamically extensible benchmark to allow it +(https://github.com/Reliable-Information-Lab-HEVS/benchmark_llm_texts_detection). + +
+
+ comment: 20 pages, 7 tables, 13 figures, under consideration for EMNLP +
+
+
+
+
+ + ♻ ☆ Bridging the Social & Technical Divide in Augmentative and Alternative + Communication (AAC) Applications for Autistic Adults + + +
+ Natural Language Processing (NLP) techniques are being used more frequently +to improve high-tech Augmentative and Alternative Communication (AAC), but many +of these techniques are integrated without the inclusion of the users' +perspectives. Autistic adults are particularly neglected in the design of AAC +tools. We conducted in-depth interviews with 12 autistic adults to find the +pain points of current AAC and determine what technological advances they might +find helpful. We found that in addition to technological issues, there are many +societal issues as well. We found 9 different categories of themes from our +interviews: input flexibility, output flexibility, selecting or adapting AAC +for a good fit, when to start or swap AAC, benefits, access as an adult, +stumbling blocks for continued use, social concerns, and control of +communication. In this paper, we go through these categories in depth and then +suggest possible guidelines for developers, NLP researchers, and policy makers. + +
+
+
+
+
+ + ♻ ☆ Paraphrase Types Elicit Prompt Engineering Capabilities + + +
+ Much of the success of modern language models depends on finding a suitable +prompt to instruct the model. Until now, it has been largely unknown how +variations in the linguistic expression of prompts affect these models. This +study systematically and empirically evaluates which linguistic features +influence models through paraphrase types, i.e., different linguistic changes +at particular positions. We measure behavioral changes for five models across +120 tasks and six families of paraphrases (i.e., morphology, syntax, lexicon, +lexico-syntax, discourse, and others). We also control for other prompt +engineering factors (e.g., prompt length, lexical diversity, and proximity to +training data). Our results show a potential for language models to improve +tasks when their prompts are adapted in specific paraphrase types (e.g., 6.7% +median gain in Mixtral 8x7B; 5.5% in LLaMA 3 8B). In particular, changes in +morphology and lexicon, i.e., the vocabulary used, showed promise in improving +prompts. These findings contribute to developing more robust language models +capable of handling variability in linguistic expression. + +
+
+
+
+
+ + ♻ ☆ Interpretation of Intracardiac Electrograms Through Textual + Representations + + +
+ Understanding the irregular electrical activity of atrial fibrillation (AFib) +has been a key challenge in electrocardiography. For serious cases of AFib, +catheter ablations are performed to collect intracardiac electrograms (EGMs). +EGMs offer intricately detailed and localized electrical activity of the heart +and are an ideal modality for interpretable cardiac studies. Recent +advancements in artificial intelligence (AI) has allowed some works to utilize +deep learning frameworks to interpret EGMs during AFib. Additionally, language +models (LMs) have shown exceptional performance in being able to generalize to +unseen domains, especially in healthcare. In this study, we are the first to +leverage pretrained LMs for finetuning of EGM interpolation and AFib +classification via masked language modeling. We formulate the EGM as a textual +sequence and present competitive performances on AFib classification compared +against other representations. Lastly, we provide a comprehensive +interpretability study to provide a multi-perspective intuition of the model's +behavior, which could greatly benefit the clinical use. + +
+
+ comment: 17 pages, 7 figures; Accepted to CHIL 2024 +
+
+
+
+
+ + ♻ ☆ Improving Diversity of Commonsense Generation by Large Language Models + via In-Context Learning EMNLP 2024 + + +
+ Generative Commonsense Reasoning (GCR) requires a model to reason about a +situation using commonsense knowledge, while generating coherent sentences. +Although the quality of the generated sentences is crucial, the diversity of +the generation is equally important because it reflects the model's ability to +use a range of commonsense knowledge facts. Large Language Models (LLMs) have +shown proficiency in enhancing the generation quality across various tasks +through in-context learning (ICL) using given examples without the need for any +fine-tuning. However, the diversity aspect in LLM outputs has not been +systematically studied before. To address this, we propose a simple method that +diversifies the LLM generations, while preserving their quality. Experimental +results on three benchmark GCR datasets show that our method achieves an ideal +balance between the quality and diversity. Moreover, the sentences generated by +our proposed method can be used as training data to improve diversity in +existing commonsense generators. + +
+
+ comment: EMNLP 2024 Findings, Camera-ready version +
+
+
+
+
+ + ♻ ☆ Lego: Learning to Disentangle and Invert Personalized Concepts Beyond + Object Appearance in Text-to-Image Diffusion Models + + +
+ Text-to-Image (T2I) models excel at synthesizing concepts such as nouns, +appearances, and styles. To enable customized content creation based on a few +example images of a concept, methods such as Textual Inversion and DreamBooth +invert the desired concept and enable synthesizing it in new scenes. However, +inverting personalized concepts that go beyond object appearance and style +(adjectives and verbs) through natural language remains a challenge. Two key +characteristics of these concepts contribute to the limitations of current +inversion methods. 1) Adjectives and verbs are entangled with nouns (subject) +and can hinder appearance-based inversion methods, where the subject appearance +leaks into the concept embedding, and 2) describing such concepts often extends +beyond single word embeddings. + In this study, we introduce Lego, a textual inversion method designed to +invert subject-entangled concepts from a few example images. Lego disentangles +concepts from their associated subjects using a simple yet effective Subject +Separation step and employs a Context Loss that guides the inversion of +single/multi-embedding concepts. In a thorough user study, Lego-generated +concepts were preferred over 70% of the time when compared to the baseline in +terms of authentically generating concepts according to a reference. +Additionally, visual question answering using an LLM suggested Lego-generated +concepts are better aligned with the text description of the concept. + +
+
+
+
+
+ + ♻ ☆ A Chatbot for Asylum-Seeking Migrants in Europe ICTAI + + +
+ We present ACME: A Chatbot for asylum-seeking Migrants in Europe. ACME relies +on computational argumentation and aims to help migrants identify the highest +level of protection they can apply for. This would contribute to a more +sustainable migration by reducing the load on territorial commissions, Courts, +and humanitarian organizations supporting asylum applicants. We describe the +background context, system architecture, underlying technologies, and a case +study used to validate the tool with domain experts. + +
+
+ comment: Accepted for publication at IEEE International Conference on Tools + with Artificial Intelligence (ICTAI) @IEEE +
+
+
+
+
+ + ♻ ☆ Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in + Lifted Compiled Code + + +
+ Detecting vulnerabilities within compiled binaries is challenging due to lost +high-level code structures and other factors such as architectural +dependencies, compilers, and optimization options. To address these obstacles, +this research explores vulnerability detection using natural language +processing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn +semantics from intermediate representation (LLVM IR) code. Long short-term +memory (LSTM) neural networks were trained on embeddings from encoders created +using approximately 48k LLVM functions from the Juliet dataset. This study is +pioneering in its comparison of word2vec models with multiple bidirectional +transformers (BERT, RoBERTa) embeddings built using LLVM code to train neural +networks to detect vulnerabilities in compiled binaries. Word2vec Skip-Gram +models achieved 92% validation accuracy in detecting vulnerabilities, +outperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This +suggests that complex contextual embeddings may not provide advantages over +simpler word2vec models for this task when a limited number (e.g. 48K) of data +samples are used to train the bidirectional transformer-based models. The +comparative results provide novel insights into selecting optimal embeddings +for learning compiler-independent semantic code representations to advance +machine learning detection of vulnerabilities in compiled binaries. + +
+
+ comment: Updated with improvements +
+
+
+
+
+ + ♻ ☆ MultiPragEval: Multilingual Pragmatic Evaluation of Large Language + Models EMNLP 2024 + + +
+ As the capabilities of Large Language Models (LLMs) expand, it becomes +increasingly important to evaluate them beyond basic knowledge assessment, +focusing on higher-level language understanding. This study introduces +MultiPragEval, the first multilingual pragmatic evaluation of LLMs, designed +for English, German, Korean, and Chinese. Comprising 1200 question units +categorized according to Grice's Cooperative Principle and its four +conversational maxims, MultiPragEval enables an in-depth assessment of LLMs' +contextual awareness and their ability to infer implied meanings. Our findings +demonstrate that Claude3-Opus significantly outperforms other models in all +tested languages, establishing a state-of-the-art in the field. Among +open-source models, Solar-10.7B and Qwen1.5-14B emerge as strong competitors. +By analyzing pragmatic inference, we provide valuable insights into the +capabilities essential for advanced language comprehension in AI systems. + +
+
+ comment: The 2nd GenBench workshop on generalisation (benchmarking) in NLP - + EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ The Impact of Unstated Norms in Bias Analysis of Language Models + + +
+ Bias in large language models (LLMs) has many forms, from overt +discrimination to implicit stereotypes. Counterfactual bias evaluation is a +widely used approach to quantifying bias and often relies on template-based +probes that explicitly state group membership. It measures whether the outcome +of a task, performed by an LLM, is invariant to a change of group membership. +In this work, we find that template-based probes can lead to unrealistic bias +measurements. For example, LLMs appear to mistakenly cast text associated with +White race as negative at higher rates than other groups. We hypothesize that +this arises artificially via a mismatch between commonly unstated norms, in the +form of markedness, in the pretraining text of LLMs (e.g., Black president vs. +president) and templates used for bias measurement (e.g., Black president vs. +White president). The findings highlight the potential misleading impact of +varying group membership through explicit mention in counterfactual bias +quantification. + +
+
+ comment: 23 Pages, 5 Figures, 3 Tables +
+
+
+
+
+ + ♻ ☆ Dual-Layer Training and Decoding of Large Language Model with + Simultaneously Thinking and Speaking + + +
+ Large Language Model can reasonably understand and generate human expressions +but may lack of thorough thinking and reasoning mechanisms. Recently there have +been several studies which enhance the thinking ability of language models but +most of them are not data-driven or training-based. In this paper, we are +motivated by the cognitive mechanism in the natural world, and design a novel +model architecture called TaS which allows it to first consider the thoughts +and then express the response based upon the query. We design several pipelines +to annotate or generate the thought contents from prompt-response samples, then +add language heads in a middle layer which behaves as the thinking layer. We +train the language model by the thoughts-augmented data and successfully let +the thinking layer automatically generate reasonable thoughts and finally +output more reasonable responses. Both qualitative examples and quantitative +results validate the effectiveness and performance of TaS. Our code is +available at https://anonymous.4open.science/r/TadE. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Reward-Robust RLHF in LLMs + + +
+ As Large Language Models (LLMs) continue to progress toward more advanced +forms of intelligence, Reinforcement Learning from Human Feedback (RLHF) is +increasingly seen as a key pathway toward achieving Artificial General +Intelligence (AGI). However, the reliance on reward-model-based (RM-based) +alignment methods introduces significant challenges due to the inherent +instability and imperfections of Reward Models (RMs), which can lead to +critical issues such as reward hacking and misalignment with human intentions. +In this paper, we introduce a reward-robust RLHF framework aimed at addressing +these fundamental challenges, paving the way for more reliable and resilient +learning in LLMs. Our approach introduces a novel optimization objective that +carefully balances performance and robustness by incorporating Bayesian Reward +Model Ensembles (BRME) to model the uncertainty set of reward functions. This +allows the framework to integrate both nominal performance and minimum reward +signals, ensuring more stable learning even with imperfect RMs. Empirical +results demonstrate that our framework consistently outperforms baselines +across diverse benchmarks, showing improved accuracy and long-term stability. +We also provide a theoretical analysis, demonstrating that reward-robust RLHF +approaches the stability of constant reward settings, which proves to be +acceptable even in a stochastic-case analysis. Together, these contributions +highlight the framework potential to enhance both the performance and stability +of LLM alignment. + +
+
+
+
+
+ + ♻ ☆ Hierarchical Windowed Graph Attention Network and a Large Scale Dataset + for Isolated Indian Sign Language Recognition + + +
+ Automatic Sign Language (SL) recognition is an important task in the computer +vision community. To build a robust SL recognition system, we need a +considerable amount of data which is lacking particularly in Indian sign +language (ISL). In this paper, we introduce a large-scale isolated ISL dataset +and a novel SL recognition model based on skeleton graph structure. The dataset +covers 2002 daily used common words in the deaf community recorded by 20 (10 +male and 10 female) deaf adult signers (contains 40033 videos). We propose a SL +recognition model namely Hierarchical Windowed Graph Attention Network (HWGAT) +by utilizing the human upper body skeleton graph. The HWGAT tries to capture +distinctive motions by giving attention to different body parts induced by the +human skeleton graph. The utility of the proposed dataset and the usefulness of +our model are evaluated through extensive experiments. We pre-trained the +proposed model on the presented dataset and fine-tuned it across different sign +language datasets further boosting the performance of 1.10, 0.46, 0.78, and +6.84 percentage points on INCLUDE, LSA64, AUTSL and WLASL respectively compared +to the existing state-of-the-art keypoints-based models. + +
+
+
+
+
+ + ♻ ☆ EnterpriseEM: Fine-tuned Embeddings for Enterprise Semantic Search + + +
+ Enterprises grapple with the significant challenge of managing proprietary +unstructured data, hindering efficient information retrieval. This has led to +the emergence of AI-driven information retrieval solutions, designed to adeptly +extract relevant insights to address employee inquiries. These solutions often +leverage pre-trained embedding models and generative models as foundational +components. While pre-trained embeddings may exhibit proximity or disparity +based on their original training objectives, they might not fully align with +the unique characteristics of enterprise-specific data, leading to suboptimal +alignment with the retrieval goals of enterprise environments. In this paper, +we propose a comprehensive methodology for contextualizing pre-trained +embedding models to enterprise environments, covering the entire process from +data preparation to model fine-tuning and evaluation. By adapting the +embeddings to better suit the retrieval tasks prevalent in enterprises, we aim +to enhance the performance of information retrieval solutions. We discuss the +process of fine-tuning, its effect on retrieval accuracy, and the potential +benefits for enterprise information management. Our findings demonstrate the +efficacy of fine-tuned embedding models in improving the precision and +relevance of search results in enterprise settings. + +
+
+
+
+
+ + ♻ ☆ QPaug: Question and Passage Augmentation for Open-Domain Question + Answering of LLMs EMNLP + + +
+ Retrieval-augmented generation (RAG) has received much attention for +Open-domain question-answering (ODQA) tasks as a means to compensate for the +parametric knowledge of large language models (LLMs). While previous approaches +focused on processing retrieved passages to remove irrelevant context, they +still rely heavily on the quality of retrieved passages which can degrade if +the question is ambiguous or complex. In this paper, we propose a simple yet +efficient method called question and passage augmentation (QPaug) via LLMs for +open-domain QA. QPaug first decomposes the original questions into +multiple-step sub-questions. By augmenting the original question with detailed +sub-questions and planning, we are able to make the query more specific on what +needs to be retrieved, improving the retrieval performance. In addition, to +compensate for the case where the retrieved passages contain distracting +information or divided opinions, we augment the retrieved passages with +self-generated passages by LLMs to guide the answer extraction. Experimental +results show that QPaug outperforms the previous state-of-the-art and achieves +significant performance gain over existing RAG methods. The source code is +available at \url{https://github.com/kmswin1/QPaug}. + +
+
+ comment: The 2024 Conference on Empirical Methods in Natural Language + Processing (EMNLP), Findings +
+
+
+
+
+ + ♻ ☆ Schrodinger's Memory: Large Language Models + + +
+ Memory is the foundation of all human activities; without memory, it would be +nearly impossible for people to perform any task in daily life. With the +development of Large Language Models (LLMs), their language capabilities are +becoming increasingly comparable to those of humans. But do LLMs have memory? +Based on current performance, LLMs do appear to exhibit memory. So, what is the +underlying mechanism of this memory? Previous research has lacked a deep +exploration of LLMs' memory capabilities and the underlying theory. In this +paper, we use Universal Approximation Theorem (UAT) to explain the memory +mechanism in LLMs. We also conduct experiments to verify the memory +capabilities of various LLMs, proposing a new method to assess their abilities +based on these memory ability. We argue that LLM memory operates like +Schr\"odinger's memory, meaning that it only becomes observable when a specific +memory is queried. We can only determine if the model retains a memory based on +its output in response to the query; otherwise, it remains indeterminate. +Finally, we expand on this concept by comparing the memory capabilities of the +human brain and LLMs, highlighting the similarities and differences in their +operational mechanisms. + +
+
+
+
+
+ + ♻ ☆ HyperBERT: Mixing Hypergraph-Aware Layers with Language Models for Node + Classification on Text-Attributed Hypergraphs EMNLP 2024 + + +
+ Hypergraphs are characterized by complex topological structure, representing +higher-order interactions among multiple entities through hyperedges. Lately, +hypergraph-based deep learning methods to learn informative data +representations for the problem of node classification on text-attributed +hypergraphs have garnered increasing research attention. However, existing +methods struggle to simultaneously capture the full extent of hypergraph +structural information and the rich linguistic attributes inherent in the nodes +attributes, which largely hampers their effectiveness and generalizability. To +overcome these challenges, we explore ways to further augment a pretrained BERT +model with specialized hypergraph-aware layers for the task of node +classification. Such layers introduce higher-order structural inductive bias +into the language model, thus improving the model's capacity to harness both +higher-order context information from the hypergraph structure and semantic +information present in text. In this paper, we propose a new architecture, +HyperBERT, a mixed text-hypergraph model which simultaneously models hypergraph +relational structure while maintaining the high-quality text encoding +capabilities of a pre-trained BERT. Notably, HyperBERT presents results that +achieve a new state-of-the-art on five challenging text-attributed hypergraph +node classification benchmarks. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Large Language Models are Limited in Out-of-Context Knowledge Reasoning + + +
+ Large Language Models (LLMs) possess extensive knowledge and strong +capabilities in performing in-context reasoning. However, previous work +challenges their out-of-context reasoning ability, i.e., the ability to infer +information from their training data, instead of from the context or prompt. +This paper focuses on a significant aspect of out-of-context reasoning: +Out-of-Context Knowledge Reasoning (OCKR), which is to combine multiple +knowledge to infer new knowledge. We designed a synthetic dataset with seven +representative OCKR tasks to systematically assess the OCKR capabilities of +LLMs. Using this dataset, we evaluated several LLMs and discovered that their +proficiency in this aspect is limited, regardless of whether the knowledge is +trained in a separate or adjacent training settings. Moreover, training the +model to reason with reasoning examples does not result in significant +improvement, while training the model to perform explicit knowledge retrieval +helps for retrieving attribute knowledge but not the relation knowledge, +indicating that the model's limited OCKR capabilities are due to difficulties +in knowledge retrieval. Furthermore, we treat cross-lingual knowledge transfer +as a distinct form of OCKR, and evaluate this ability. Our results show that +the evaluated model also exhibits limited ability in transferring knowledge +across languages. + +
+
+
+
+
+ + ♻ ☆ MATHWELL: Generating Educational Math Word Problems Using Teacher + Annotations EMNLP 2024 + + +
+ Math word problems are critical K-8 educational tools, but writing them is +time consuming and requires extensive expertise. To be educational, problems +must be solvable, have accurate answers, and, most importantly, be +educationally appropriate. We propose that language models have potential to +support K-8 math education by automatically generating word problems. However, +evaluating educational appropriateness is hard to quantify. We fill this gap by +having teachers evaluate problems generated by LLMs, who find existing models +and data often fail to be educationally appropriate. We then explore +automatically generating educational word problems, ultimately using our expert +annotations to finetune a 70B language model. Our model, MATHWELL, is the first +K-8 word problem generator targeted at educational appropriateness. Further +expert studies find MATHWELL generates problems far more solvable, accurate, +and appropriate than public models. MATHWELL also matches GPT-4's problem +quality while attaining more appropriate reading levels for K-8 students and +avoiding generating harmful questions. + +
+
+ comment: 24 pages, 10 figures Accepted to EMNLP 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Rethinking Emotion Bias in Music via Frechet Audio Distance + + +
+ The subjective nature of music emotion introduces inherent bias in both +recognition and generation, especially when relying on a single audio encoder, +emotion classifier, or evaluation metric. In this work, we conduct a study on +Music Emotion Recognition (MER) and Emotional Music Generation (EMG), employing +diverse audio encoders alongside the Frechet Audio Distance (FAD), a +reference-free evaluation metric. Our study begins with a benchmark evaluation +of MER, highlighting the limitations associated with using a single audio +encoder and the disparities observed across different measurements. We then +propose assessing MER performance using FAD from multiple encoders to provide a +more objective measure of music emotion. Furthermore, we introduce an enhanced +EMG approach designed to improve both the variation and prominence of generated +music emotion, thus enhancing realism. Additionally, we investigate the realism +disparities between the emotions conveyed in real and synthetic music, +comparing our EMG model against two baseline models. Experimental results +underscore the emotion bias problem in both MER and EMG and demonstrate the +potential of using FAD and diverse audio encoders to evaluate music emotion +objectively. + +
+
+
+
+
+ + ♻ ☆ 2D or not 2D: How Does the Dimensionality of Gesture Representation + Affect 3D Co-Speech Gesture Generation? + + +
+ Co-speech gestures are fundamental for communication. The advent of recent +deep learning techniques has facilitated the creation of lifelike, synchronous +co-speech gestures for Embodied Conversational Agents. "In-the-wild" datasets, +aggregating video content from platforms like YouTube via human pose detection +technologies, provide a feasible solution by offering 2D skeletal sequences +aligned with speech. Concurrent developments in lifting models enable the +conversion of these 2D sequences into 3D gesture databases. However, it is +important to note that the 3D poses estimated from the 2D extracted poses are, +in essence, approximations of the ground-truth, which remains in the 2D domain. +This distinction raises questions about the impact of gesture representation +dimensionality on the quality of generated motions - a topic that, to our +knowledge, remains largely unexplored. Our study examines the effect of using +either 2D or 3D joint coordinates as training data on the performance of +speech-to-gesture deep generative models. We employ a lifting model for +converting generated 2D pose sequences into 3D and assess how gestures created +directly in 3D stack up against those initially generated in 2D and then +converted to 3D. We perform an objective evaluation using widely used metrics +in the gesture generation field as well as a user study to qualitatively +evaluate the different approaches. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.15111 +
+
+
+
+
+ + ♻ ☆ HW-TSC's Submission to the CCMT 2024 Machine Translation Tasks + + +
+ This paper presents the submission of Huawei Translation Services Center +(HW-TSC) to machine translation tasks of the 20th China Conference on Machine +Translation (CCMT 2024). We participate in the bilingual machine translation +task and multi-domain machine translation task. For these two translation +tasks, we use training strategies such as regularized dropout, bidirectional +training, data diversification, forward translation, back translation, +alternated training, curriculum learning, and transductive ensemble learning to +train neural machine translation (NMT) models based on the deep Transformer-big +architecture. Furthermore, to explore whether large language model (LLM) can +help improve the translation quality of NMT systems, we use supervised +fine-tuning to train llama2-13b as an Automatic post-editing (APE) model to +improve the translation results of the NMT model on the multi-domain machine +translation task. By using these plyometric strategies, our submission achieves +a competitive result in the final evaluation. + +
+
+ comment: 14 pages, 2 figures, 6 Tables, CCMT2024. arXiv admin note: + substantial text overlap with arXiv:2409.14800 +
+
+
+
+
+ + ♻ ☆ More Effective LLM Compressed Tokens with Uniformly Spread Position + Identifiers and Compression Loss + + +
+ Compressing Transformer inputs into compressd tokens allows running LLMs with +improved speed and cost efficiency. Based on the compression method ICAE, we +carefully examine the position identifier choices for compressed tokens and +also propose a new compression loss. We demonstrate empirically that our +proposed methods achieve significantly higher compression ratios (15x compared +to 4x for ICAE), while being able to attain comparable reconstruction +performance. + +
+
+
+
+
+ + ♻ ☆ Fishing for Magikarp: Automatically Detecting Under-trained Tokens in + Large Language Models EMNLP 2024 + + +
+ The disconnect between tokenizer creation and model training in language +models allows for specific inputs, such as the infamous SolidGoldMagikarp +token, to induce unwanted model behaviour. Although such `glitch tokens', +tokens present in the tokenizer vocabulary but that are nearly or entirely +absent during model training, have been observed across various models, a +reliable method to identify and address them has been missing. We present a +comprehensive analysis of Large Language Model tokenizers, specifically +targeting this issue of detecting under-trained tokens. Through a combination +of tokenizer analysis, model weight-based indicators, and prompting techniques, +we develop novel and effective methods for automatically detecting these +problematic tokens. Our findings demonstrate the prevalence of such tokens +across a diverse set of models and provide insights into improving the +efficiency and safety of language models. + +
+
+ comment: 16 pages, 6 figures. Accepted at EMNLP 2024, main track. For + associated code, see https://github.com/cohere-ai/magikarp/ +
+
+
+
+
+ + ♻ ☆ Multimodal Shannon Game with Images + + +
+ The Shannon game has long been used as a thought experiment in linguistics +and NLP, asking participants to guess the next letter in a sentence based on +its preceding context. We extend the game by introducing an optional extra +modality in the form of image information. To investigate the impact of +multimodal information in this game, we use human participants and a language +model (LM, GPT-2). + We show that the addition of image information improves both self-reported +confidence and accuracy for both humans and LM. Certain word classes, such as +nouns and determiners, benefit more from the additional modality information. +The priming effect in both humans and the LM becomes more apparent as the +context size (extra modality information + sentence context) increases. These +findings highlight the potential of multimodal information in improving +language understanding and modeling. + +
+
+
+
+
+ + ♻ ☆ Make Large Language Model a Better Ranker + + +
+ Large Language Models (LLMs) demonstrate robust capabilities across various +fields, leading to a paradigm shift in LLM-enhanced Recommender System (RS). +Research to date focuses on point-wise and pair-wise recommendation paradigms, +which are inefficient for LLM-based recommenders due to high computational +costs. However, existing list-wise approaches also fall short in ranking tasks +due to misalignment between ranking objectives and next-token prediction. +Moreover, these LLM-based methods struggle to effectively address the order +relation among candidates, particularly given the scale of ratings. To address +these challenges, this paper introduces the large language model framework with +Aligned Listwise Ranking Objectives (ALRO). ALRO is designed to bridge the gap +between the capabilities of LLMs and the nuanced requirements of ranking tasks. +Specifically, ALRO employs explicit feedback in a listwise manner by +introducing soft lambda loss, a customized adaptation of lambda loss designed +for optimizing order relations. This mechanism provides more accurate +optimization goals, enhancing the ranking process. Additionally, ALRO +incorporates a permutation-sensitive learning mechanism that addresses position +bias, a prevalent issue in generative models, without imposing additional +computational burdens during inference. Our evaluative studies reveal that ALRO +outperforms both existing embedding-based recommendation methods and LLM-based +recommendation baselines. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ DICTDIS: Dictionary Constrained Disambiguation for Improved NMT EMNLP + + +
+ Domain-specific neural machine translation (NMT) systems (e.g., in +educational applications) are socially significant with the potential to help +make information accessible to a diverse set of users in multilingual +societies. It is desirable that such NMT systems be lexically constrained and +draw from domain-specific dictionaries. Dictionaries could present multiple +candidate translations for a source word/phrase due to the polysemous nature of +words. The onus is then on the NMT model to choose the contextually most +appropriate candidate. Prior work has largely ignored this problem and focused +on the single candidate constraint setting wherein the target word or phrase is +replaced by a single constraint. In this work we present DictDis, a lexically +constrained NMT system that disambiguates between multiple candidate +translations derived from dictionaries. We achieve this by augmenting training +data with multiple dictionary candidates to actively encourage disambiguation +during training by implicitly aligning multiple candidate constraints. We +demonstrate the utility of DictDis via extensive experiments on English-Hindi +and English-German sentences in a variety of domains including regulatory, +finance, engineering. We also present comparisons on standard benchmark test +datasets. In comparison with existing approaches for lexically constrained and +unconstrained NMT, we demonstrate superior performance with respect to +constraint copy and disambiguation related measures on all domains while also +obtaining improved fluency of up to 2-3 BLEU points on some domains. + +
+
+ comment: In Findings of EMNLP, 2024 +
+
+
+
+
+ + ♻ ☆ SEER: Facilitating Structured Reasoning and Explanation via + Reinforcement Learning ACL 2024 + + +
+ Elucidating the reasoning process with structured explanations from question +to answer is crucial, as it significantly enhances the interpretability, +traceability, and trustworthiness of question-answering (QA) systems. However, +structured explanations demand models to perform intricately structured +reasoning, which poses great challenges. Most existing methods focus on +single-step reasoning through supervised learning, ignoring logical +dependencies between steps. Moreover, existing reinforcement learning (RL) +based methods overlook the structured relationships, underutilizing the +potential of RL in structured reasoning. In this paper, we propose SEER, a +novel method that maximizes a structure-based return to facilitate structured +reasoning and explanation. Our proposed structure-based return precisely +describes the hierarchical and branching structure inherent in structured +reasoning, effectively capturing the intricate relationships between different +reasoning steps. In addition, we introduce a fine-grained reward function to +meticulously delineate diverse reasoning steps. Extensive experiments show that +SEER significantly outperforms state-of-the-art methods, achieving an absolute +improvement of 6.9% over RL-based methods on EntailmentBank, a 4.4% average +improvement on STREET benchmark, and exhibiting outstanding efficiency and +cross-dataset generalization performance. Our code is available at +https://github.com/Chen-GX/SEER. + +
+
+ comment: Camera ready version for ACL 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Think Twice Before Trusting: Self-Detection for Large Language Models + through Comprehensive Answer Reflection EMNLP + + +
+ Self-detection for Large Language Models (LLMs) seeks to evaluate the +trustworthiness of the LLM's output by leveraging its own capabilities, thereby +alleviating the issue of output hallucination. However, existing self-detection +approaches only retrospectively evaluate answers generated by LLM, typically +leading to the over-trust in incorrectly generated answers. To tackle this +limitation, we propose a novel self-detection paradigm that considers the +comprehensive answer space beyond LLM-generated answers. It thoroughly compares +the trustworthiness of multiple candidate answers to mitigate the over-trust in +LLM-generated incorrect answers. Building upon this paradigm, we introduce a +two-step framework, which firstly instructs LLM to reflect and provide +justifications for each candidate answer, and then aggregates the +justifications for comprehensive target answer evaluation. This framework can +be seamlessly integrated with existing approaches for superior self-detection. +Extensive experiments on six datasets spanning three tasks demonstrate the +effectiveness of the proposed framework. + +
+
+ comment: EMNLP findings 2024 +
+
+
+
+
+ + ♻ ☆ AlphaMath Almost Zero: Process Supervision without Process NeurIPS 2024 + + +
+ Although recent advancements in large language models (LLMs) have +significantly improved their performance on various tasks, they still face +challenges with complex and symbolic multi-step reasoning, particularly in +mathematical reasoning. To bolster the mathematical reasoning capabilities of +LLMs, most existing efforts concentrate on seeking assistance from either +domain experts or GPT-4 for high-quality process-supervised data, which is not +only expensive but also labor-intensive. In our study, we propose an innovative +framework, AlphaMath, that bypasses the need for process annotations (from +humans or GPTs) by leveraging Monte Carlo Tree Search (MCTS). This framework +focuses on unleashing the potential of a well-pretrained LLM to autonomously +enhance its mathematical reasoning. Specifically, we integrate a value model +with the LLM, automatically generating both process supervision and step-level +evaluation signals in MCTS. Furthermore, we propose an efficient inference +strategy, step-level beam search, where the value model is crafted to assist +the policy model (i.e., LLM) in navigating more effective reasoning paths, +rather than solely relying on prior probabilities. The experimental results on +both in-domain and out-of-domain datasets demonstrate that even without GPT-4 +or human-annotated process supervision, our AlphaMath framework achieves +comparable or superior results to previous state-of-the-art methods. + +
+
+ comment: Camera ready version for NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Step-level Value Preference Optimization for Mathematical Reasoning EMNLP2024 + + +
+ Direct Preference Optimization (DPO) using an implicit reward model has +proven to be an effective alternative to reinforcement learning from human +feedback (RLHF) for fine-tuning preference aligned large language models +(LLMs). However, the overall preference annotations of responses do not fully +capture the fine-grained quality of model outputs in complex multi-step +reasoning tasks, such as mathematical reasoning. To address this limitation, we +introduce a novel algorithm called Step-level Value Preference Optimization +(SVPO). Our approach employs Monte Carlo Tree Search (MCTS) to automatically +annotate step-level preferences for multi-step reasoning. Furthermore, from the +perspective of learning-to-rank, we train an explicit value model to replicate +the behavior of the implicit reward model, complementing standard preference +optimization. This value model enables the LLM to generate higher reward +responses with minimal cost during inference. Experimental results demonstrate +that our method achieves state-of-the-art performance on both in-domain and +out-of-domain mathematical reasoning benchmarks. Our code is available at +\url{https://github.com/MARIO-Math-Reasoning/Super_MARIO}. + +
+
+ comment: Camera ready version for EMNLP2024-Findings +
+
+
+
+
+ + ♻ ☆ Self-Evaluation of Large Language Model based on Glass-box Features EMNLP2024 + + +
+ The proliferation of open-source Large Language Models (LLMs) underscores the +pressing need for evaluation methods. Existing works primarily rely on external +evaluators, focusing on training and prompting strategies. However, a crucial +aspect, model-aware glass-box features, is overlooked. In this study, we +explore the utility of glass-box features under the scenario of +self-evaluation, namely applying an LLM to evaluate its own output. We +investigate various glass-box feature groups and discovered that the softmax +distribution serves as a reliable quality indicator for self-evaluation. +Experimental results on public benchmarks validate the feasibility of +self-evaluation of LLMs using glass-box features. + +
+
+ comment: accepted as Findings of EMNLP2024 +
+
+
+
+
+ + ♻ ☆ PromptKD: Distilling Student-Friendly Knowledge for Generative Language + Models via Prompt Tuning EMNLP 2024 + + +
+ Recent advancements in large language models (LLMs) have raised concerns +about inference costs, increasing the need for research into model compression. +While knowledge distillation (KD) is a prominent method for this, research on +KD for generative language models like LLMs is relatively sparse, and the +approach of distilling student-friendly knowledge, which has shown promising +performance in KD for classification models, remains unexplored in generative +language models. To explore this approach, we propose PromptKD, a simple yet +effective method that utilizes prompt tuning - for the first time in KD - to +enable generative language models to transfer student-friendly knowledge. +Unlike previous works in classification that require fine-tuning the entire +teacher model for extracting student-friendly knowledge, PromptKD achieves +similar effects by adding a small number of prompt tokens and tuning only the +prompt with student guidance. Extensive experiments on instruction-following +datasets show that PromptKD achieves state-of-the-art performance while adding +only 0.0007% of the teacher's parameters as prompts. Further analysis suggests +that distilling student-friendly knowledge alleviates exposure bias effectively +throughout the entire training process, leading to performance enhancements. + +
+
+ comment: EMNLP 2024 Findings. Our project page: https://promptkd.github.io +
+
+
+
+
+ + ♻ ☆ OWL: A Large Language Model for IT Operations ICLR 2024 + + +
+ With the rapid development of IT operations, it has become increasingly +crucial to efficiently manage and analyze large volumes of data for practical +applications. The techniques of Natural Language Processing (NLP) have shown +remarkable capabilities for various tasks, including named entity recognition, +machine translation and dialogue systems. Recently, Large Language Models +(LLMs) have achieved significant improvements across various NLP downstream +tasks. However, there is a lack of specialized LLMs for IT operations. In this +paper, we introduce the OWL, a large language model trained on our collected +OWL-Instruct dataset with a wide range of IT-related information, where the +mixture-of-adapter strategy is proposed to improve the parameter-efficient +tuning across different domains or tasks. Furthermore, we evaluate the +performance of our OWL on the OWL-Bench established by us and open IT-related +benchmarks. OWL demonstrates superior performance results on IT tasks, which +outperforms existing models by significant margins. Moreover, we hope that the +findings of our work will provide more insights to revolutionize the techniques +of IT operations with specialized LLMs. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ SKT5SciSumm -- Revisiting Extractive-Generative Approach for + Multi-Document Scientific Summarization + + +
+ Summarization for scientific text has shown significant benefits both for the +research community and human society. Given the fact that the nature of +scientific text is distinctive and the input of the multi-document +summarization task is substantially long, the task requires sufficient +embedding generation and text truncation without losing important information. +To tackle these issues, in this paper, we propose SKT5SciSumm - a hybrid +framework for multi-document scientific summarization (MDSS). We leverage the +Sentence-Transformer version of Scientific Paper Embeddings using +Citation-Informed Transformers (SPECTER) to encode and represent textual +sentences, allowing for efficient extractive summarization using k-means +clustering. We employ the T5 family of models to generate abstractive summaries +using extracted sentences. SKT5SciSumm achieves state-of-the-art performance on +the Multi-XScience dataset. Through extensive experiments and evaluation, we +showcase the benefits of our model by using less complicated models to achieve +remarkable results, thereby highlighting its potential in advancing the field +of multi-document summarization for scientific text. + +
+
+
+
+
+ + ♻ ☆ Continual Learning Optimizations for Auto-regressive Decoder of + Multilingual ASR systems + + +
+ Continual Learning (CL) involves fine-tuning pre-trained models with new data +while maintaining the performance on the pre-trained data. This is particularly +relevant for expanding multilingual ASR (MASR) capabilities. However, existing +CL methods, mainly designed for computer vision and reinforcement learning +tasks, often yield sub-optimal results when directly applied to MASR. We +hypothesise that this is because CL of the auto-regressive decoder in the MASR +model is difficult. To verify this, we propose four optimizations on the +decoder. They include decoder-layer gradient surgery, freezing unused token +embeddings, suppressing output of newly added tokens, and learning rate +re-scaling. Our experiments on adapting Whisper to 10 unseen languages from the +Common Voice dataset demonstrate that these optimizations reduce the Average +Word Error Rate (AWER) of pretrained languages from 14.2% to 12.4% compared +with Experience Replay, without compromising the AWER of new languages. + +
+
+ comment: Proceedings of Interspeech +
+
+
+
+
+ + ♻ ☆ "Global is Good, Local is Bad?": Understanding Brand Bias in LLMs EMNLP-2024 + + +
+ Many recent studies have investigated social biases in LLMs but brand bias +has received little attention. This research examines the biases exhibited by +LLMs towards different brands, a significant concern given the widespread use +of LLMs in affected use cases such as product recommendation and market +analysis. Biased models may perpetuate societal inequalities, unfairly favoring +established global brands while marginalizing local ones. Using a curated +dataset across four brand categories, we probe the behavior of LLMs in this +space. We find a consistent pattern of bias in this space -- both in terms of +disproportionately associating global brands with positive attributes and +disproportionately recommending luxury gifts for individuals in high-income +countries. We also find LLMs are subject to country-of-origin effects which may +boost local brand preference in LLM outputs in specific contexts. + +
+
+ comment: Accepted at EMNLP-2024 (main) +
+
+
+
+
+ + ♻ ☆ Towards Efficient Methods in Medical Question Answering using Knowledge + Graph Embeddings + + +
+ In Natural Language Processing (NLP), Machine Reading Comprehension (MRC) is +the task of answering a question based on a given context. To handle questions +in the medical domain, modern language models such as BioBERT, SciBERT and even +ChatGPT are trained on vast amounts of in-domain medical corpora. However, +in-domain pre-training is expensive in terms of time and resources. In this +paper, we propose a resource-efficient approach for injecting domain knowledge +into a model without relying on such domain-specific pre-training. + Knowledge graphs are powerful resources for accessing medical information. +Building on existing work, we introduce a method using Multi-Layer Perceptrons +(MLPs) for aligning and integrating embeddings extracted from medical knowledge +graphs with the embedding spaces of pre-trained language models (LMs). The +aligned embeddings are fused with open-domain LMs BERT and RoBERTa that are +fine-tuned for two MRC tasks, span detection (COVID-QA) and multiple-choice +questions (PubMedQA). We compare our method to prior techniques that rely on a +vocabulary overlap for embedding alignment and show how our method circumvents +this requirement to deliver better performance. On both datasets, our method +allows BERT/RoBERTa to either perform on par (occasionally exceeding) with +stronger domain-specific models or show improvements in general over prior +techniques. With the proposed approach, we signal an alternative method to +in-domain pre-training to achieve domain proficiency. + +
+
+
+
+
+ + ♻ ☆ LongLaMP: A Benchmark for Personalized Long-form Text Generation EMNLP + + +
+ Long-text generation is seemingly ubiquitous in real-world applications of +large language models such as generating an email or writing a review. Despite +the fundamental importance and prevalence of long-text generation in many +practical applications, existing work on personalized generation has focused on +the generation of very short text. To overcome these limitations, we study the +problem of personalized long-text generation, that is, generating long-text +that is personalized for a specific user while being practically useful for the +vast majority of real-world applications that naturally require the generation +of longer text. In this work, we demonstrate the importance of user-specific +personalization for long-text generation tasks and develop the Long-text +Language Model Personalization (LongLaMP) Benchmark. LongLaMP provides a +comprehensive and diverse evaluation framework for personalized long-text +generation. Extensive experiments on LongLaMP for zero-shot and fine-tuned +language tasks demonstrate the effectiveness of the proposed benchmark and its +utility for developing and evaluating techniques for personalized long-text +generation across a wide variety of long-text generation tasks. The results +highlight the importance of personalization across a wide variety of long-text +generation tasks. Finally, we release the benchmark for others to use for this +important problem. + +
+
+ comment: 9 pages, 4 figures, 20 tables(including appendix) submitted to EMNLP +
+
+
+
+
+ + ♻ ☆ A Survey on In-context Learning + + +
+ With the increasing capabilities of large language models (LLMs), in-context +learning (ICL) has emerged as a new paradigm for natural language processing +(NLP), where LLMs make predictions based on contexts augmented with a few +examples. It has been a significant trend to explore ICL to evaluate and +extrapolate the ability of LLMs. In this paper, we aim to survey and summarize +the progress and challenges of ICL. We first present a formal definition of ICL +and clarify its correlation to related studies. Then, we organize and discuss +advanced techniques, including training strategies, prompt designing +strategies, and related analysis. Additionally, we explore various ICL +application scenarios, such as data engineering and knowledge updating. +Finally, we address the challenges of ICL and suggest potential directions for +further research. We hope that our work can encourage more research on +uncovering how ICL works and improving ICL. + +
+
+ comment: Update +
+
+
+
+
+
+
+
+ + General Literature 1 + +
+
+
+ + ♻ ☆ Automation and AI Technology in Surface Mining With a Brief Introduction + to Open-Pit Operations in the Pilbara ICRA + + +
+ This survey article provides a synopsis on some of the engineering problems, +technological innovations, robotic development and automation efforts +encountered in the mining industry -- particularly in the Pilbara iron-ore +region of Western Australia. The goal is to paint the technology landscape and +highlight issues relevant to an engineering audience to raise awareness of AI +and automation trends in mining. It assumes the reader has no prior knowledge +of mining and builds context gradually through focused discussion and short +summaries of common open-pit mining operations. The principal activities that +take place may be categorized in terms of resource development, mine-, rail- +and port operations. From mineral exploration to ore shipment, there are +roughly nine steps in between. These include: geological assessment, mine +planning and development, production drilling and assaying, blasting and +excavation, transportation of ore and waste, crush and screen, stockpile and +load-out, rail network distribution, and ore-car dumping. The objective is to +describe these processes and provide insights on some of the +challenges/opportunities from the perspective of a decade-long +industry-university R&D partnership. + +
+
+ comment: Accepted manuscript. Paper provides insights on state-of-the-art + technologies and future trends. Keywords: Mining automation, robotics, + intelligent systems, machine learning, remote sensing, geostatistics, + planning, scheduling, optimization, modelling, geology, complex systems. + Document: 21 pages, 6 figures, 2 tables. 2024 Update: Added ICRA conference + poster + slides as ancilliary files +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 117 + +
+
+
+ + ☆ Open-World Evaluation for Retrieving Diverse Perspectives + + +
+ We study retrieving a set of documents that covers various perspectives on a +complex and contentious question (e.g., will ChatGPT do more harm than good?). +We curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS), +where each example consists of a question and diverse perspectives associated +with the question, sourced from survey questions and debate websites. On this +data, retrievers paired with a corpus are evaluated to surface a document set +that contains diverse perspectives. Our framing diverges from most retrieval +tasks in that document relevancy cannot be decided by simple string matches to +references. Instead, we build a language model based automatic evaluator that +decides whether each retrieved document contains a perspective. This allows us +to evaluate the performance of three different types of corpus (Wikipedia, web +snapshot, and corpus constructed on the fly with retrieved pages from the +search engine) paired with retrievers. Retrieving diverse documents remains +challenging, with the outputs from existing retrievers covering all +perspectives on only 33.74% of the examples. We further study the impact of +query expansion and diversity-focused reranking approaches and analyze +retriever sycophancy. Together, we lay the foundation for future studies in +retrieval diversity handling complex queries. + +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Unveiling the Role of Pretraining in Direct Speech Translation EMNLP 2024 + + +
+ Direct speech-to-text translation systems encounter an important drawback in +data scarcity. A common solution consists on pretraining the encoder on +automatic speech recognition, hence losing efficiency in the training process. +In this study, we compare the training dynamics of a system using a pretrained +encoder, the conventional approach, and one trained from scratch. We observe +that, throughout the training, the randomly initialized model struggles to +incorporate information from the speech inputs for its predictions. Hence, we +hypothesize that this issue stems from the difficulty of effectively training +an encoder for direct speech translation. While a model trained from scratch +needs to learn acoustic and semantic modeling simultaneously, a pretrained one +can just focus on the latter. Based on these findings, we propose a subtle +change in the decoder cross-attention to integrate source information from +earlier steps in training. We show that with this change, the model trained +from scratch can achieve comparable performance to the pretrained one, while +reducing the training time. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ EMOVA: Empowering Language Models to See, Hear and Speak with Vivid + Emotions + + +
+ GPT-4o, an omni-modal model that enables vocal conversations with diverse +emotions and tones, marks a milestone for omni-modal foundation models. +However, empowering Large Language Models to perceive and generate images, +texts, and speeches end-to-end with publicly available data remains challenging +in the open-source community. Existing vision-language models rely on external +tools for the speech processing, while speech-language models still suffer from +limited or even without vision-understanding abilities. To address this gap, we +propose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large +Language Models with end-to-end speech capabilities while maintaining the +leading vision-language performance. With a semantic-acoustic disentangled +speech tokenizer, we notice surprisingly that omni-modal alignment can further +enhance vision-language and speech abilities compared with the corresponding +bi-modal aligned counterparts. Moreover, a lightweight style module is proposed +for flexible speech style controls (e.g., emotions and pitches). For the first +time, EMOVA achieves state-of-the-art performance on both the vision-language +and speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue +with vivid emotions. + +
+
+ comment: Project Page: https://emova-ollm.github.io/ +
+
+
+
+
+ + ☆ Automated Detection and Analysis of Power Words in Persuasive Text Using + Natural Language Processing + + +
+ Power words are terms that evoke strong emotional responses and significantly +influence readers' behavior, playing a crucial role in fields like marketing, +politics, and motivational writing. This study proposes a methodology for the +automated detection and analysis of power words in persuasive text using a +custom lexicon and the TextBlob library in Python. By identifying the presence +and frequency of power words within a given text, we aim to classify and +analyze their impact on sentiment and reader engagement. This research examines +diverse datasets across various domains to provide insights into the +effectiveness of power words, offering practical applications for content +creators, advertisers, and policymakers. + +
+
+
+
+
+ + ☆ Compositional Hardness of Code in Large Language Models -- A + Probabilistic Perspective + + +
+ A common practice in large language model (LLM) usage for complex analytical +tasks such as code generation, is to sample a solution for the entire task +within the model's context window. Previous works have shown that subtask +decomposition within the model's context (chain of thought), is beneficial for +solving such tasks. In this work, we point a limitation of LLMs' ability to +perform several sub-tasks within the same context window - an in-context +hardness of composition, pointing to an advantage for distributing a decomposed +problem in a multi-agent system of LLMs. The hardness of composition is +quantified by a generation complexity metric, i.e., the number of LLM +generations required to sample at least one correct solution. We find a gap +between the generation complexity of solving a compositional problem within the +same context relative to distributing it among multiple agents, that increases +exponentially with the solution's length. We prove our results theoretically +and demonstrate them empirically. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ DARE: Diverse Visual Question Answering with Robustness Evaluation + + +
+ Vision Language Models (VLMs) extend remarkable capabilities of text-only +large language models and vision-only models, and are able to learn from and +process multi-modal vision-text input. While modern VLMs perform well on a +number of standard image classification and image-text matching tasks, they +still struggle with a number of crucial vision-language (VL) reasoning +abilities such as counting and spatial reasoning. Moreover, while they might be +very brittle to small variations in instructions and/or evaluation protocols, +existing benchmarks fail to evaluate their robustness (or rather the lack of +it). In order to couple challenging VL scenarios with comprehensive robustness +evaluation, we introduce DARE, Diverse Visual Question Answering with +Robustness Evaluation, a carefully created and curated multiple-choice VQA +benchmark. DARE evaluates VLM performance on five diverse categories and +includes four robustness-oriented evaluations based on the variations of: +prompts, the subsets of answer options, the output format and the number of +correct answers. Among a spectrum of other findings, we report that +state-of-the-art VLMs still struggle with questions in most categories and are +unable to consistently deliver their peak performance across the tested +robustness evaluations. The worst case performance across the subsets of +options is up to 34% below the performance in the standard case. The robustness +of the open-source VLMs such as LLaVA 1.6 and Idefics2 cannot match the +closed-source models such as GPT-4 and Gemini, but even the latter remain very +brittle to different variations. + +
+
+
+
+
+ + ☆ Multilingual Evaluation of Long Context Retrieval and Reasoning + + +
+ Recent large language models (LLMs) demonstrate impressive capabilities in +handling long contexts, some exhibiting near-perfect recall on synthetic +retrieval tasks. However, these evaluations have mainly focused on English text +and involved a single target sentence within lengthy contexts. Our work +investigates how LLM performance generalizes to multilingual settings with +multiple hidden target sentences. We comprehensively evaluate several +long-context LLMs on retrieval and reasoning tasks across five languages: +English, Vietnamese, Indonesian, Swahili, and Somali. These languages share the +Latin script but belong to distinct language families and resource levels. Our +analysis reveals a significant performance gap between languages. The +best-performing models such as Gemini-1.5 and GPT-4o, achieve around 96% +accuracy in English to around 36% in Somali with a single target sentence. +However, this accuracy drops to 40% in English and 0% in Somali when dealing +with three target sentences. Our findings highlight the challenges long-context +LLMs face when processing longer contexts, an increase in the number of target +sentences, or languages of lower resource levels. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Extracting Affect Aggregates from Longitudinal Social Media Data with + Temporal Adapters for Large Language Models + + +
+ This paper proposes temporally aligned Large Language Models (LLMs) as a tool +for longitudinal analysis of social media data. We fine-tune Temporal Adapters +for Llama 3 8B on full timelines from a panel of British Twitter users, and +extract longitudinal aggregates of emotions and attitudes with established +questionnaires. We validate our estimates against representative British survey +data and find strong positive, significant correlations for several collective +emotions. The obtained estimates are robust across multiple training seeds and +prompt formulations, and in line with collective emotions extracted using a +traditional classification model trained on labeled data. To the best of our +knowledge, this is the first work to extend the analysis of affect in LLMs to a +longitudinal setting through Temporal Adapters. Our work enables new approaches +towards the longitudinal analysis of social media data. + +
+
+ comment: Code available at https://github.com/dess-mannheim/temporal-adapters +
+
+
+
+
+ + ☆ BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and + Adaptive Disambiguate based Efficient Tree Search + + +
+ Large Language Models (LLMs) have exhibited exceptional performance across a +broad range of tasks and domains. However, they still encounter difficulties in +solving mathematical problems due to the rigorous and logical nature of +mathematics. Previous studies have employed techniques such as supervised +fine-tuning (SFT), prompt engineering, and search-based methods to improve the +mathematical problem-solving abilities of LLMs. Despite these efforts, their +performance remains suboptimal and demands substantial computational resources. +To address this issue, we propose a novel approach, BEATS, to enhance +mathematical problem-solving abilities. Our method leverages newly designed +prompts that guide the model to iteratively rewrite, advance by one step, and +generate answers based on previous steps. Additionally, we introduce a new +back-verification technique that uses LLMs to validate the correctness of the +generated answers. Furthermore, we employ a pruning tree search to optimize +search time while achieving strong performance. Notably, our method improves +Qwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the +MATH benchmark. + +
+
+
+
+
+ + ☆ The Hard Positive Truth about Vision-Language Compositionality ECCV 2024 + + +
+ Several benchmarks have concluded that our best vision-language models (e.g., +CLIP) are lacking in compositionality. Given an image, these benchmarks probe a +model's ability to identify its associated caption amongst a set of +compositional distractors. In response, a surge of recent proposals show +improvements by finetuning CLIP with distractors as hard negatives. Our +investigations reveal that these improvements have, in fact, been significantly +overstated -- because existing benchmarks do not probe whether finetuned +vision-language models remain invariant to hard positives. By curating an +evaluation dataset with 112,382 hard negatives and hard positives, we uncover +that including hard positives decreases CLIP's performance by 12.9%, while +humans perform effortlessly at 99%. CLIP finetuned with hard negatives results +in an even larger decrease, up to 38.7%. With this finding, we then produce a +1,775,259 image-text training set with both hard negative and hard positive +captions. By training with both, we see improvements on existing benchmarks +while simultaneously improving performance on hard positives, indicating a more +robust improvement in compositionality. Our work suggests the need for future +research to rigorously test and improve CLIP's understanding of semantic +relationships between related "positive" concepts. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge + Distillation + + +
+ Despite being widely applied due to their exceptional capabilities, Large +Language Models (LLMs) have been proven to be vulnerable to backdoor attacks. +These attacks introduce targeted vulnerabilities into LLMs by poisoning +training samples and full-parameter fine-tuning. However, this kind of backdoor +attack is limited since they require significant computational resources, +especially as the size of LLMs increases. Besides, parameter-efficient +fine-tuning (PEFT) offers an alternative but the restricted parameter updating +may impede the alignment of triggers with target labels. In this study, we +first verify that backdoor attacks with PEFT may encounter challenges in +achieving feasible performance. To address these issues and improve the +effectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack +algorithm from weak to strong based on contrastive knowledge distillation +(W2SAttack). Specifically, we poison small-scale language models through +full-parameter fine-tuning to serve as the teacher model. The teacher model +then covertly transfers the backdoor to the large-scale student model through +contrastive knowledge distillation, which employs PEFT. Theoretical analysis +reveals that W2SAttack has the potential to augment the effectiveness of +backdoor attacks. We demonstrate the superior performance of W2SAttack on +classification tasks across four language models, four backdoor attack +algorithms, and two different architectures of teacher models. Experimental +results indicate success rates close to 100% for backdoor attacks targeting +PEFT. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ The Lou Dataset -- Exploring the Impact of Gender-Fair Language in + German Text Classification + + +
+ Gender-fair language, an evolving German linguistic variation, fosters +inclusion by addressing all genders or using neutral forms. Nevertheless, there +is a significant lack of resources to assess the impact of this linguistic +shift on classification using language models (LMs), which are probably not +trained on such variations. To address this gap, we present Lou, the first +dataset featuring high-quality reformulations for German text classification +covering seven tasks, like stance detection and toxicity classification. +Evaluating 16 mono- and multi-lingual LMs on Lou shows that gender-fair +language substantially impacts predictions by flipping labels, reducing +certainty, and altering attention patterns. However, existing evaluations +remain valid, as LM rankings of original and reformulated instances do not +significantly differ. While we offer initial insights on the effect on German +text classification, the findings likely apply to other languages, as +consistent patterns were observed in multi-lingual and English LMs. + +
+
+
+
+
+ + ☆ Pioneering Reliable Assessment in Text-to-Image Knowledge Editing: + Leveraging a Fine-Grained Dataset and an Innovative Criterion EMNLP24 + + +
+ During pre-training, the Text-to-Image (T2I) diffusion models encode factual +knowledge into their parameters. These parameterized facts enable realistic +image generation, but they may become obsolete over time, thereby +misrepresenting the current state of the world. Knowledge editing techniques +aim to update model knowledge in a targeted way. However, facing the dual +challenges posed by inadequate editing datasets and unreliable evaluation +criterion, the development of T2I knowledge editing encounter difficulties in +effectively generalizing injected knowledge. In this work, we design a T2I +knowledge editing framework by comprehensively spanning on three phases: First, +we curate a dataset \textbf{CAKE}, comprising paraphrase and multi-object test, +to enable more fine-grained assessment on knowledge generalization. Second, we +propose a novel criterion, \textbf{adaptive CLIP threshold}, to effectively +filter out false successful images under the current criterion and achieve +reliable editing evaluation. Finally, we introduce \textbf{MPE}, a simple but +effective approach for T2I knowledge editing. Instead of tuning parameters, MPE +precisely recognizes and edits the outdated part of the conditioning +text-prompt to accommodate the up-to-date knowledge. A straightforward +implementation of MPE (Based on in-context learning) exhibits better overall +performance than previous model editors. We hope these efforts can further +promote faithful evaluation of T2I knowledge editing methods. + +
+
+ comment: EMNLP24 Findings +
+
+
+
+
+ + ☆ Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan + Arabic Dialect + + +
+ We introduce Atlas-Chat, the first-ever collection of large language models +specifically developed for dialectal Arabic. Focusing on Moroccan Arabic, also +known as Darija, we construct our instruction dataset by consolidating existing +Darija language resources, creating novel datasets both manually and +synthetically, and translating English instructions with stringent quality +control. Atlas-Chat-9B and 2B models, fine-tuned on the dataset, exhibit +superior ability in following Darija instructions and performing standard NLP +tasks. Notably, our models outperform both state-of-the-art and +Arabic-specialized LLMs like LLaMa, Jais, and AceGPT, e.g., achieving a 13% +performance boost over a larger 13B model on DarijaMMLU, in our newly +introduced evaluation suite for Darija covering both discriminative and +generative tasks. Furthermore, we perform an experimental analysis of various +fine-tuning strategies and base model choices to determine optimal +configurations. All our resources are publicly accessible, and we believe our +work offers comprehensive design methodologies of instruction-tuning for +low-resource language variants, which are often neglected in favor of data-rich +languages by contemporary LLMs. + +
+
+
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ EMMA-500: Enhancing Massively Multilingual Adaptation of Large Language + Models + + +
+ In this work, we introduce EMMA-500, a large-scale multilingual language +model continue-trained on texts across 546 languages designed for enhanced +multilingual performance, focusing on improving language coverage for +low-resource languages. To facilitate continual pre-training, we compile the +MaLA corpus, a comprehensive multilingual dataset enriched with curated +datasets across diverse domains. Leveraging this corpus, we conduct extensive +continual pre-training of the Llama 2 7B model, resulting in EMMA-500, which +demonstrates robust performance across a wide collection of benchmarks, +including a comprehensive set of multilingual tasks and PolyWrite, an +open-ended generation benchmark developed in this study. Our results highlight +the effectiveness of continual pre-training in expanding large language models' +language capacity, particularly for underrepresented languages, demonstrating +significant gains in cross-lingual transfer, task generalization, and language +adaptability. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ PEDRO: Parameter-Efficient Fine-tuning with Prompt DEpenDent + Representation MOdification + + +
+ Due to their substantial sizes, large language models (LLMs) are typically +deployed within a single-backbone multi-tenant framework. In this setup, a +single instance of an LLM backbone must cater to multiple users or tasks +through the application of various parameter-efficient fine-tuning (PEFT) +models. Despite the availability of numerous effective PEFT techniques such as +LoRA, there remains a need for a PEFT approach that achieves both high +efficiency during inference and competitive performance on downstream tasks. In +this research, we introduce a new and straightforward PEFT methodology named +\underline{P}rompt D\underline{E}pen\underline{D}ent \underline{R}epresentation +M\underline{O}dification (PEDRO). The proposed method involves integrating a +lightweight vector generator into each Transformer layer, which generates +vectors contingent upon the input prompts. These vectors then modify the hidden +representations created by the LLM through a dot product operation, thereby +influencing the semantic output and generated content of the model. Extensive +experimentation across a variety of tasks indicates that: (a) PEDRO surpasses +recent PEFT benchmarks when using a similar number of tunable parameters. (b) +Under the single-backbone multi-tenant deployment model, PEDRO exhibits +superior efficiency compared to LoRA, indicating significant industrial +potential. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2405.18203 +
+
+
+
+
+ + ☆ BeanCounter: A low-toxicity, large-scale, and open dataset of + business-oriented text + + +
+ Many of the recent breakthroughs in language modeling have resulted from +scaling effectively the same model architecture to larger datasets. In this +vein, recent work has highlighted performance gains from increasing training +dataset size and quality, suggesting a need for novel sources of large-scale +datasets. In this work, we introduce BeanCounter, a public dataset consisting +of more than 159B tokens extracted from businesses' disclosures. We show that +this data is indeed novel: less than 0.1% of BeanCounter appears in Common +Crawl-based datasets and it is an order of magnitude larger than datasets +relying on similar sources. Given the data's provenance, we hypothesize that +BeanCounter is comparatively more factual and less toxic than web-based +datasets. Exploring this hypothesis, we find that many demographic identities +occur with similar prevalence in BeanCounter but with significantly less toxic +context relative to other datasets. To demonstrate the utility of BeanCounter, +we evaluate and compare two LLMs continually pre-trained on BeanCounter with +their base models. We find an 18-33% reduction in toxic generation and improved +performance within the finance domain for the continually pretrained models. +Collectively, our work suggests that BeanCounter is a novel source of +low-toxicity and high-quality domain-specific data with sufficient scale to +train multi-billion parameter LLMs. + +
+
+
+
+
+ + ☆ Inference-Time Language Model Alignment via Integrated Value Guidance EMNLP 2024 + + +
+ Large language models are typically fine-tuned to align with human +preferences, but tuning large models is computationally intensive and complex. +In this work, we introduce $\textit{Integrated Value Guidance}$ (IVG), a method +that uses implicit and explicit value functions to guide language model +decoding at token and chunk-level respectively, efficiently aligning large +language models purely at inference time. This approach circumvents the +complexities of direct fine-tuning and outperforms traditional methods. +Empirically, we demonstrate the versatility of IVG across various tasks. In +controlled sentiment generation and summarization tasks, our method +significantly improves the alignment of large models using inference-time +guidance from $\texttt{gpt2}$-based value functions. Moreover, in a more +challenging instruction-following benchmark AlpacaEval 2.0, we show that both +specifically tuned and off-the-shelf value functions greatly improve the +length-controlled win rates of large models against $\texttt{gpt-4-turbo}$ +(e.g., $19.51\% \rightarrow 26.51\%$ for $\texttt{Mistral-7B-Instruct-v0.2}$ +and $25.58\% \rightarrow 33.75\%$ for $\texttt{Mixtral-8x7B-Instruct-v0.1}$ +with Tulu guidance). + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Self-supervised Preference Optimization: Enhance Your Language Model + with Preference Degree Awareness EMNLP 2024 + + +
+ Recently, there has been significant interest in replacing the reward model +in Reinforcement Learning with Human Feedback (RLHF) methods for Large Language +Models (LLMs), such as Direct Preference Optimization (DPO) and its variants. +These approaches commonly use a binary cross-entropy mechanism on pairwise +samples, i.e., minimizing and maximizing the loss based on preferred or +dis-preferred responses, respectively. However, while this training strategy +omits the reward model, it also overlooks the varying preference degrees within +different responses. We hypothesize that this is a key factor hindering LLMs +from sufficiently understanding human preferences. To address this problem, we +propose a novel Self-supervised Preference Optimization (SPO) framework, which +constructs a self-supervised preference degree loss combined with the alignment +loss, thereby helping LLMs improve their ability to understand the degree of +preference. Extensive experiments are conducted on two widely used datasets of +different tasks. The results demonstrate that SPO can be seamlessly integrated +with existing preference optimization methods and significantly boost their +performance to achieve state-of-the-art performance. We also conduct detailed +analyses to offer comprehensive insights into SPO, which verifies its +effectiveness. The code is available at https://github.com/lijian16/SPO. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Faithfulness and the Notion of Adversarial Sensitivity in NLP + Explanations EMNLP 2024 + + +
+ Faithfulness is arguably the most critical metric to assess the reliability +of explainable AI. In NLP, current methods for faithfulness evaluation are +fraught with discrepancies and biases, often failing to capture the true +reasoning of models. We introduce Adversarial Sensitivity as a novel approach +to faithfulness evaluation, focusing on the explainer's response when the model +is under adversarial attack. Our method accounts for the faithfulness of +explainers by capturing sensitivity to adversarial input changes. This work +addresses significant limitations in existing evaluation techniques, and +furthermore, quantifies faithfulness from a crucial yet underexplored paradigm. + +
+
+ comment: Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP +
+
+
+
+
+ + ☆ Integrating Hierarchical Semantic into Iterative Generation Model for + Entailment Tree Explanation + + +
+ Manifestly and logically displaying the line of reasoning from evidence to +answer is significant to explainable question answering (QA). The entailment +tree exhibits the lines structurally, which is different from the +self-explanation principle in large-scale language models. Existing methods +rarely consider the semantic association of sentences between and within +hierarchies within the tree structure, which is prone to apparent mistakes in +combinations. In this work, we propose an architecture of integrating the +Hierarchical Semantics of sentences under the framework of Controller-Generator +(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between +hypotheses and facts, discriminates the facts involved in tree constructions, +and optimizes single-step entailments. To the best of our knowledge, We are the +first to notice hierarchical semantics of sentences between the same layer and +adjacent layers to yield improvements. The proposed method achieves comparable +performance on all three settings of the EntailmentBank dataset. The +generalization results on two out-of-domain datasets also demonstrate the +effectiveness of our method. + +
+
+
+
+
+ + ☆ SECURE: Semantics-aware Embodied Conversation under Unawareness for + Lifelong Robot Learning + + +
+ This paper addresses a challenging interactive task learning scenario we call +rearrangement under unawareness: to manipulate a rigid-body environment in a +context where the robot is unaware of a concept that's key to solving the +instructed task. We propose SECURE, an interactive task learning framework +designed to solve such problems by fixing a deficient domain model using +embodied conversation. Through dialogue, the robot discovers and then learns to +exploit unforeseen possibilities. Using SECURE, the robot not only learns from +the user's corrective feedback when it makes a mistake, but it also learns to +make strategic dialogue decisions for revealing useful evidence about novel +concepts for solving the instructed task. Together, these abilities allow the +robot to generalise to subsequent tasks using newly acquired knowledge. We +demonstrate that a robot that is semantics-aware -- that is, it exploits the +logical consequences of both sentence and discourse semantics in the learning +and inference process -- learns to solve rearrangement under unawareness more +effectively than a robot that lacks such capabilities. + +
+
+ comment: 10 pages,4 figures, 2 tables +
+
+
+
+
+ + ☆ Are Transformers in Pre-trained LM A Good ASR Encoder? An Empirical + Study + + +
+ In this study, we delve into the efficacy of transformers within pre-trained +language models (PLMs) when repurposed as encoders for Automatic Speech +Recognition (ASR). Our underlying hypothesis posits that, despite being +initially trained on text-based corpora, these transformers possess a +remarkable capacity to extract effective features from the input sequence. This +inherent capability, we argue, is transferrable to speech data, thereby +augmenting the acoustic modeling ability of ASR. Through rigorous empirical +analysis, our findings reveal a notable improvement in Character Error Rate +(CER) and Word Error Rate (WER) across diverse ASR tasks when transformers from +pre-trained LMs are incorporated. Particularly, they serve as an advantageous +starting point for initializing ASR encoders. Furthermore, we uncover that +these transformers, when integrated into a well-established ASR encoder, can +significantly boost performance, especially in scenarios where profound +semantic comprehension is pivotal. This underscores the potential of leveraging +the semantic prowess embedded within pre-trained transformers to advance ASR +systems' capabilities. + +
+
+ comment: 8pages +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Zero- and Few-shot Named Entity Recognition and Text Expansion in + Medication Prescriptions using ChatGPT + + +
+ Introduction: Medication prescriptions are often in free text and include a +mix of two languages, local brand names, and a wide range of idiosyncratic +formats and abbreviations. Large language models (LLMs) have shown promising +ability to generate text in response to input prompts. We use ChatGPT 3.5 to +automatically structure and expand medication statements in discharge summaries +and thus make them easier to interpret for people and machines. Methods: +Named-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and +few-shot setting with different prompt strategies. 100 medication statements +were manually annotated and curated. NER performance was measured by using +strict and partial matching. For the task EX, two experts interpreted the +results by assessing semantic equivalence between original and expanded +statements. The model performance was measured by precision, recall, and F1 +score. Results: For NER, the best-performing prompt reached an average F1 score +of 0.94 in the test set. For EX, the few-shot prompt showed superior +performance among other prompts, with an average F1 score of 0.87. Conclusion: +Our study demonstrates good performance for NER and EX tasks in free-text +medication statements using ChatGPT. Compared to a zero-shot baseline, a +few-shot approach prevented the system from hallucinating, which would be +unacceptable when processing safety-relevant medication data. + +
+
+
+
+
+ + ☆ Cross-lingual Human-Preference Alignment for Neural Machine Translation + with Direct Quality Optimization + + +
+ Reinforcement Learning from Human Feedback (RLHF) and derivative techniques +like Direct Preference Optimization (DPO) are task-alignment algorithms used to +repurpose general, foundational models for specific tasks. We show that +applying task-alignment to neural machine translation (NMT) addresses an +existing task--data mismatch in NMT, leading to improvements across all +languages of a multilingual model, even when task-alignment is only applied to +a subset of those languages. We do so by introducing Direct Quality +Optimization (DQO), a variant of DPO leveraging a pre-trained translation +quality estimation model as a proxy for human preferences, and verify the +improvements with both automatic metrics and human evaluation. + +
+
+ comment: 17 pages, 1 figure +
+
+
+
+
+ + ☆ Digital Twin Ecosystem for Oncology Clinical Operations + + +
+ Artificial Intelligence (AI) and Large Language Models (LLMs) hold +significant promise in revolutionizing healthcare, especially in clinical +applications. Simultaneously, Digital Twin technology, which models and +simulates complex systems, has gained traction in enhancing patient care. +However, despite the advances in experimental clinical settings, the potential +of AI and digital twins to streamline clinical operations remains largely +untapped. This paper introduces a novel digital twin framework specifically +designed to enhance oncology clinical operations. We propose the integration of +multiple specialized digital twins, such as the Medical Necessity Twin, Care +Navigator Twin, and Clinical History Twin, to enhance workflow efficiency and +personalize care for each patient based on their unique data. Furthermore, by +synthesizing multiple data sources and aligning them with the National +Comprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care +Path, a continuously evolving knowledge base that enables these digital twins +to provide precise, tailored clinical recommendations. + +
+
+ comment: Pre Print +
+
+
+
+
+ + ☆ Efficient In-Domain Question Answering for Resource-Constrained + Environments + + +
+ Retrieval Augmented Generation (RAG) is a common method for integrating +external knowledge into pretrained Large Language Models (LLMs) to enhance +accuracy and relevancy in question answering (QA) tasks. However, prompt +engineering and resource efficiency remain significant bottlenecks in +developing optimal and robust RAG solutions for real-world QA applications. +Recent studies have shown success in using fine tuning to address these +problems; in particular, Retrieval Augmented Fine Tuning (RAFT) applied to +smaller 7B models has demonstrated superior performance compared to RAG setups +with much larger models such as GPT-3.5. The combination of RAFT with +parameter-efficient fine tuning (PEFT) techniques, such as Low-Rank Adaptation +(LoRA), promises an even more efficient solution, yet remains an unexplored +area. In this work, we combine RAFT with LoRA to reduce fine tuning and storage +requirements and gain faster inference times while maintaining comparable RAG +performance. This results in a more compute-efficient RAFT, or CRAFT, which is +particularly useful for knowledge-intensive QA tasks in resource-constrained +environments where internet access may be restricted and hardware resources +limited. + +
+
+ comment: 6 pages, 2 tables +
+
+
+
+
+ + ☆ T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training + on an Assistant Task for a Target Task + + +
+ Long text summarization, gradually being essential for efficiently processing +large volumes of information, stays challenging for Large Language Models +(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced +training datasets and the high requirement of contextual details dealing. To +address the issue, we design a novel zero-shot transfer learning framework, +abbreviated as T3, to iteratively training a baseline LLM on an assistant task +for the target task, where the former should own richer data resources and +share structural or semantic similarity with the latter. In practice, T3 is +approached to deal with the long text summarization task by utilizing question +answering as the assistant task, and further validated its effectiveness on the +BBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14% +improvement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore +compared to three baseline LLMs, demonstrating its potential for more +assistant-target task combinations. + +
+
+
+
+
+ + ☆ ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context + Information in Multi-Turn Multimodal Medical Dialogue + + +
+ The rocketing prosperity of large language models (LLMs) in recent years has +boosted the prevalence of vision-language models (VLMs) in the medical sector. +In our online medical consultation scenario, a doctor responds to the texts and +images provided by a patient in multiple rounds to diagnose her/his health +condition, forming a multi-turn multimodal medical dialogue format. Unlike +high-quality images captured by professional equipment in traditional medical +visual question answering (Med-VQA), the images in our case are taken by +patients' mobile phones. These images have poor quality control, with issues +such as excessive background elements and the lesion area being significantly +off-center, leading to degradation of vision-language alignment in the model +training phase. In this paper, we propose ZALM3, a Zero-shot strategy to +improve vision-language ALignment in Multi-turn Multimodal Medical dialogue. +Since we observe that the preceding text conversations before an image can +infer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to +summarize the keywords from the preceding context and a visual grounding model +to extract the RoIs. The updated images eliminate unnecessary background noise +and provide more effective vision-language alignment. To better evaluate our +proposed method, we design a new subjective assessment metric for multi-turn +unimodal/multimodal medical dialogue to provide a fine-grained performance +comparison. Our experiments across three different clinical departments +remarkably demonstrate the efficacy of ZALM3 with statistical significance. + +
+
+
+
+
+ + ☆ Deep CLAS: Deep Contextual Listen, Attend and Spell + + +
+ Contextual-LAS (CLAS) has been shown effective in improving Automatic Speech +Recognition (ASR) of rare words. It relies on phrase-level contextual modeling +and attention-based relevance scoring without explicit contextual constraint +which lead to insufficient use of contextual information. In this work, we +propose deep CLAS to use contextual information better. We introduce bias loss +forcing model to focus on contextual information. The query of bias attention +is also enriched to improve the accuracy of the bias attention score. To get +fine-grained contextual information, we replace phrase-level encoding with +character-level encoding and encode contextual information with conformer +rather than LSTM. Moreover, we directly use the bias attention score to correct +the output probability distribution of the model. Experiments using the public +AISHELL-1 and AISHELL-NER. On AISHELL-1, compared to CLAS baselines, deep CLAS +obtains a 65.78% relative recall and a 53.49% relative F1-score increase in the +named entity recognition scene. + +
+
+ comment: Accepted by NCMMSC 2022 +
+
+
+
+
+ + ☆ DualCoTs: Dual Chain-of-Thoughts Prompting for Sentiment Lexicon + Expansion of Idioms + + +
+ Idioms represent a ubiquitous vehicle for conveying sentiments in the realm +of everyday discourse, rendering the nuanced analysis of idiom sentiment +crucial for a comprehensive understanding of emotional expression within +real-world texts. Nevertheless, the existing corpora dedicated to idiom +sentiment analysis considerably limit research in text sentiment analysis. In +this paper, we propose an innovative approach to automatically expand the +sentiment lexicon for idioms, leveraging the capabilities of large language +models through the application of Chain-of-Thought prompting. To demonstrate +the effectiveness of this approach, we integrate multiple existing resources +and construct an emotional idiom lexicon expansion dataset (called EmoIdiomE), +which encompasses a comprehensive repository of Chinese and English idioms. +Then we designed the Dual Chain-of-Thoughts (DualCoTs) method, which combines +insights from linguistics and psycholinguistics, to demonstrate the +effectiveness of using large models to automatically expand the sentiment +lexicon for idioms. Experiments show that DualCoTs is effective in idioms +sentiment lexicon expansion in both Chinese and English. For reproducibility, +we will release the data and code upon acceptance. + +
+
+
+
+
+ + ☆ Leveraging Annotator Disagreement for Text Classification + + +
+ It is common practice in text classification to only use one majority label +for model training even if a dataset has been annotated by multiple annotators. +Doing so can remove valuable nuances and diverse perspectives inherent in the +annotators' assessments. This paper proposes and compares three different +strategies to leverage annotator disagreement for text classification: a +probability-based multi-label method, an ensemble system, and instruction +tuning. All three approaches are evaluated on the tasks of hate speech and +abusive conversation detection, which inherently entail a high degree of +subjectivity. Moreover, to evaluate the effectiveness of embracing annotation +disagreements for model training, we conduct an online survey that compares the +performance of the multi-label model against a baseline model, which is trained +with the majority label. + The results show that in hate speech detection, the multi-label method +outperforms the other two approaches, while in abusive conversation detection, +instruction tuning achieves the best performance. The results of the survey +also show that the outputs from the multi-label models are considered a better +representation of the texts than the single-label model. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Logic-of-Thought: Injecting Logic into Contexts for Full Reasoning in + Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities across +various tasks but their performance in complex logical reasoning tasks remains +unsatisfactory. Although some prompting methods, such as Chain-of-Thought, can +improve the reasoning ability of LLMs to some extent, they suffer from an +unfaithful issue where derived conclusions may not align with the generated +reasoning chain. To address this issue, some studies employ the approach of +propositional logic to further enhance logical reasoning abilities of LLMs. +However, the potential omissions in the extraction of logical expressions in +these methods can cause information loss in the logical reasoning process, +thereby generating incorrect results. To this end, we propose Logic-of-Thought +(LoT) prompting which employs propositional logic to generate expanded logical +information from input context, and utilizes the generated logical information +as an additional augmentation to the input prompts, thereby enhancing the +capability of logical reasoning. The LoT is orthogonal to existing prompting +methods and can be seamlessly integrated with them. Extensive experiments +demonstrate that LoT boosts the performance of various prompting methods with a +striking margin across five logical reasoning tasks. In particular, the LoT +enhances Chain-of-Thought's performance on the ReClor dataset by +4.35%; +moreover, it improves Chain-of-Thought with Self-Consistency's performance on +LogiQA by +5%; additionally, it boosts performance of Tree-of-Thoughts on +ProofWriter dataset by +8%. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ MUSE: Integrating Multi-Knowledge for Knowledge Graph Completion + + +
+ Knowledge Graph Completion (KGC) aims to predict the missing [relation] part +of (head entity)--[relation]->(tail entity) triplet. Most existing KGC methods +focus on single features (e.g., relation types) or sub-graph aggregation. +However, they do not fully explore the Knowledge Graph (KG) features and +neglect the guidance of external semantic knowledge. To address these +shortcomings, we propose a knowledge-aware reasoning model (MUSE), which +designs a novel multi-knowledge representation learning mechanism for missing +relation prediction. Our model develops a tailored embedding space through +three parallel components: 1) Prior Knowledge Learning for enhancing the +triplets' semantic representation by fine-tuning BERT; 2) Context Message +Passing for enhancing the context messages of KG; 3) Relational Path +Aggregation for enhancing the path representation from the head entity to the +tail entity. The experimental results show that MUSE significantly outperforms +other baselines on four public datasets, achieving over 5.50% H@1 improvement +and 4.20% MRR improvement on the NELL995 dataset. The code and datasets will be +released via https://github.com/SUSTech-TP/ADMA2024-MUSE.git. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2408.05283 +
+
+
+
+
+ + ☆ Data Proportion Detection for Optimized Data Management for Large + Language Models + + +
+ Large language models (LLMs) have demonstrated exceptional performance across +a wide range of tasks and domains, with data preparation playing a critical +role in achieving these results. Pre-training data typically combines +information from multiple domains. To maximize performance when integrating +data from various domains, determining the optimal data proportion is +essential. However, state-of-the-art (SOTA) LLMs rarely disclose details about +their pre-training data, making it difficult for researchers to identify ideal +data proportions. In this paper, we introduce a new topic, \textit{data +proportion detection}, which enables the automatic estimation of pre-training +data proportions by analyzing the generated outputs of LLMs. We provide +rigorous theoretical proofs, practical algorithms, and preliminary experimental +results for data proportion detection. Based on these findings, we offer +valuable insights into the challenges and future directions for effective data +proportion detection and data management. + +
+
+
+
+
+ + ☆ When A Man Says He Is Pregnant: ERP Evidence for A Rational Account of + Speaker-contextualized Language Comprehension + + +
+ Spoken language is often, if not always, understood in a context that +includes the identities of speakers. For instance, we can easily make sense of +an utterance such as "I'm going to have a manicure this weekend" or "The first +time I got pregnant I had a hard time" when the utterance is spoken by a woman, +but it would be harder to understand when it is spoken by a man. Previous +event-related potential (ERP) studies have shown mixed results regarding the +neurophysiological responses to such speaker-mismatched utterances, with some +reporting an N400 effect and others a P600 effect. In an experiment involving +64 participants, we showed that these different ERP effects reflect distinct +cognitive processes employed to resolve the speaker-message mismatch. When +possible, the message is integrated with the speaker context to arrive at an +interpretation, as in the case of violations of social stereotypes (e.g., men +getting a manicure), resulting in an N400 effect. However, when such +integration is impossible due to violations of biological knowledge (e.g., men +getting pregnant), listeners engage in an error correction process to revise +either the perceived utterance or the speaker context, resulting in a P600 +effect. Additionally, we found that the social N400 effect decreased as a +function of the listener's personality trait of openness, while the biological +P600 effect remained robust. Our findings help to reconcile the empirical +inconsistencies in the literature and provide a rational account of +speaker-contextualized language comprehension. + +
+
+
+
+
+ + ☆ Comparing Unidirectional, Bidirectional, and Word2vec Models for + Discovering Vulnerabilities in Compiled Lifted Code + + +
+ Ransomware and other forms of malware cause significant financial and +operational damage to organizations by exploiting long-standing and often +difficult-to-detect software vulnerabilities. To detect vulnerabilities such as +buffer overflows in compiled code, this research investigates the application +of unidirectional transformer-based embeddings, specifically GPT-2. Using a +dataset of LLVM functions, we trained a GPT-2 model to generate embeddings, +which were subsequently used to build LSTM neural networks to differentiate +between vulnerable and non-vulnerable code. Our study reveals that embeddings +from the GPT-2 model significantly outperform those from bidirectional models +of BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%. +LSTM neural networks were developed with both frozen and unfrozen embedding +model layers. The model with the highest performance was achieved when the +embedding layers were unfrozen. Further, the research finds that, in exploring +the impact of different optimizers within this domain, the SGD optimizer +demonstrates superior performance over Adam. Overall, these findings reveal +important insights into the potential of unidirectional transformer-based +approaches in enhancing cybersecurity defenses. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ HaloScope: Harnessing Unlabeled LLM Generations for Hallucination + Detection NeurIPS 2024 + + +
+ The surge in applications of large language models (LLMs) has prompted +concerns about the generation of misleading or fabricated information, known as +hallucinations. Therefore, detecting hallucinations has become critical to +maintaining trust in LLM-generated content. A primary challenge in learning a +truthfulness classifier is the lack of a large amount of labeled truthful and +hallucinated data. To address the challenge, we introduce HaloScope, a novel +learning framework that leverages the unlabeled LLM generations in the wild for +hallucination detection. Such unlabeled data arises freely upon deploying LLMs +in the open world, and consists of both truthful and hallucinated information. +To harness the unlabeled data, we present an automated membership estimation +score for distinguishing between truthful and untruthful generations within +unlabeled mixture data, thereby enabling the training of a binary truthfulness +classifier on top. Importantly, our framework does not require extra data +collection and human annotations, offering strong flexibility and practicality +for real-world applications. Extensive experiments show that HaloScope can +achieve superior hallucination detection performance, outperforming the +competitive rivals by a significant margin. Code is available at +https://github.com/deeplearningwisc/haloscope. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ MaskLLM: Learnable Semi-Structured Sparsity for Large Language Models NeurIPS 2024 + + +
+ Large Language Models (LLMs) are distinguished by their massive parameter +counts, which typically result in significant redundancy. This work introduces +MaskLLM, a learnable pruning method that establishes Semi-structured (or +``N:M'') Sparsity in LLMs, aimed at reducing computational overhead during +inference. Instead of developing a new importance criterion, MaskLLM explicitly +models N:M patterns as a learnable distribution through Gumbel Softmax +sampling. This approach facilitates end-to-end training on large-scale datasets +and offers two notable advantages: 1) High-quality Masks - our method +effectively scales to large datasets and learns accurate masks; 2) +Transferability - the probabilistic modeling of mask distribution enables the +transfer learning of sparsity across domains or tasks. We assessed MaskLLM +using 2:4 sparsity on various LLMs, including LLaMA-2, Nemotron-4, and GPT-3, +with sizes ranging from 843M to 15B parameters, and our empirical results show +substantial improvements over state-of-the-art methods. For instance, leading +approaches achieve a perplexity (PPL) of 10 or greater on Wikitext compared to +the dense model's 5.12 PPL, but MaskLLM achieves a significantly lower 6.72 PPL +solely by learning the masks with frozen weights. Furthermore, MaskLLM's +learnable nature allows customized masks for lossless application of 2:4 +sparsity to downstream tasks or domains. Code is available at +\url{https://github.com/NVlabs/MaskLLM}. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Reducing and Exploiting Data Augmentation Noise through Meta Reweighting + Contrastive Learning for Text Classification + + +
+ Data augmentation has shown its effectiveness in resolving the data-hungry +problem and improving model's generalization ability. However, the quality of +augmented data can be varied, especially compared with the raw/original data. +To boost deep learning models' performance given augmented data/samples in text +classification tasks, we propose a novel framework, which leverages both meta +learning and contrastive learning techniques as parts of our design for +reweighting the augmented samples and refining their feature representations +based on their quality. As part of the framework, we propose novel +weight-dependent enqueue and dequeue algorithms to utilize augmented samples' +weight/quality information effectively. Through experiments, we show that our +framework can reasonably cooperate with existing deep learning models (e.g., +RoBERTa-base and Text-CNN) and augmentation techniques (e.g., Wordnet and +Easydata) for specific supervised learning tasks. Experiment results show that +our framework achieves an average of 1.6%, up to 4.3% absolute improvement on +Text-CNN encoders and an average of 1.4%, up to 4.4% absolute improvement on +RoBERTa-base encoders on seven GLUE benchmark datasets compared with the best +baseline. We present an indepth analysis of our framework design, revealing the +non-trivial contributions of our network components. Our code is publicly +available for better reproducibility. + +
+
+ comment: IEEE BigData 2021 +
+
+
+
+
+ + ☆ Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with + Scoring-aware Multiple Rewards EMNLP 2024 + + +
+ Recent advances in automated essay scoring (AES) have shifted towards +evaluating multiple traits to provide enriched feedback. Like typical AES +systems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure +agreement with human raters, aligning closely with the rating schema; however, +its non-differentiable nature prevents its direct use in neural network +training. In this paper, we propose Scoring-aware Multi-reward Reinforcement +Learning (SaMRL), which integrates actual evaluation schemes into the training +process by designing QWK-based rewards with a mean-squared error penalty for +multi-trait AES. Existing reinforcement learning (RL) applications in AES are +limited to classification models despite associated performance degradation, as +RL requires probability distributions; instead, we adopt an autoregressive +score generation framework to leverage token generation probabilities for +robust multi-trait score predictions. Empirical analyses demonstrate that SaMRL +facilitates model training, notably enhancing scoring of previously inferior +prompts. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ What is the social benefit of hate speech detection research? A + Systematic Review + + +
+ While NLP research into hate speech detection has grown exponentially in the +last three decades, there has been minimal uptake or engagement from policy +makers and non-profit organisations. We argue the absence of ethical frameworks +have contributed to this rift between current practice and best practice. By +adopting appropriate ethical frameworks, NLP researchers may enable the social +impact potential of hate speech research. This position paper is informed by +reviewing forty-eight hate speech detection systems associated with +thirty-seven publications from different venues. + +
+
+ comment: Accepted to the 3rd Workshop on NLP for Positive Impact +
+
+
+
+
+ + ☆ RED QUEEN: Safeguarding Large Language Models against Concealed + Multi-Turn Jailbreaking + + +
+ The rapid progress of Large Language Models (LLMs) has opened up new +opportunities across various domains and applications; yet it also presents +challenges related to potential misuse. To mitigate such risks, red teaming has +been employed as a proactive security measure to probe language models for +harmful outputs via jailbreak attacks. However, current jailbreak attack +approaches are single-turn with explicit malicious queries that do not fully +capture the complexity of real-world interactions. In reality, users can engage +in multi-turn interactions with LLM-based chat assistants, allowing them to +conceal their true intentions in a more covert manner. To bridge this gap, we, +first, propose a new jailbreak approach, RED QUEEN ATTACK. This method +constructs a multi-turn scenario, concealing the malicious intent under the +guise of preventing harm. We craft 40 scenarios that vary in turns and select +14 harmful categories to generate 56k multi-turn attack data points. We conduct +comprehensive experiments on the RED QUEEN ATTACK with four representative LLM +families of different sizes. Our experiments reveal that all LLMs are +vulnerable to RED QUEEN ATTACK, reaching 87.62% attack success rate on GPT-4o +and 75.4% on Llama3-70B. Further analysis reveals that larger models are more +susceptible to the RED QUEEN ATTACK, with multi-turn structures and concealment +strategies contributing to its success. To prioritize safety, we introduce a +straightforward mitigation strategy called RED QUEEN GUARD, which aligns LLMs +to effectively counter adversarial attacks. This approach reduces the attack +success rate to below 1% while maintaining the model's performance across +standard benchmarks. Full implementation and dataset are publicly accessible at +https://github.com/kriti-hippo/red_queen. + +
+
+
+
+
+ + ☆ Navigating the Shortcut Maze: A Comprehensive Analysis of Shortcut + Learning in Text Classification by Language Models + + +
+ Language models (LMs), despite their advances, often depend on spurious +correlations, undermining their accuracy and generalizability. This study +addresses the overlooked impact of subtler, more complex shortcuts that +compromise model reliability beyond oversimplified shortcuts. We introduce a +comprehensive benchmark that categorizes shortcuts into occurrence, style, and +concept, aiming to explore the nuanced ways in which these shortcuts influence +the performance of LMs. Through extensive experiments across traditional LMs, +large language models, and state-of-the-art robust models, our research +systematically investigates models' resilience and susceptibilities to +sophisticated shortcuts. Our benchmark and code can be found at: +https://github.com/yuqing-zhou/shortcut-learning-in-text-classification. + +
+
+
+
+
+ + ☆ Description-based Controllable Text-to-Speech with Cross-Lingual Voice + Control ICASSP 2025 + + +
+ We propose a novel description-based controllable text-to-speech (TTS) method +with cross-lingual control capability. To address the lack of audio-description +paired data in the target language, we combine a TTS model trained on the +target language with a description control model trained on another language, +which maps input text descriptions to the conditional features of the TTS +model. These two models share disentangled timbre and style representations +based on self-supervised learning (SSL), allowing for disentangled voice +control, such as controlling speaking styles while retaining the original +timbre. Furthermore, because the SSL-based timbre and style representations are +language-agnostic, combining the TTS and description control models while +sharing the same embedding space effectively enables cross-lingual control of +voice characteristics. Experiments on English and Japanese TTS demonstrate that +our method achieves high naturalness and controllability for both languages, +even though no Japanese audio-description pairs are used. + +
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Enhancing Financial Sentiment Analysis with Expert-Designed Hint + + +
+ This paper investigates the role of expert-designed hint in enhancing +sentiment analysis on financial social media posts. We explore the capability +of large language models (LLMs) to empathize with writer perspectives and +analyze sentiments. Our findings reveal that expert-designed hint, i.e., +pointing out the importance of numbers, significantly improve performances +across various LLMs, particularly in cases requiring perspective-taking skills. +Further analysis on tweets containing different types of numerical data +demonstrates that the inclusion of expert-designed hint leads to notable +improvements in sentiment analysis performance, especially for tweets with +monetary-related numbers. Our findings contribute to the ongoing discussion on +the applicability of Theory of Mind in NLP and open new avenues for improving +sentiment analysis in financial domains through the strategic use of expert +knowledge. + +
+
+
+
+
+ + ☆ MultiClimate: Multimodal Stance Detection on Climate Change Videos + + +
+ Climate change (CC) has attracted increasing attention in NLP in recent +years. However, detecting the stance on CC in multimodal data is understudied +and remains challenging due to a lack of reliable datasets. To improve the +understanding of public opinions and communication strategies, this paper +presents MultiClimate, the first open-source manually-annotated stance +detection dataset with $100$ CC-related YouTube videos and $4,209$ +frame-transcript pairs. We deploy state-of-the-art vision and language models, +as well as multimodal models for MultiClimate stance detection. Results show +that text-only BERT significantly outperforms image-only ResNet50 and ViT. +Combining both modalities achieves state-of-the-art, $0.747$/$0.749$ in +accuracy/F1. Our 100M-sized fusion models also beat CLIP and BLIP, as well as +the much larger 9B-sized multimodal IDEFICS and text-only Llama3 and Gemma2, +indicating that multimodal stance detection remains challenging for large +language models. Our code, dataset, as well as supplementary materials, are +available at https://github.com/werywjw/MultiClimate. + +
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ A Generalized LLM-Augmented BIM Framework: Application to a + Speech-to-BIM system + + +
+ Performing building information modeling (BIM) tasks is a complex process +that imposes a steep learning curve and a heavy cognitive load due to the +necessity of remembering sequences of numerous commands. With the rapid +advancement of large language models (LLMs), it is foreseeable that BIM tasks, +including querying and managing BIM data, 4D and 5D BIM, design compliance +checking, or authoring a design, using written or spoken natural language +(i.e., text-to-BIM or speech-to-BIM), will soon supplant traditional graphical +user interfaces. This paper proposes a generalized LLM-augmented BIM framework +to expedite the development of LLM-enhanced BIM applications by providing a +step-by-step development process. The proposed framework consists of six steps: +interpret-fill-match-structure-execute-check. The paper demonstrates the +applicability of the proposed framework through implementing a speech-to-BIM +application, NADIA-S (Natural-language-based Architectural Detailing through +Interaction with Artificial Intelligence via Speech), using exterior wall +detailing as an example. + +
+
+
+
+
+ + ☆ AER-LLM: Ambiguity-aware Emotion Recognition Leveraging Large Language + Models + + +
+ Recent advancements in Large Language Models (LLMs) have demonstrated great +success in many Natural Language Processing (NLP) tasks. In addition to their +cognitive intelligence, exploring their capabilities in emotional intelligence +is also crucial, as it enables more natural and empathetic conversational AI. +Recent studies have shown LLMs' capability in recognizing emotions, but they +often focus on single emotion labels and overlook the complex and ambiguous +nature of human emotions. This study is the first to address this gap by +exploring the potential of LLMs in recognizing ambiguous emotions, leveraging +their strong generalization capabilities and in-context learning. We design +zero-shot and few-shot prompting and incorporate past dialogue as context +information for ambiguous emotion recognition. Experiments conducted using +three datasets indicate significant potential for LLMs in recognizing ambiguous +emotions, and highlight the substantial benefits of including context +information. Furthermore, our findings indicate that LLMs demonstrate a high +degree of effectiveness in recognizing less ambiguous emotions and exhibit +potential for identifying more ambiguous emotions, paralleling human perceptual +capabilities. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ A Fairness-Driven Method for Learning Human-Compatible Negotiation + Strategies EMNLP + + +
+ Despite recent advancements in AI and NLP, negotiation remains a difficult +domain for AI agents. Traditional game theoretic approaches that have worked +well for two-player zero-sum games struggle in the context of negotiation due +to their inability to learn human-compatible strategies. On the other hand, +approaches that only use human data tend to be domain-specific and lack the +theoretical guarantees provided by strategies grounded in game theory. +Motivated by the notion of fairness as a criterion for optimality in general +sum games, we propose a negotiation framework called FDHC which incorporates +fairness into both the reward design and search to learn human-compatible +negotiation strategies. Our method includes a novel, RL+search technique called +LGM-Zero which leverages a pre-trained language model to retrieve +human-compatible offers from large action spaces. Our results show that our +method is able to achieve more egalitarian negotiation outcomes and improve +negotiation quality. + +
+
+ comment: EMNLP Findings 2024 +
+
+
+
+
+ + ☆ Cross-Institutional Structured Radiology Reporting for Lung Cancer + Screening Using a Dynamic Template-Constrained Large Language Model + + +
+ Structured radiology reporting is advantageous for optimizing clinical +workflows and patient outcomes. Current LLMs in creating structured reports +face the challenges of formatting errors, content hallucinations, and privacy +leakage concerns when uploaded to external servers. We aim to develop an +enhanced open-source LLM for creating structured and standardized LCS reports +from free-text descriptions. After institutional IRB approvals, 5,442 +de-identified LCS reports from two institutions were retrospectively analyzed. +500 reports were randomly selected from the two institutions evenly and then +manually labeled for evaluation. Two radiologists from the two institutions +developed a standardized template including 29 features for lung nodule +reporting. We proposed template-constrained decoding to enhance +state-of-the-art open-source LLMs, including LLAMA, Qwen, and Mistral. The LLM +performance was extensively evaluated in terms of F1 score, confidence +interval, McNemar test, and z-test. Based on the structured reports created +from the large-scale dataset, a nodule-level retrieval system was prototyped +and an automatic statistical analysis was performed. Our software, +vLLM-structure, is publicly available for local deployment with enhanced LLMs. +Our template-constrained decoding approach consistently enhanced the LLM +performance on multi-institutional datasets, with neither formatting errors nor +content hallucinations. Our method improved the best open-source LLAMA-3.1 405B +by up to 10.42%, and outperformed GPT-4o by 17.19%. A novel nodule retrieval +system was successfully prototyped and demonstrated on a large-scale multimodal +database using our enhanced LLM technologies. The automatically derived +statistical distributions were closely consistent with the prior findings in +terms of nodule type, location, size, status, and Lung-RADS. + +
+
+
+
+
+ + ☆ Realistic Evaluation of Model Merging for Compositional Generalization + + +
+ Merging has become a widespread way to cheaply combine individual models into +a single model that inherits their capabilities and attains better performance. +This popularity has spurred rapid development of many new merging methods, +which are typically validated in disparate experimental settings and frequently +differ in the assumptions made about model architecture, data availability, and +computational budget. In this work, we characterize the relative merits of +different merging methods by evaluating them in a shared experimental setting +and precisely identifying the practical requirements of each method. +Specifically, our setting focuses on using merging for compositional +generalization of capabilities in image classification, image generation, and +natural language processing. Additionally, we measure the computational costs +of different merging methods as well as how they perform when scaling the +number of models being merged. Taken together, our results clarify the state of +the field of model merging and provide a comprehensive and rigorous +experimental setup to test new methods. + +
+
+
+
+
+ + ☆ Advancing Object Detection in Transportation with Multimodal Large + Language Models (MLLMs): A Comprehensive Review and Empirical Testing + + +
+ This study aims to comprehensively review and empirically evaluate the +application of multimodal large language models (MLLMs) and Large Vision Models +(VLMs) in object detection for transportation systems. In the first fold, we +provide a background about the potential benefits of MLLMs in transportation +applications and conduct a comprehensive review of current MLLM technologies in +previous studies. We highlight their effectiveness and limitations in object +detection within various transportation scenarios. The second fold involves +providing an overview of the taxonomy of end-to-end object detection in +transportation applications and future directions. Building on this, we +proposed empirical analysis for testing MLLMs on three real-world +transportation problems that include object detection tasks namely, road safety +attributes extraction, safety-critical event detection, and visual reasoning of +thermal images. Our findings provide a detailed assessment of MLLM performance, +uncovering both strengths and areas for improvement. Finally, we discuss +practical limitations and challenges of MLLMs in enhancing object detection in +transportation, thereby offering a roadmap for future research and development +in this critical area. + +
+
+
+
+
+ + ☆ DisGeM: Distractor Generation for Multiple Choice Questions with Span + Masking + + +
+ Recent advancements in Natural Language Processing (NLP) have impacted +numerous sub-fields such as natural language generation, natural language +inference, question answering, and more. However, in the field of question +generation, the creation of distractors for multiple-choice questions (MCQ) +remains a challenging task. In this work, we present a simple, generic +framework for distractor generation using readily available Pre-trained +Language Models (PLMs). Unlike previous methods, our framework relies solely on +pre-trained language models and does not require additional training on +specific datasets. Building upon previous research, we introduce a two-stage +framework consisting of candidate generation and candidate selection. Our +proposed distractor generation framework outperforms previous methods without +the need for training or fine-tuning. Human evaluations confirm that our +approach produces more effective and engaging distractors. The related codebase +is publicly available at https://github.com/obss/disgem. + +
+
+
+
+
+ + ☆ MMMT-IF: A Challenging Multimodal Multi-Turn Instruction Following + Benchmark + + +
+ Evaluating instruction following capabilities for multimodal, multi-turn +dialogue is challenging. With potentially multiple instructions in the input +model context, the task is time-consuming for human raters and we show LLM +based judges are biased towards answers from the same model. We propose +MMMT-IF, an image based multi-turn Q$\&$A evaluation set with added global +instructions between questions, constraining the answer format. This challenges +models to retrieve instructions dispersed across long dialogues and reason +under instruction constraints. All instructions are objectively verifiable +through code execution. We introduce the Programmatic Instruction Following +($\operatorname{PIF}$) metric to measure the fraction of the instructions that +are correctly followed while performing a reasoning task. The +$\operatorname{PIF-N-K}$ set of metrics further evaluates robustness by +measuring the fraction of samples in a corpus where, for each sample, at least +K out of N generated model responses achieve a $\operatorname{PIF}$ score of +one. The $\operatorname{PIF}$ metric aligns with human instruction following +ratings, showing 60 percent correlation. Experiments show Gemini 1.5 Pro, +GPT-4o, and Claude 3.5 Sonnet, have a $\operatorname{PIF}$ metric that drops +from 0.81 on average at turn 1 across the models, to 0.64 at turn 20. Across +all turns, when each response is repeated 4 times ($\operatorname{PIF-4-4}$), +GPT-4o and Gemini successfully follow all instructions only $11\%$ of the time. +When all the instructions are also appended to the end of the model input +context, the $\operatorname{PIF}$ metric improves by 22.3 points on average, +showing that the challenge with the task lies not only in following the +instructions, but also in retrieving the instructions spread out in the model +context. We plan to open source the MMMT-IF dataset and metric computation +code. + +
+
+ comment: 24 pages, 16 figures +
+
+
+
+
+ + ☆ AI Policy Projector: Grounding LLM Policy Design in Iterative Mapmaking + + +
+ Whether a large language model policy is an explicit constitution or an +implicit reward model, it is challenging to assess coverage over the unbounded +set of real-world situations that a policy must contend with. We introduce an +AI policy design process inspired by mapmaking, which has developed tactics for +visualizing and iterating on maps even when full coverage is not possible. With +Policy Projector, policy designers can survey the landscape of model +input-output pairs, define custom regions (e.g., "violence"), and navigate +these regions with rules that can be applied to LLM outputs (e.g., if output +contains "violence" and "graphic details," then rewrite without "graphic +details"). Policy Projector supports interactive policy authoring using LLM +classification and steering and a map visualization reflecting the policy +designer's work. In an evaluation with 12 AI safety experts, our system helps +policy designers to address problematic model behaviors extending beyond an +existing, comprehensive harm taxonomy. + +
+
+
+
+
+ + ☆ LangSAMP: Language-Script Aware Multilingual Pretraining + + +
+ Recent multilingual pretrained language models (mPLMs) often avoid using +language embeddings -- learnable vectors assigned to different languages. These +embeddings are discarded for two main reasons: (1) mPLMs are expected to have a +single, unified parameter set across all languages, and (2) they need to +function seamlessly as universal text encoders without requiring language IDs +as input. However, this removal increases the burden on token embeddings to +encode all language-specific information, which may hinder the model's ability +to produce more language-neutral representations. To address this challenge, we +propose Language-Script Aware Multilingual Pretraining (LangSAMP), a method +that incorporates both language and script embeddings to enhance representation +learning while maintaining a simple architecture. Specifically, we integrate +these embeddings into the output of the transformer blocks before passing the +final representations to the language modeling head for prediction. We apply +LangSAMP to the continual pretraining of XLM-R on a highly multilingual corpus +covering more than 500 languages. The resulting model consistently outperforms +the baseline. Extensive analysis further shows that language/script embeddings +encode language/script-specific information, which improves the selection of +source languages for crosslingual transfer. We make our code and models +publicly available at \url{https://github.com/cisnlp/LangSAMP}. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ LowREm: A Repository of Word Embeddings for 87 Low-Resource Languages + Enhanced with Multilingual Graph Knowledge + + +
+ Contextualized embeddings based on large language models (LLMs) are available +for various languages, but their coverage is often limited for lower resourced +languages. Training LLMs for such languages is often difficult due to +insufficient data and high computational cost. Especially for very low resource +languages, static word embeddings thus still offer a viable alternative. There +is, however, a notable lack of comprehensive repositories with such embeddings +for diverse languages. To address this, we present LowREm, a centralized +repository of static embeddings for 87 low-resource languages. We also propose +a novel method to enhance GloVe-based embeddings by integrating multilingual +graph knowledge, utilizing another source of knowledge. We demonstrate the +superior performance of our enhanced embeddings as compared to contextualized +embeddings extracted from XLM-R on sentiment analysis. Our code and data are +publicly available under https://huggingface.co/DFKI. + +
+
+ comment: Short paper, preview +
+
+
+
+
+ + ☆ Evaluation of Large Language Models for Summarization Tasks in the + Medical Domain: A Narrative Review + + +
+ Large Language Models have advanced clinical Natural Language Generation, +creating opportunities to manage the volume of medical text. However, the +high-stakes nature of medicine requires reliable evaluation, which remains a +challenge. In this narrative review, we assess the current evaluation state for +clinical summarization tasks and propose future directions to address the +resource constraints of expert human evaluation. + +
+
+
+
+
+ + ☆ Data-Prep-Kit: getting your data ready for LLM application development + + +
+ Data preparation is the first and a very important step towards any Large +Language Model (LLM) development. This paper introduces an easy-to-use, +extensible, and scale-flexible open-source data preparation toolkit called Data +Prep Kit (DPK). DPK is architected and designed to enable users to scale their +data preparation to their needs. With DPK they can prepare data on a local +machine or effortlessly scale to run on a cluster with thousands of CPU Cores. +DPK comes with a highly scalable, yet extensible set of modules that transform +natural language and code data. If the user needs additional transforms, they +can be easily developed using extensive DPK support for transform creation. +These modules can be used independently or pipelined to perform a series of +operations. In this paper, we describe DPK architecture and show its +performance from a small scale to a very large number of CPUs. The modules from +DPK have been used for the preparation of Granite Models [1] [2]. We believe +DPK is a valuable contribution to the AI community to easily prepare data to +enhance the performance of their LLM models or to fine-tune models with +Retrieval-Augmented Generation (RAG). + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Is It Good Data for Multilingual Instruction Tuning or Just Bad + Multilingual Evaluation for Large Language Models? EMNLP 2024 + + +
+ Multilingual large language models are designed, claimed, and expected to +cater to speakers of varied languages. We hypothesise that the current +practices of fine-tuning and evaluating these models may not perfectly align +with this objective owing to a heavy reliance on translation, which cannot +cover language-specific knowledge but can introduce translation defects. It +remains unknown whether the nature of the instruction data has an impact on the +model output; conversely, it is questionable whether translated test sets can +capture such nuances. Due to the often coupled practices of using translated +data in both stages, such imperfections could have been overlooked. This work +investigates these issues using controlled native or translated data during the +instruction tuning and evaluation stages. We show that native or generation +benchmarks reveal a notable difference between native and translated +instruction data especially when model performance is high, whereas other types +of test sets cannot. The comparison between round-trip and single-pass +translations reflects the importance of knowledge from language-native +resources. Finally, we demonstrate that regularization is beneficial to +bridging this gap on structured but not generative tasks. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ KAG: Boosting LLMs in Professional Domains via Knowledge Augmented + Generation + + +
+ The recently developed retrieval-augmented generation (RAG) technology has +enabled the efficient construction of domain-specific applications. However, it +also has limitations, including the gap between vector similarity and the +relevance of knowledge reasoning, as well as insensitivity to knowledge logic, +such as numerical values, temporal relations, expert rules, and others, which +hinder the effectiveness of professional knowledge services. In this work, we +introduce a professional domain knowledge service framework called Knowledge +Augmented Generation (KAG). KAG is designed to address the aforementioned +challenges with the motivation of making full use of the advantages of +knowledge graph(KG) and vector retrieval, and to improve generation and +reasoning performance by bidirectionally enhancing large language models (LLMs) +and KGs through five key aspects: (1) LLM-friendly knowledge representation, +(2) mutual-indexing between knowledge graphs and original chunks, (3) +logical-form-guided hybrid reasoning engine, (4) knowledge alignment with +semantic reasoning, and (5) model capability enhancement for KAG. We compared +KAG with existing RAG methods in multihop question answering and found that it +significantly outperforms state-of-theart methods, achieving a relative +improvement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We +have successfully applied KAG to two professional knowledge Q&A tasks of Ant +Group, including E-Government Q&A and E-Health Q&A, achieving significant +improvement in professionalism compared to RAG methods. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ Recent Trends in Unsupervised Summarization + + +
+ Unsupervised summarization is a powerful technique that enables training +summarizing models without requiring labeled datasets. This survey covers +different recent techniques and models used for unsupervised summarization. We +cover extractive, abstractive, and hybrid models and strategies used to achieve +unsupervised summarization. While the main focus of this survey is on recent +research, we also cover some of the important previous research. We +additionally introduce a taxonomy, classifying different research based on +their approach to unsupervised training. Finally, we discuss the current +approaches and mention some datasets and evaluation methods. + +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ Granularity is crucial when applying differential privacy to text: An + investigation for neural machine translation EMNLP + + +
+ Applying differential privacy (DP) by means of the DP-SGD algorithm to +protect individual data points during training is becoming increasingly popular +in NLP. However, the choice of granularity at which DP is applied is often +neglected. For example, neural machine translation (NMT) typically operates on +the sentence-level granularity. From the perspective of DP, this setup assumes +that each sentence belongs to a single person and any two sentences in the +training dataset are independent. This assumption is however violated in many +real-world NMT datasets, e.g., those including dialogues. For proper +application of DP we thus must shift from sentences to entire documents. In +this paper, we investigate NMT at both the sentence and document levels, +analyzing the privacy/utility trade-off for both scenarios, and evaluating the +risks of not using the appropriate privacy granularity in terms of leaking +personally identifiable information (PII). Our findings indicate that the +document-level NMT system is more resistant to membership inference attacks, +emphasizing the significance of using the appropriate granularity when working +with DP. + +
+
+ comment: Accepted at EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Transformers, Contextualism, and Polysemy + + +
+ The transformer architecture, introduced by Vaswani et al. (2017), is at the +heart of the remarkable recent progress in the development of language models, +including widely-used chatbots such as Chat-GPT and Claude. In this paper, I +argue that we can extract from the way the transformer architecture works a +theory of the relationship between context and meaning. I call this the +transformer theory, and I argue that it is novel with regard to two related +philosophical debates: the contextualism debate regarding the extent of +context-sensitivity across natural language, and the polysemy debate regarding +how polysemy should be captured within an account of word meaning. + +
+
+
+
+
+ + ♻ ☆ Investigating OCR-Sensitive Neurons to Improve Entity Recognition in + Historical Documents + + +
+ This paper investigates the presence of OCR-sensitive neurons within the +Transformer architecture and their influence on named entity recognition (NER) +performance on historical documents. By analysing neuron activation patterns in +response to clean and noisy text inputs, we identify and then neutralise +OCR-sensitive neurons to improve model performance. Based on two open access +large language models (Llama2 and Mistral), experiments demonstrate the +existence of OCR-sensitive regions and show improvements in NER performance on +historical newspapers and classical commentaries, highlighting the potential of +targeted neuron modulation to improve models' performance on noisy text. + +
+
+
+
+
+ + ♻ ☆ AC4: Algebraic Computation Checker for Circuit Constraints in ZKPs + + +
+ Zero-knowledge proof (ZKP) systems have surged attention and held a +fundamental role in contemporary cryptography. Zero-knowledge succinct +non-interactive argument of knowledge (zk-SNARK) protocols dominate the ZKP +usage, implemented through arithmetic circuit programming paradigm. However, +underconstrained or overconstrained circuits may lead to bugs. The former +refers to circuits that lack the necessary constraints, resulting in unexpected +solutions and causing the verifier to accept a bogus witness, and the latter +refers to circuits that are constrained excessively, resulting in lacking +necessary solutions and causing the verifier to accept no witness. This paper +introduces a novel approach for pinpointing two distinct types of bugs in ZKP +circuits. The method involves encoding the arithmetic circuit constraints to +polynomial equation systems and solving them over finite fields by the computer +algebra system. The classification of verification results is refined, greatly +enhancing the expressive power of the system. A tool, AC4, is proposed to +represent the implementation of the method. Experiments show that AC4 +demonstrates a increase in the checked ratio, showing a 29% improvement over +Picus, a checker for Circom circuits, and a 10% improvement over +halo2-analyzer, a checker for halo2 circuits. Within a solvable range, the +checking time has also exhibited noticeable improvement, demonstrating a +magnitude increase compared to previous efforts. + +
+
+ comment: 24 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ GTSinger: A Global Multi-Technique Singing Corpus with Realistic Music + Scores for All Singing Tasks NeurIPS 2024 + + +
+ The scarcity of high-quality and multi-task singing datasets significantly +hinders the development of diverse controllable and personalized singing tasks, +as existing singing datasets suffer from low quality, limited diversity of +languages and singers, absence of multi-technique information and realistic +music scores, and poor task suitability. To tackle these problems, we present +GTSinger, a large global, multi-technique, free-to-use, high-quality singing +corpus with realistic music scores, designed for all singing tasks, along with +its benchmarks. Particularly, (1) we collect 80.59 hours of high-quality +singing voices, forming the largest recorded singing dataset; (2) 20 +professional singers across nine widely spoken languages offer diverse timbres +and styles; (3) we provide controlled comparison and phoneme-level annotations +of six commonly used singing techniques, helping technique modeling and +control; (4) GTSinger offers realistic music scores, assisting real-world +musical composition; (5) singing voices are accompanied by manual +phoneme-to-audio alignments, global style labels, and 16.16 hours of paired +speech for various singing tasks. Moreover, to facilitate the use of GTSinger, +we conduct four benchmark experiments: technique-controllable singing voice +synthesis, technique recognition, style transfer, and speech-to-singing +conversion. The corpus and demos can be found at http://gtsinger.github.io. We +provide the dataset and the code for processing data and conducting benchmarks +at https://huggingface.co/datasets/GTSinger/GTSinger and +https://github.com/GTSinger/GTSinger. + +
+
+ comment: Accepted by NeurIPS 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ EfficientRAG: Efficient Retriever for Multi-Hop Question Answering + + +
+ Retrieval-augmented generation (RAG) methods encounter difficulties when +addressing complex questions like multi-hop queries. While iterative retrieval +methods improve performance by gathering additional information, current +approaches often rely on multiple calls of large language models (LLMs). In +this paper, we introduce EfficientRAG, an efficient retriever for multi-hop +question answering. EfficientRAG iteratively generates new queries without the +need for LLM calls at each iteration and filters out irrelevant information. +Experimental results demonstrate that EfficientRAG surpasses existing RAG +methods on three open-domain multi-hop question-answering datasets. + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ ICON: Improving Inter-Report Consistency in Radiology Report Generation + via Lesion-aware Mixup Augmentation + + +
+ Previous research on radiology report generation has made significant +progress in terms of increasing the clinical accuracy of generated reports. In +this paper, we emphasize another crucial quality that it should possess, i.e., +inter-report consistency, which refers to the capability of generating +consistent reports for semantically equivalent radiographs. This quality is +even of greater significance than the overall report accuracy in terms of +ensuring the system's credibility, as a system prone to providing conflicting +results would severely erode users' trust. Regrettably, existing approaches +struggle to maintain inter-report consistency, exhibiting biases towards common +patterns and susceptibility to lesion variants. To address this issue, we +propose ICON, which improves the inter-report consistency of radiology report +generation. Aiming to enhance the system's ability to capture similarities in +semantically equivalent lesions, our approach first involves extracting lesions +from input images and examining their characteristics. Then, we introduce a +lesion-aware mixup technique to ensure that the representations of the +semantically equivalent lesions align with the same attributes, achieved +through a linear combination during the training phase. Extensive experiments +on three publicly available chest X-ray datasets verify the effectiveness of +our approach, both in terms of improving the consistency and accuracy of the +generated reports. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient + Language Model Inference EMNLP 2024 + + +
+ The development of state-of-the-art generative large language models (LLMs) +disproportionately relies on English-centric tokenizers, vocabulary and +pre-training data. Despite the fact that some LLMs have multilingual +capabilities, recent studies have shown that their inference efficiency +deteriorates when generating text in languages other than English. This results +in increased inference time and costs. Cross-lingual vocabulary adaptation +(CVA) methods have been proposed for adapting models to a target language +aiming to improve downstream performance. However, the effectiveness of these +methods on increasing inference efficiency of generative LLMs has yet to be +explored. In this paper, we perform an empirical study of five CVA methods on +four generative LLMs (including monolingual and multilingual models) across +four typologically-diverse languages and four natural language understanding +tasks. We find that CVA substantially contributes to LLM inference speedups of +up to 271.5\%. We also show that adapting LLMs that have been pre-trained on +more balanced multilingual data results in downstream performance comparable to +the original models. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Abstraction-of-Thought Makes Language Models Better Reasoners EMNLP 2024 + + +
+ Abstract reasoning, the ability to reason from the abstract essence of a +problem, serves as a key to generalization in human reasoning. However, +eliciting language models to perform reasoning with abstraction remains +unexplored. This paper seeks to bridge this gap by introducing a novel +structured reasoning format called Abstraction-of-Thought (AoT). The uniqueness +of AoT lies in its explicit requirement for varying levels of abstraction +within the reasoning process. This approach could elicit language models to +first contemplate on the abstract level before incorporating concrete details, +which is overlooked by the prevailing step-by-step Chain-of-Thought (CoT) +method. To align models with the AoT format, we present AoT Collection, a +generic finetuning dataset consisting of 348k high-quality samples with AoT +reasoning processes, collected via an automated and scalable pipeline. We +finetune a wide range of language models with AoT Collection and conduct +extensive evaluations on 23 unseen tasks from the challenging benchmark +Big-Bench Hard. Experimental results indicate that models aligned to AoT +reasoning format substantially outperform those aligned to CoT in many +reasoning tasks. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ DAPE: Data-Adaptive Positional Encoding for Length Extrapolation NeurIPS 2024 + + +
+ Positional encoding plays a crucial role in transformers, significantly +impacting model performance and length generalization. Prior research has +introduced absolute positional encoding (APE) and relative positional encoding +(RPE) to distinguish token positions in given sequences. However, both APE and +RPE remain fixed after model training regardless of input data, limiting their +adaptability and flexibility. Hence, we expect that the desired positional +encoding should be data-adaptive and can be dynamically adjusted with the given +attention. In this paper, we propose a Data-Adaptive Positional Encoding (DAPE) +method, which dynamically and semantically adjusts based on input context and +learned fixed priors. Experimental validation on real-world datasets (Arxiv, +Books3, and CHE) demonstrates that DAPE enhances model performances in terms of +trained length and length generalization, where the improvements are +statistically significant. The model visualization suggests that our model can +keep both local and anti-local information. Finally, we successfully train the +model on sequence length 128 and achieve better performance at evaluation +sequence length 8192, compared with other static positional encoding methods, +revealing the benefit of the adaptive positional encoding method. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ Fine Tuning vs. Retrieval Augmented Generation for Less Popular + Knowledge + + +
+ Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting +strong performance across diverse tasks and domains. However, it has been +observed that the performance diminishes when dealing with less-popular or +low-frequency concepts and entities, for example in domain specific +applications. The two prominent approaches to enhance the performance of LMs on +low-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning +(FT) over synthetic data. This paper explores and evaluates the impact of RAG +and FT on customizing LMs in handling low-frequency entities on question +answering tasks. We conduct extensive experiments on twelve LMs of varying size +and type and different fine tuning, data augmentation, and retrieval models. +Our findings indicate that while FT boosts the performance across entities of +varying popularity, RAG surpasses FT by a large margin particularly for least +popular factual knowledge. Additionally, the success of both RAG and FT +approaches is amplified by improving retrieval and data augmentation +techniques. Fine tuning, while beneficial for small LMs, requires extensive +resources. To address this issue, we propose the new Stimulus RAG approach that +surpasses the effectiveness of fine tuning based approaches, thereby +eliminating the need for the costly data augmentation and fine tuning step for +enriching LMs with less popular factual knowledge. + +
+
+
+
+
+ + ♻ ☆ J2N -- Nominal Adjective Identification and its Application + + +
+ This paper explores the challenges posed by nominal adjectives (NAs) in +natural language processing (NLP) tasks, particularly in part-of-speech (POS) +tagging. We propose treating NAs as a distinct POS tag, "JN," and investigate +its impact on POS tagging, BIO chunking, and coreference resolution. Our study +shows that reclassifying NAs can improve the accuracy of syntactic analysis and +structural understanding in NLP. We present experimental results using Hidden +Markov Models (HMMs), Maximum Entropy (MaxEnt) models, and Spacy, demonstrating +the feasibility and potential benefits of this approach. Additionally we +trained a bert model to identify the NA in untagged text. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ NumeroLogic: Number Encoding for Enhanced LLMs' Numerical Reasoning + + +
+ Language models struggle with handling numerical data and performing +arithmetic operations. We hypothesize that this limitation can be partially +attributed to non-intuitive textual numbers representation. When a digit is +read or generated by a causal language model it does not know its place value +(e.g. thousands vs. hundreds) until the entire number is processed. To address +this issue, we propose a simple adjustment to how numbers are represented by +including the count of digits before each number. For instance, instead of +"42", we suggest using "{2:42}" as the new format. This approach, which we term +NumeroLogic, offers an added advantage in number generation by serving as a +Chain of Thought (CoT). By requiring the model to consider the number of digits +first, it enhances the reasoning process before generating the actual number. +We use arithmetic tasks to demonstrate the effectiveness of the NumeroLogic +formatting. We further demonstrate NumeroLogic applicability to general natural +language modeling, improving language understanding performance in the MMLU +benchmark. + +
+
+
+
+
+ + ♻ ☆ Leveraging summary of radiology reports with transformers + + +
+ Two fundamental problems in health-care stem from patient handoff and triage. +Doctors are often required to perform complex findings summarization to +facilitate efficient communication with specialists and decision making on the +urgency of each case. To address these challenges, we present a state of the +art radiology report summarization model utilizing adjusted bidirectional +encoder representation from transformers BERTtoBERT encoder and decoder +architecture. We also provide a data processing pipeline for future models +developed on the the MIMIC CXR dataset. Our approach includes a novel method +for augmenting medical data and a comprehensive performance analysis. Our best +performing model achieved a recall oriented understudy for gisting evaluation L +F1 score of 58.75/100, outperforming specialized checkpoints with more +sophisticated attention mechanisms. We also provide a data processing pipeline +for future models developed on the MIMIC chest X-ray dataset. The model +introduced in this paper demonstrates significantly improved capacity in +radiology report summarization, highlighting the potential for ensuring better +clinical workflows and enhanced patient care. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ QRMeM: Unleash the Length Limitation through Question then Reflection + Memory Mechanism EMNLP 2024 + + +
+ While large language models (LLMs) have made notable advancements in natural +language processing, they continue to struggle with processing extensive text. +Memory mechanism offers a flexible solution for managing long contexts, +utilizing techniques such as compression, summarization, and structuring to +facilitate nuanced and efficient handling of large volumes of text. However, +existing techniques face challenges with static knowledge integration, leading +to insufficient adaptation to task-specific needs and missing +multi-segmentation relationships, which hinders the dynamic reorganization and +logical combination of relevant segments during the response process. To +address these issues, we introduce a novel strategy, Question then Reflection +Memory Mechanism (QRMeM), incorporating a dual-structured memory pool. This +pool synergizes static textual content with structured graph guidance, +fostering a reflective trial-and-error approach for navigating and identifying +relevant segments. Our evaluation across multiple-choice questions (MCQ) and +multi-document question answering (Multi-doc QA) benchmarks showcases QRMeM +enhanced performance compared to existing approaches. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ MMCode: Benchmarking Multimodal Large Language Models for Code + Generation with Visually Rich Programming Problems EMNLP 2024 + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/likaixin2000/MMCode. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Explanation Regularisation through the Lens of Attributions + + +
+ Explanation regularisation (ER) has been introduced as a way to guide text +classifiers to form their predictions relying on input tokens that humans +consider plausible. This is achieved by introducing an auxiliary explanation +loss that measures how well the output of an input attribution technique for +the model agrees with human-annotated rationales. The guidance appears to +benefit performance in out-of-domain (OOD) settings, presumably due to an +increased reliance on "plausible" tokens. However, previous work has +under-explored the impact of guidance on that reliance, particularly when +reliance is measured using attribution techniques different from those used to +guide the model. In this work, we seek to close this gap, and also explore the +relationship between reliance on plausible features and OOD performance. We +find that the connection between ER and the ability of a classifier to rely on +plausible features has been overstated and that a stronger reliance on +plausible tokens does not seem to be the cause for OOD improvements. + +
+
+ comment: 22 pages, 14 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ AutoScraper: A Progressive Understanding Web Agent for Web Scraper + Generation EMNLP 2024 + + +
+ Web scraping is a powerful technique that extracts data from websites, +enabling automated data collection, enhancing data analysis capabilities, and +minimizing manual data entry efforts. Existing methods, wrappers-based methods +suffer from limited adaptability and scalability when faced with a new website, +while language agents, empowered by large language models (LLMs), exhibit poor +reusability in diverse web environments. In this work, we introduce the +paradigm of generating web scrapers with LLMs and propose AutoScraper, a +two-stage framework that can handle diverse and changing web environments more +efficiently. AutoScraper leverages the hierarchical structure of HTML and +similarity across different web pages for generating web scrapers. Besides, we +propose a new executability metric for better measuring the performance of web +scraper generation tasks. We conduct comprehensive experiments with multiple +LLMs and demonstrate the effectiveness of our framework. Resources of this +paper can be found at \url{https://github.com/EZ-hwh/AutoScraper} + +
+
+ comment: 19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Can Large Language Models Faithfully Express Their Intrinsic Uncertainty + in Words? EMNLP 2024 + + +
+ We posit that large language models (LLMs) should be capable of expressing +their intrinsic uncertainty in natural language. For example, if the LLM is +equally likely to output two contradicting answers to the same question, then +its generated response should reflect this uncertainty by hedging its answer +(e.g., "I'm not sure, but I think..."). We formalize faithful response +uncertainty based on the gap between the model's intrinsic confidence in the +assertions it makes and the decisiveness by which they are conveyed. This +example-level metric reliably indicates whether the model reflects its +uncertainty, as it penalizes both excessive and insufficient hedging. We +evaluate a variety of aligned LLMs at faithfully communicating uncertainty on +several knowledge-intensive question answering tasks. Our results provide +strong evidence that modern LLMs are poor at faithfully conveying their +uncertainty, and that better alignment is necessary to improve their +trustworthiness. + +
+
+ comment: To appear in EMNLP 2024 (main conference) +
+
+
+
+
+ + ♻ ☆ Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs EMNLP2024 + + +
+ Although human evaluation remains the gold standard for open-domain dialogue +evaluation, the growing popularity of automated evaluation using Large Language +Models (LLMs) has also extended to dialogue. However, most frameworks leverage +benchmarks that assess older chatbots on aspects such as fluency and relevance, +which are not reflective of the challenges associated with contemporary models. +In fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset, +suggests that current chatbots may exhibit several recurring issues related to +coherence and commonsense knowledge, but generally produce highly fluent and +relevant responses. + Noting the aforementioned limitations, this paper introduces Soda-Eval, an +annotated dataset based on Soda that covers over 120K turn-level assessments +across 10K dialogues, where the annotations were generated by GPT-4. Using +Soda-Eval as a benchmark, we then study the performance of several open-access +instruction-tuned LLMs, finding that dialogue evaluation remains challenging. +Fine-tuning these models improves performance over few-shot inferences, both in +terms of correlation and explanation. + +
+
+ comment: Accepted to EMNLP2024 (findings) +
+
+
+
+
+ + ♻ ☆ Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs + + +
+ Two lines of approaches are adopted for complex reasoning with LLMs. One line +of work prompts LLMs with various reasoning structures, while the structural +outputs can be naturally regarded as intermediate reasoning steps. Another line +of work adopt LLM-free declarative solvers to do the reasoning task, rendering +higher reasoning accuracy but lacking interpretability due to the black-box +nature of the solvers. Aiming to resolve the trade-off between answer accuracy +and interpretability, we present a simple extension to the latter line of work. +Specifically, we showcase that the intermediate search logs generated by Prolog +interpreters can be accessed and interpreted into human-readable reasoning +proofs. As long as LLMs correctly translate problem descriptions into Prolog +representations, the corresponding reasoning proofs are ensured to be causal +and reliable. On two logical reasoning and one arithmetic reasoning datasets, +our framework obtains significant improvements in terms of both answer accuracy +and reasoning proof accuracy. Our code is released at +https://github.com/DAMO-NLP-SG/CaRing + +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs + + +
+ Training large language models (LLMs) for external tool usage is a rapidly +expanding field, with recent research focusing on generating synthetic data to +address the shortage of available data. However, the absence of systematic data +quality checks poses complications for properly training and testing models. To +that end, we propose two approaches for assessing the reliability of data for +training LLMs to use external tools. The first approach uses intuitive, +human-defined correctness criteria. The second approach uses a model-driven +assessment with in-context evaluation. We conduct a thorough evaluation of data +quality on two popular benchmarks, followed by an extrinsic evaluation that +showcases the impact of data quality on model performance. Our results +demonstrate that models trained on high-quality data outperform those trained +on unvalidated data, even when trained with a smaller quantity of data. These +findings empirically support the significance of assessing and ensuring the +reliability of training data for tool-using LLMs. + +
+
+
+
+
+ + ♻ ☆ Unused information in token probability distribution of generative LLM: + improving LLM reading comprehension through calculation of expected values + + +
+ LLM text decoding is key component for perceived LLM quality. We demonstrate +two experiments showing that decoding methods could be improved by manipulation +of token probabilities. First, we test few LLM on SummEval summary scoring +dataset, to measure reading comprehension. We compare scores from greedy +decoding to expected values over the next token distribution. We scale logits +by large temperature to increase the entropy of scores. This allows strong +improvement of performance on SummEval (in terms of correlations to human +judgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from +20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part +of the gain seems related to positional bias. Secondly, we use +probability-based tree sampling algorithm, to examine all most probable +generations for given prompt. + +
+
+ comment: 7 pages, 1 figure, presented at FEDCSIS 2024 conference, +
+
+
+
+
+ + ♻ ☆ How does Architecture Influence the Base Capabilities of Pre-trained + Language Models? A Case Study Based on FFN-Wider and MoE Transformers + + +
+ Pre-trained language models have been proven to possess strong base +capabilities, which not only excel in in-distribution language modeling but +also show powerful abilities in out-of-distribution language modeling, transfer +learning and few-shot learning. Unlike existing work focusing on the influence +of scale on base capabilities, our work examines the influence of architecture +on those. Specifically, our concern is: How does architecture influence the +base capabilities of pre-trained language models? In this work, we attempt to +explain and reverse the decline in base capabilities caused by the architecture +of FFN-Wider Transformers, seeking to provide some insights. Through analysis, +we found the contribution ratio of Multi-Head Attention (a combination +function) to pre-trained language modeling is a key factor affecting base +capabilities. FFN-Wider Transformers reduce the contribution ratio of this +combination function, leading to a decline in base capabilities. We confirmed +this by experiments and proposed Combination Enhanced Architecture (CEA) to +address the decline in base capabilities of such models. Significantly, we +extended our explanation and CEA to Mixture of Experts (MoE) Transformers. We +successfully achieved significant improvements in base capabilities on a 14B +parameter MoE model, demonstrating the practical application value of our work. +This also indicates that our analysis has a certain guiding significance for +architecture analysis, architecture improvement and architecture design. + +
+
+
+
+
+ + ♻ ☆ CHIQ: Contextual History Enhancement for Improving Query Rewriting in + Conversational Search EMNLP 2024 + + +
+ In this paper, we study how open-source large language models (LLMs) can be +effectively deployed for improving query rewriting in conversational search, +especially for ambiguous queries. We introduce CHIQ, a two-step method that +leverages the capabilities of LLMs to resolve ambiguities in the conversation +history before query rewriting. This approach contrasts with prior studies that +predominantly use closed-source LLMs to directly generate search queries from +conversation history. We demonstrate on five well-established benchmarks that +CHIQ leads to state-of-the-art results across most settings, showing highly +competitive performances with systems leveraging closed-source LLMs. Our study +provides a first step towards leveraging open-source LLMs in conversational +search, as a competitive alternative to the prevailing reliance on commercial +LLMs. Data, models, and source code will be publicly available upon acceptance +at https://github.com/fengranMark/CHIQ. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ MPCODER: Multi-user Personalized Code Generator with Explicit and + Implicit Style Representation Learning ACL 2024 + + +
+ Large Language Models (LLMs) have demonstrated great potential for assisting +developers in their daily development. However, most research focuses on +generating correct code, how to use LLMs to generate personalized code has +seldom been investigated. To bridge this gap, we proposed MPCoder (Multi-user +Personalized Code Generator) to generate personalized code for multiple users. +To better learn coding style features, we utilize explicit coding style +residual learning to capture the syntax code style standards and implicit style +learning to capture the semantic code style conventions. We train a multi-user +style adapter to better differentiate the implicit feature representations of +different users through contrastive learning, ultimately enabling personalized +code generation for multiple users. We further propose a novel evaluation +metric for estimating similarities between codes of different coding styles. +The experimental results show the effectiveness of our approach for this novel +task. + +
+
+ comment: Accepted by ACL 2024, Main Conference +
+
+
+
+
+ + ♻ ☆ TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and + Multi-Level Style Control EMNLP 2024 + + +
+ Zero-shot singing voice synthesis (SVS) with style transfer and style control +aims to generate high-quality singing voices with unseen timbres and styles +(including singing method, emotion, rhythm, technique, and pronunciation) from +audio and text prompts. However, the multifaceted nature of singing styles +poses a significant challenge for effective modeling, transfer, and control. +Furthermore, current SVS models often fail to generate singing voices rich in +stylistic nuances for unseen singers. To address these challenges, we introduce +TCSinger, the first zero-shot SVS model for style transfer across cross-lingual +speech and singing styles, along with multi-level style control. Specifically, +TCSinger proposes three primary modules: 1) the clustering style encoder +employs a clustering vector quantization model to stably condense style +information into a compact latent space; 2) the Style and Duration Language +Model (S\&D-LM) concurrently predicts style information and phoneme duration, +which benefits both; 3) the style adaptive decoder uses a novel mel-style +adaptive normalization method to generate singing voices with enhanced details. +Experimental results show that TCSinger outperforms all baseline models in +synthesis quality, singer similarity, and style controllability across various +tasks, including zero-shot style transfer, multi-level style control, +cross-lingual style transfer, and speech-to-singing style transfer. Singing +voice samples can be accessed at https://tcsinger.github.io/. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Humans or LLMs as the Judge? A Study on Judgement Biases EMNLP2024 + + +
+ Adopting human and large language models (LLM) as judges (a.k.a human- and +LLM-as-a-judge) for evaluating the performance of LLMs has recently gained +attention. Nonetheless, this approach concurrently introduces potential biases +from human and LLMs, questioning the reliability of the evaluation results. In +this paper, we propose a novel framework that is free from referencing +groundtruth annotations for investigating Misinformation Oversight Bias, Gender +Bias, Authority Bias and Beauty Bias on LLM and human judges. We curate a +dataset referring to the revised Bloom's Taxonomy and conduct thousands of +evaluations. Results show that human and LLM judges are vulnerable to +perturbations to various degrees, and that even the cutting-edge judges possess +considerable biases. We further exploit these biases to conduct attacks on LLM +judges. We hope that our work can notify the community of the bias and +vulnerability of human- and LLM-as-a-judge, as well as the urgency of +developing robust evaluation systems. + +
+
+ comment: EMNLP2024 +
+
+
+
+
+ + ♻ ☆ Can AI writing be salvaged? Mitigating Idiosyncrasies and Improving + Human-AI Alignment in the Writing Process through Edits + + +
+ LLM-based applications are helping people write, and LLM-generated text is +making its way into social media, journalism, and our classrooms. However, the +differences between LLM-generated and human-written text remain unclear. To +explore this, we hired professional writers to edit paragraphs in several +creative domains. We first found these writers agree on undesirable +idiosyncrasies in LLM-generated text, formalizing it into a seven-category +taxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP +corpus: 1,057 LLM-generated paragraphs edited by professional writers according +to our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our +study (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms +of writing quality, revealing common limitations across model families. Third, +we explored automatic editing methods to improve LLM-generated text. A +large-scale preference annotation confirms that although experts largely prefer +text edited by other experts, automatic editing methods show promise in +improving alignment between LLM-generated and human-written text. + +
+
+ comment: NLP+HCI, Behavioral Science +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Emojis in Texts via Self-supervised Graph + Pre-Training EMNLP 2024 + + +
+ Emojis have gained immense popularity on social platforms, serving as a +common means to supplement or replace text. However, existing data mining +approaches generally either completely ignore or simply treat emojis as +ordinary Unicode characters, which may limit the model's ability to grasp the +rich semantic information in emojis and the interaction between emojis and +texts. Thus, it is necessary to release the emoji's power in social media data +mining. To this end, we first construct a heterogeneous graph consisting of +three types of nodes, i.e. post, word and emoji nodes to improve the +representation of different elements in posts. The edges are also well-defined +to model how these three elements interact with each other. To facilitate the +sharing of information among post, word and emoji nodes, we propose a graph +pre-train framework for text and emoji co-modeling, which contains two graph +pre-training tasks: node-level graph contrastive learning and edge-level link +reconstruction learning. Extensive experiments on the Xiaohongshu and Twitter +datasets with two types of downstream tasks demonstrate that our approach +proves significant improvement over previous strong baseline methods. + +
+
+ comment: Accepted by EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ LAViTeR: Learning Aligned Visual and Textual Representations Assisted by + Image and Caption Generation + + +
+ Pre-training visual and textual representations from large-scale image-text +pairs is becoming a standard approach for many downstream vision-language +tasks. The transformer-based models learn inter and intra-modal attention +through a list of self-supervised learning tasks. This paper proposes LAViTeR, +a novel architecture for visual and textual representation learning. The main +module, Visual Textual Alignment (VTA) will be assisted by two auxiliary tasks, +GAN-based image synthesis and Image Captioning. We also propose a new +evaluation metric measuring the similarity between the learnt visual and +textual embedding. The experimental results on two public datasets, CUB and +MS-COCO, demonstrate superior visual and textual representation alignment in +the joint feature embedding space + +
+
+ comment: 15 pages, 10 Figures, 5 Tables. Oral Presentation at Irish Machine + Vision and Image Processing Conference Proceedings, 2024 +
+
+
+
+
+ + ♻ ☆ View From Above: A Framework for Evaluating Distribution Shifts in Model + Behavior + + +
+ When large language models (LLMs) are asked to perform certain tasks, how can +we be sure that their learned representations align with reality? We propose a +domain-agnostic framework for systematically evaluating distribution shifts in +LLMs decision-making processes, where they are given control of mechanisms +governed by pre-defined rules. While individual LLM actions may appear +consistent with expected behavior, across a large number of trials, +statistically significant distribution shifts can emerge. To test this, we +construct a well-defined environment with known outcome logic: blackjack. In +more than 1,000 trials, we uncover statistically significant evidence +suggesting behavioral misalignment in the learned representations of LLM. + +
+
+
+
+
+ + ♻ ☆ Enhancing Post-Hoc Attributions in Long Document Comprehension via + Coarse Grained Answer Decomposition + + +
+ Accurately attributing answer text to its source document is crucial for +developing a reliable question-answering system. However, attribution for long +documents remains largely unexplored. Post-hoc attribution systems are designed +to map answer text back to the source document, yet the granularity of this +mapping has not been addressed. Furthermore, a critical question arises: What +exactly should be attributed? This involves identifying the specific +information units within an answer that require grounding. In this paper, we +propose and investigate a novel approach to the factual decomposition of +generated answers for attribution, employing template-based in-context +learning. To accomplish this, we utilize the question and integrate negative +sampling during few-shot in-context learning for decomposition. This approach +enhances the semantic understanding of both abstractive and extractive answers. +We examine the impact of answer decomposition by providing a thorough +examination of various attribution approaches, ranging from retrieval-based +techniques to LLM-based attributors. + +
+
+
+
+
+ + ♻ ☆ RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective + Weight-Activation Quantization EMNLP 2024 + + +
+ Low-Rank Adaptation (LoRA), as a representative Parameter-Efficient +Fine-Tuning (PEFT)method, significantly enhances the training efficiency by +updating only a small portion of the weights in Large Language Models (LLMs). +Recently, weight-only quantization techniques have also been applied to LoRA +methods to reduce the memory footprint of fine-tuning. However, applying +weight-activation quantization to the LoRA pipeline is under-explored, and we +observe substantial performance degradation primarily due to the presence of +activation outliers. In this work, we propose RoLoRA, the first LoRA-based +scheme for effective weight-activation quantization. RoLoRA utilizes rotation +for outlier elimination and proposes rotation-aware fine-tuning to preserve the +outlier-free characteristics in rotated LLMs. Experimental results show RoLoRA +consistently improves low-bit LoRA convergence and post-training quantization +robustness in weight-activation settings. We evaluate RoLoRA across +LLaMA2-7B/13B, LLaMA3-8B models, achieving up to 29.5% absolute accuracy gain +of 4-bit weight-activation quantized LLaMA2- 13B on commonsense reasoning tasks +compared to LoRA baseline. We further demonstrate its effectiveness on Large +Multimodal Models (LLaVA-1.5-7B). Codes are available at +https://github.com/HuangOwen/RoLoRA + +
+
+ comment: EMNLP 2024 Findings, Codes: https://github.com/HuangOwen/RoLoRA, + Models: + https://huggingface.co/collections/ScarletAce/rolora-66f5f228a90681c7c4512b28 +
+
+
+
+
+ + ♻ Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence + + +
+ We present Eagle (RWKV-5) and Finch (RWKV-6), sequence models improving upon +the RWKV (RWKV-4) architecture. Our architectural design advancements include +multi-headed matrix-valued states and a dynamic recurrence mechanism that +improve expressivity while maintaining the inference efficiency characteristics +of RNNs. We introduce a new multilingual corpus with 1.12 trillion tokens and a +fast tokenizer based on greedy matching for enhanced multilinguality. We +trained four Eagle models, ranging from 0.46 to 7.5 billion parameters, and two +Finch models with 1.6 and 3.1 billion parameters and find that they achieve +competitive performance across a wide variety of benchmarks. We release all our +models on HuggingFace under the Apache 2.0 license. Models at: +https://huggingface.co/RWKV Training code at: https://github.com/RWKV/RWKV-LM +Inference code at: https://github.com/RWKV/ChatRWKV Time-parallel training code +at: https://github.com/RWKV/RWKV-infctx-trainer + +
+
+
+
+
+ + ♻ ☆ Contrastive Learning for Knowledge-Based Question Generation in Large + Language Models + + +
+ With the rapid development of artificial intelligence technology, especially +the increasingly widespread application of question-and-answer systems, +high-quality question generation has become a key component in supporting the +development of these systems. This article focuses on knowledge-based question +generation technology, which aims to enable computers to simulate the human +questioning process based on understanding specific texts or knowledge bases. +In light of the issues of hallucination and knowledge gaps present in +large-scale language models when applied to knowledge-intensive tasks, this +paper proposes an enhanced question generation method that incorporates +contrastive learning. This method utilizes multiple models to jointly mine +domain knowledge and uses contrastive learning to guide the model in reducing +noise and hallucinations in generation. Experimental results show that by +designing prompts containing contrasting examples, the model's performance in +question generation improves considerably, particularly when contrasting +instructions and examples are used simultaneously, leading to the highest +quality of generated questions and improved accuracy. These results demonstrate +that the method proposed in this study, which combines contrasting context and +chain-of-thought prompts, can effectively improve both the quality and the +practicality of question generation. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ What Are the Odds? Language Models Are Capable of Probabilistic + Reasoning + + +
+ Language models (LM) are capable of remarkably complex linguistic tasks; +however, numerical reasoning is an area in which they frequently struggle. An +important but rarely evaluated form of reasoning is understanding probability +distributions. In this paper, we focus on evaluating the probabilistic +reasoning capabilities of LMs using idealized and real-world statistical +distributions. We perform a systematic evaluation of state-of-the-art LMs on +three tasks: estimating percentiles, drawing samples, and calculating +probabilities. We evaluate three ways to provide context to LMs 1) anchoring +examples from within a distribution or family of distributions, 2) real-world +context, 3) summary statistics on which to base a Normal approximation. Models +can make inferences about distributions, and can be further aided by the +incorporation of real-world context, example shots and simplified assumptions, +even if these assumptions are incorrect or misspecified. To conduct this work, +we developed a comprehensive benchmark distribution dataset with associated +question-answer pairs that we will release publicly. + +
+
+ comment: 21 pages, 9 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ RISCORE: Enhancing In-Context Riddle Solving in Language Models through + Context-Reconstructed Example Augmentation + + +
+ Riddle-solving requires advanced reasoning skills, pushing LLMs to engage in +abstract thinking and creative problem-solving, often revealing limitations in +their cognitive abilities. In this paper, we examine the riddle-solving +capabilities of LLMs using a multiple-choice format, exploring how different +prompting techniques impact performance on riddles that demand diverse +reasoning skills. To enhance results, we introduce RISCORE (RIddle Solving with +COntext REcontruciton) a novel fully automated prompting method that generates +and utilizes contextually reconstructed sentence-based puzzles in conjunction +with the original examples to create few-shot exemplars. Our experiments +demonstrate that RISCORE significantly improves the performance of language +models in both vertical and lateral thinking tasks, surpassing traditional +exemplar selection strategies across a variety of few-shot settings. + +
+
+
+
+
+ + ♻ ☆ SPEER: Sentence-Level Planning of Long Clinical Summaries via Embedded + Entity Retrieval + + +
+ Clinician must write a lengthy summary each time a patient is discharged from +the hospital. This task is time-consuming due to the sheer number of unique +clinical concepts covered in the admission. Identifying and covering salient +entities is vital for the summary to be clinically useful. We fine-tune +open-source LLMs (Mistral-7B-Instruct and Zephyr-7B-beta) on the task and find +that they generate incomplete and unfaithful summaries. To increase entity +coverage, we train a smaller, encoder-only model to predict salient entities, +which are treated as content-plans to guide the LLM. To encourage the LLM to +focus on specific mentions in the source notes, we propose SPEER: +Sentence-level Planning via Embedded Entity Retrieval. Specifically, we mark +each salient entity span with special "{{ }}" boundary tags and instruct the +LLM to retrieve marked spans before generating each sentence. Sentence-level +planning acts as a form of state tracking in that the model is explicitly +recording the entities it uses. We fine-tune Mistral and Zephyr variants on a +large-scale, diverse dataset of ~167k in-patient hospital admissions and +evaluate on 3 datasets. SPEER shows gains in both coverage and faithfulness +metrics over non-guided and guided baselines. + +
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ GPT-4V(ision) for Robotics: Multimodal Task Planning from Human + Demonstration + + +
+ We introduce a pipeline that enhances a general-purpose Vision Language +Model, GPT-4V(ision), to facilitate one-shot visual teaching for robotic +manipulation. This system analyzes videos of humans performing tasks and +outputs executable robot programs that incorporate insights into affordances. +The process begins with GPT-4V analyzing the videos to obtain textual +explanations of environmental and action details. A GPT-4-based task planner +then encodes these details into a symbolic task plan. Subsequently, vision +systems spatially and temporally ground the task plan in the videos. Objects +are identified using an open-vocabulary object detector, and hand-object +interactions are analyzed to pinpoint moments of grasping and releasing. This +spatiotemporal grounding allows for the gathering of affordance information +(e.g., grasp types, waypoints, and body postures) critical for robot execution. +Experiments across various scenarios demonstrate the method's efficacy in +enabling real robots to operate from one-shot human demonstrations. Meanwhile, +quantitative tests have revealed instances of hallucination in GPT-4V, +highlighting the importance of incorporating human supervision within the +pipeline. The prompts of GPT-4V/GPT-4 are available at this project page: +https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/ + +
+
+ comment: 8 pages, 10 figures, 3 tables. Published in IEEE Robotics and + Automation Letters (RA-L) (in press). Last updated on September 26th, 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 174 + +
+
+
+ + ☆ FlowTurbo: Towards Real-time Flow-Based Image Generation with Velocity + Refiner NeurIPS 2024 + + +
+ Building on the success of diffusion models in visual generation, flow-based +models reemerge as another prominent family of generative models that have +achieved competitive or better performance in terms of both visual quality and +inference speed. By learning the velocity field through flow-matching, +flow-based models tend to produce a straighter sampling trajectory, which is +advantageous during the sampling process. However, unlike diffusion models for +which fast samplers are well-developed, efficient sampling of flow-based +generative models has been rarely explored. In this paper, we propose a +framework called FlowTurbo to accelerate the sampling of flow-based models +while still enhancing the sampling quality. Our primary observation is that the +velocity predictor's outputs in the flow-based models will become stable during +the sampling, enabling the estimation of velocity via a lightweight velocity +refiner. Additionally, we introduce several techniques including a pseudo +corrector and sample-aware compilation to further reduce inference time. Since +FlowTurbo does not change the multi-step sampling paradigm, it can be +effectively applied for various tasks such as image editing, inpainting, etc. +By integrating FlowTurbo into different flow-based models, we obtain an +acceleration ratio of 53.1%$\sim$58.3% on class-conditional generation and +29.8%$\sim$38.5% on text-to-image generation. Notably, FlowTurbo reaches an FID +of 2.12 on ImageNet with 100 (ms / img) and FID of 3.93 with 38 (ms / img), +achieving the real-time image generation and establishing the new +state-of-the-art. Code is available at https://github.com/shiml20/FlowTurbo. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ EgoLM: Multi-Modal Language Model of Egocentric Motions + + +
+ As the prevalence of wearable devices, learning egocentric motions becomes +essential to develop contextual AI. In this work, we present EgoLM, a versatile +framework that tracks and understands egocentric motions from multi-modal +inputs, e.g., egocentric videos and motion sensors. EgoLM exploits rich +contexts for the disambiguation of egomotion tracking and understanding, which +are ill-posed under single modality conditions. To facilitate the versatile and +multi-modal framework, our key insight is to model the joint distribution of +egocentric motions and natural languages using large language models (LLM). +Multi-modal sensor inputs are encoded and projected to the joint latent space +of language models, and used to prompt motion generation or text generation for +egomotion tracking or understanding, respectively. Extensive experiments on +large-scale multi-modal human motion dataset validate the effectiveness of +EgoLM as a generalist model for universal egocentric learning. + +
+
+ comment: Project Page: https://hongfz16.github.io/projects/EgoLM +
+
+
+
+
+ + ☆ LLaVA-3D: A Simple yet Effective Pathway to Empowering LMMs with + 3D-awareness + + +
+ Recent advancements in Large Multimodal Models (LMMs) have greatly enhanced +their proficiency in 2D visual understanding tasks, enabling them to +effectively process and understand images and videos. However, the development +of LMMs with 3D-awareness for 3D scene understanding has been hindered by the +lack of large-scale 3D vision-language datasets and powerful 3D encoders. In +this paper, we introduce a simple yet effective framework called LLaVA-3D. +Leveraging the strong 2D understanding priors from LLaVA, our LLaVA-3D +efficiently adapts LLaVA for 3D scene understanding without compromising 2D +understanding capabilities. To achieve this, we employ a simple yet effective +representation, 3D Patch, which connects 2D CLIP patch features with their +corresponding positions in 3D space. By integrating the 3D Patches into 2D LMMs +and employing joint 2D and 3D vision-language instruction tuning, we establish +a unified architecture for both 2D image understanding and 3D scene +understanding. Experimental results show that LLaVA-3D converges 3.5x faster +than existing 3D LMMs when trained on 3D vision-language datasets. Moreover, +LLaVA-3D not only achieves state-of-the-art performance across various 3D tasks +but also maintains comparable 2D image understanding and vision-language +conversation capabilities with LLaVA. + +
+
+ comment: Project page: https://zcmax.github.io/projects/LLaVA-3D/ +
+
+
+
+
+ + ☆ Lotus: Diffusion-based Visual Foundation Model for High-quality Dense + Prediction + + +
+ Leveraging the visual priors of pre-trained text-to-image diffusion models +offers a promising solution to enhance zero-shot generalization in dense +prediction tasks. However, existing methods often uncritically use the original +diffusion formulation, which may not be optimal due to the fundamental +differences between dense prediction and image generation. In this paper, we +provide a systemic analysis of the diffusion formulation for the dense +prediction, focusing on both quality and efficiency. And we find that the +original parameterization type for image generation, which learns to predict +noise, is harmful for dense prediction; the multi-step noising/denoising +diffusion process is also unnecessary and challenging to optimize. Based on +these insights, we introduce Lotus, a diffusion-based visual foundation model +with a simple yet effective adaptation protocol for dense prediction. +Specifically, Lotus is trained to directly predict annotations instead of +noise, thereby avoiding harmful variance. We also reformulate the diffusion +process into a single-step procedure, simplifying optimization and +significantly boosting inference speed. Additionally, we introduce a novel +tuning strategy called detail preserver, which achieves more accurate and +fine-grained predictions. Without scaling up the training data or model +capacity, Lotus achieves SoTA performance in zero-shot depth and normal +estimation across various datasets. It also significantly enhances efficiency, +being hundreds of times faster than most existing diffusion-based methods. + +
+
+ comment: Project page: https://lotus3d.github.io/ +
+
+
+
+
+ + ☆ Robot See Robot Do: Imitating Articulated Object Manipulation with + Monocular 4D Reconstruction CoRL 2024 + + +
+ Humans can learn to manipulate new objects by simply watching others; +providing robots with the ability to learn from such demonstrations would +enable a natural interface specifying new behaviors. This work develops Robot +See Robot Do (RSRD), a method for imitating articulated object manipulation +from a single monocular RGB human demonstration given a single static +multi-view object scan. We first propose 4D Differentiable Part Models +(4D-DPM), a method for recovering 3D part motion from a monocular video with +differentiable rendering. This analysis-by-synthesis approach uses part-centric +feature fields in an iterative optimization which enables the use of geometric +regularizers to recover 3D motions from only a single video. Given this 4D +reconstruction, the robot replicates object trajectories by planning bimanual +arm motions that induce the demonstrated object part motion. By representing +demonstrations as part-centric trajectories, RSRD focuses on replicating the +demonstration's intended behavior while considering the robot's own +morphological limits, rather than attempting to reproduce the hand's motion. We +evaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part +trajectories and RSRD's physical execution performance on 9 objects across 10 +trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of +87% success rate, for a total end-to-end success rate of 60% across 90 trials. +Notably, this is accomplished using only feature fields distilled from large +pretrained vision models -- without any task-specific training, fine-tuning, +dataset collection, or annotation. Project page: +https://robot-see-robot-do.github.io + +
+
+ comment: CoRL 2024, Project page: https://robot-see-robot-do.github.io +
+
+
+
+
+ + EvMAPPER: High Altitude Orthomapping with Event Cameras + + +
+ Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to +collect images about the world below. One of the most successful applications +of UAVs is to generate orthomosaics or orthomaps, in which a series of images +are integrated together to develop a larger map. However, the use of CMOS-based +cameras with global or rolling shutters mean that orthomaps are vulnerable to +challenging light conditions, motion blur, and high-speed motion of +independently moving objects under the camera. Event cameras are less sensitive +to these issues, as their pixels are able to trigger asynchronously on +brightness changes. This work introduces the first orthomosaic approach using +event cameras. In contrast to existing methods relying only on CMOS cameras, +our approach enables map generation even in challenging light conditions, +including direct sunlight and after sunset. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ EdgeRunner: Auto-regressive Auto-encoder for Artistic Mesh Generation + + +
+ Current auto-regressive mesh generation methods suffer from issues such as +incompleteness, insufficient detail, and poor generalization. In this paper, we +propose an Auto-regressive Auto-encoder (ArAE) model capable of generating +high-quality 3D meshes with up to 4,000 faces at a spatial resolution of +$512^3$. We introduce a novel mesh tokenization algorithm that efficiently +compresses triangular meshes into 1D token sequences, significantly enhancing +training efficiency. Furthermore, our model compresses variable-length +triangular meshes into a fixed-length latent space, enabling training latent +diffusion models for better generalization. Extensive experiments demonstrate +the superior quality, diversity, and generalization capabilities of our model +in both point cloud and image-conditioned mesh generation tasks. + +
+
+ comment: Project Page: https://research.nvidia.com/labs/dir/edgerunner/ +
+
+
+
+
+ + ☆ E.T. Bench: Towards Open-Ended Event-Level Video-Language Understanding NeurIPS 2024 + + +
+ Recent advances in Video Large Language Models (Video-LLMs) have demonstrated +their great potential in general-purpose video understanding. To verify the +significance of these models, a number of benchmarks have been proposed to +diagnose their capabilities in different scenarios. However, existing +benchmarks merely evaluate models through video-level question-answering, +lacking fine-grained event-level assessment and task diversity. To fill this +gap, we introduce E.T. Bench (Event-Level & Time-Sensitive Video Understanding +Benchmark), a large-scale and high-quality benchmark for open-ended event-level +video understanding. Categorized within a 3-level task taxonomy, E.T. Bench +encompasses 7.3K samples under 12 tasks with 7K videos (251.4h total length) +under 8 domains, providing comprehensive evaluations. We extensively evaluated +8 Image-LLMs and 12 Video-LLMs on our benchmark, and the results reveal that +state-of-the-art models for coarse-level (video-level) understanding struggle +to solve our fine-grained tasks, e.g., grounding event-of-interests within +videos, largely due to the short video context length, improper time +representations, and lack of multi-event training data. Focusing on these +issues, we further propose a strong baseline model, E.T. Chat, together with an +instruction-tuning dataset E.T. Instruct 164K tailored for fine-grained +event-level understanding. Our simple but effective solution demonstrates +superior performance in multiple scenarios. + +
+
+ comment: Accepted to NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ MALPOLON: A Framework for Deep Species Distribution Modeling + + +
+ This paper describes a deep-SDM framework, MALPOLON. Written in Python and +built upon the PyTorch library, this framework aims to facilitate training and +inferences of deep species distribution models (deep-SDM) and sharing for users +with only general Python language skills (e.g., modeling ecologists) who are +interested in testing deep learning approaches to build new SDMs. More advanced +users can also benefit from the framework's modularity to run more specific +experiments by overriding existing classes while taking advantage of +press-button examples to train neural networks on multiple classification tasks +using custom or provided raw and pre-processed datasets. The framework is +open-sourced on GitHub and PyPi along with extensive documentation and examples +of use in various scenarios. MALPOLON offers straightforward installation, +YAML-based configuration, parallel computing, multi-GPU utilization, baseline +and foundational models for benchmarking, and extensive +tutorials/documentation, aiming to enhance accessibility and performance +scalability for ecologists and researchers. + +
+
+
+
+
+ + ☆ AI-Powered Augmented Reality for Satellite Assembly, Integration and + Test + + +
+ The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is +set to transform satellite Assembly, Integration, and Testing (AIT) processes +by enhancing precision, minimizing human error, and improving operational +efficiency in cleanroom environments. This paper presents a technical +description of the European Space Agency's (ESA) project "AI for AR in +Satellite AIT," which combines real-time computer vision and AR systems to +assist technicians during satellite assembly. Leveraging Microsoft HoloLens 2 +as the AR interface, the system delivers context-aware instructions and +real-time feedback, tackling the complexities of object recognition and 6D pose +estimation in AIT workflows. All AI models demonstrated over 70% accuracy, with +the detection model exceeding 95% accuracy, indicating a high level of +performance and reliability. A key contribution of this work lies in the +effective use of synthetic data for training AI models in AR applications, +addressing the significant challenges of obtaining real-world datasets in +highly dynamic satellite environments, as well as the creation of the Segmented +Anything Model for Automatic Labelling (SAMAL), which facilitates the automatic +annotation of real data, achieving speeds up to 20 times faster than manual +human annotation. The findings demonstrate the efficacy of AI-driven AR systems +in automating critical satellite assembly tasks, setting a foundation for +future innovations in the space industry. + +
+
+
+
+
+ + ☆ Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine + Segmentation MICCAI 2024 + + +
+ Self-supervised pretraining (SSP) has shown promising results in learning +from large unlabeled datasets and, thus, could be useful for automated +cardiovascular magnetic resonance (CMR) short-axis cine segmentation. However, +inconsistent reports of the benefits of SSP for segmentation have made it +difficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP +methods for CMR cine segmentation. + To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were +used for unlabeled pretraining with four SSP methods; SimCLR, positional +contrastive learning, DINO, and masked image modeling (MIM). Subsets of varying +numbers of subjects were used for supervised fine-tuning of 2D models for each +SSP method, as well as to train a 2D baseline model from scratch. The +fine-tuned models were compared to the baseline using the 3D Dice similarity +coefficient (DSC) in a test dataset of 140 subjects. + The SSP methods showed no performance gains with the largest supervised +fine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects +(231 2D slices) are available for supervised training, SSP using MIM (DSC = +0.86) improves over training from scratch (DSC = 0.82). + This study found that SSP is valuable for CMR cine segmentation when labeled +training data is scarce, but does not aid state-of-the-art deep learning +methods when ample labeled data is available. Moreover, the choice of SSP +method is important. The code is publicly available at: +https://github.com/q-cardIA/ssp-cmr-cine-segmentation + +
+
+ comment: Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at + MICCAI 2024 +
+
+
+
+
+ + ☆ EfficientCrackNet: A Lightweight Model for Crack Segmentation + + +
+ Crack detection, particularly from pavement images, presents a formidable +challenge in the domain of computer vision due to several inherent complexities +such as intensity inhomogeneity, intricate topologies, low contrast, and noisy +backgrounds. Automated crack detection is crucial for maintaining the +structural integrity of essential infrastructures, including buildings, +pavements, and bridges. Existing lightweight methods often face challenges +including computational inefficiency, complex crack patterns, and difficult +backgrounds, leading to inaccurate detection and impracticality for real-world +applications. To address these limitations, we propose EfficientCrackNet, a +lightweight hybrid model combining Convolutional Neural Networks (CNNs) and +transformers for precise crack segmentation. EfficientCrackNet integrates +depthwise separable convolutions (DSC) layers and MobileViT block to capture +both global and local features. The model employs an Edge Extraction Method +(EEM) and for efficient crack edge detection without pretraining, and +Ultra-Lightweight Subspace Attention Module (ULSAM) to enhance feature +extraction. Extensive experiments on three benchmark datasets Crack500, +DeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior +performance compared to existing lightweight models, while requiring only 0.26M +parameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance +between accuracy and computational efficiency, outperforming state-of-the-art +lightweight models, and providing a robust and adaptable solution for +real-world crack segmentation. + +
+
+
+
+
+ + ☆ DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion + Probabilistic Models + + +
+ Perception systems play a crucial role in autonomous driving, incorporating +multiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors +are widely used to capture sparse point clouds of the vehicle's surroundings. +However, such systems struggle to perceive occluded areas and gaps in the scene +due to the sparsity of these point clouds and their lack of semantics. To +address these challenges, Semantic Scene Completion (SSC) jointly predicts +unobserved geometry and semantics in the scene given raw LiDAR measurements, +aiming for a more complete scene representation. Building on promising results +of diffusion models in image generation and super-resolution tasks, we propose +their extension to SSC by implementing the noising and denoising diffusion +processes in the point and semantic spaces individually. To control the +generation, we employ semantic LiDAR point clouds as conditional input and +design local and global regularization losses to stabilize the denoising +process. We evaluate our approach on autonomous driving datasets and our +approach outperforms the state-of-the-art for SSC. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Stable Video Portraits ECCV 2024 + + +
+ Rapid advances in the field of generative AI and text-to-image methods in +particular have transformed the way we interact with and perceive +computer-generated imagery today. In parallel, much progress has been made in +3D face reconstruction, using 3D Morphable Models (3DMM). In this paper, we +present SVP, a novel hybrid 2D/3D generation method that outputs photorealistic +videos of talking faces leveraging a large pre-trained text-to-image prior +(2D), controlled via a 3DMM (3D). Specifically, we introduce a person-specific +fine-tuning of a general 2D stable diffusion model which we lift to a video +model by providing temporal 3DMM sequences as conditioning and by introducing a +temporal denoising procedure. As an output, this model generates temporally +smooth imagery of a person with 3DMM-based controls, i.e., a person-specific +avatar. The facial appearance of this person-specific avatar can be edited and +morphed to text-defined celebrities, without any fine-tuning at test time. The +method is analyzed quantitatively and qualitatively, and we show that our +method outperforms state-of-the-art monocular head avatar methods. + +
+
+ comment: Accepted at ECCV 2024, Project: https://svp.is.tue.mpg.de +
+
+
+
+
+ + ☆ SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language + Models for Robotic Garment Manipulation + + +
+ Automating garment manipulation poses a significant challenge for assistive +robotics due to the diverse and deformable nature of garments. Traditional +approaches typically require separate models for each garment type, which +limits scalability and adaptability. In contrast, this paper presents a unified +approach using vision-language models (VLMs) to improve keypoint prediction +across various garment categories. By interpreting both visual and semantic +information, our model enables robots to manage different garment states with a +single model. We created a large-scale synthetic dataset using advanced +simulation techniques, allowing scalable training without extensive real-world +data. Experimental results indicate that the VLM-based method significantly +enhances keypoint detection accuracy and task success rates, providing a more +flexible and general solution for robotic garment manipulation. In addition, +this research also underscores the potential of VLMs to unify various garment +manipulation tasks within a single framework, paving the way for broader +applications in home automation and assistive robotics for future. + +
+
+
+
+
+ + ☆ FreeEdit: Mask-free Reference-based Image Editing with Multi-modal + Instruction + + +
+ Introducing user-specified visual concepts in image editing is highly +practical as these concepts convey the user's intent more precisely than +text-based descriptions. We propose FreeEdit, a novel approach for achieving +such reference-based image editing, which can accurately reproduce the visual +concept from the reference image based on user-friendly language instructions. +Our approach leverages the multi-modal instruction encoder to encode language +instructions to guide the editing process. This implicit way of locating the +editing area eliminates the need for manual editing masks. To enhance the +reconstruction of reference details, we introduce the Decoupled Residual +ReferAttention (DRRA) module. This module is designed to integrate fine-grained +reference features extracted by a detail extractor into the image editing +process in a residual way without interfering with the original self-attention. +Given that existing datasets are unsuitable for reference-based image editing +tasks, particularly due to the difficulty in constructing image triplets that +include a reference image, we curate a high-quality dataset, FreeBench, using a +newly developed twice-repainting scheme. FreeBench comprises the images before +and after editing, detailed editing instructions, as well as a reference image +that maintains the identity of the edited object, encompassing tasks such as +object addition, replacement, and deletion. By conducting phased training on +FreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot +editing through convenient language instructions. We conduct extensive +experiments to evaluate the effectiveness of FreeEdit across multiple task +types, demonstrating its superiority over existing methods. The code will be +available at: https://freeedit.github.io/. + +
+
+ comment: 14 pages, 14 figures, project website: https://freeedit.github.io/ +
+
+
+
+
+ + ☆ LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field ECCV'24 + + +
+ Recent works have shown that neural radiance fields (NeRFs) on top of +parametric models have reached SOTA quality to build photorealistic head +avatars from a monocular video. However, one major limitation of the NeRF-based +avatars is the slow rendering speed due to the dense point sampling of NeRF, +preventing them from broader utility on resource-constrained devices. We +introduce LightAvatar, the first head avatar model based on neural light fields +(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose +via a single network forward pass, without using mesh or volume rendering. The +proposed approach, while being conceptually appealing, poses a significant +challenge towards real-time efficiency and training stability. To resolve them, +we introduce dedicated network designs to obtain proper representations for the +NeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a +distillation-based training strategy that uses a pretrained avatar model as +teacher to synthesize abundant pseudo data for training. A warping field +network is introduced to correct the fitting error in the real data so that the +model can learn better. Extensive experiments suggest that our method can +achieve new SOTA image quality quantitatively or qualitatively, while being +significantly faster than the counterparts, reporting 174.1 FPS (512x512 +resolution) on a consumer-grade GPU (RTX3090) with no customized optimization. + +
+
+ comment: Appear in ECCV'24 CADL Workshop. Code: + https://github.com/MingSun-Tse/LightAvatar-TensorFlow +
+
+
+
+
+ + ☆ Visual Data Diagnosis and Debiasing with Concept Graphs + + +
+ The widespread success of deep learning models today is owed to the curation +of extensive datasets significant in size and complexity. However, such models +frequently pick up inherent biases in the data during the training process, +leading to unreliable predictions. Diagnosing and debiasing datasets is thus a +necessity to ensure reliable model performance. In this paper, we present +CONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence +Biases in visual datasets. CONBIAS represents visual datasets as knowledge +graphs of concepts, enabling meticulous analysis of spurious concept +co-occurrences to uncover concept imbalances across the whole dataset. +Moreover, we show that by employing a novel clique-based concept balancing +strategy, we can mitigate these imbalances, leading to enhanced performance on +downstream tasks. Extensive experiments show that data augmentation based on a +balanced concept distribution augmented by CONBIAS improves generalization +performance across multiple datasets compared to state-of-the-art methods. We +will make our code and data publicly available. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ EMOVA: Empowering Language Models to See, Hear and Speak with Vivid + Emotions + + +
+ GPT-4o, an omni-modal model that enables vocal conversations with diverse +emotions and tones, marks a milestone for omni-modal foundation models. +However, empowering Large Language Models to perceive and generate images, +texts, and speeches end-to-end with publicly available data remains challenging +in the open-source community. Existing vision-language models rely on external +tools for the speech processing, while speech-language models still suffer from +limited or even without vision-understanding abilities. To address this gap, we +propose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large +Language Models with end-to-end speech capabilities while maintaining the +leading vision-language performance. With a semantic-acoustic disentangled +speech tokenizer, we notice surprisingly that omni-modal alignment can further +enhance vision-language and speech abilities compared with the corresponding +bi-modal aligned counterparts. Moreover, a lightweight style module is proposed +for flexible speech style controls (e.g., emotions and pitches). For the first +time, EMOVA achieves state-of-the-art performance on both the vision-language +and speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue +with vivid emotions. + +
+
+ comment: Project Page: https://emova-ollm.github.io/ +
+
+
+
+
+ + ☆ ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty + Learning + + +
+ Vision-centric semantic occupancy prediction plays a crucial role in +autonomous driving, which requires accurate and reliable predictions from +low-cost sensors. Although having notably narrowed the accuracy gap with LiDAR, +there is still few research effort to explore the reliability in predicting +semantic occupancy from camera. In this paper, we conduct a comprehensive +evaluation of existing semantic occupancy prediction models from a reliability +perspective for the first time. Despite the gradual alignment of camera-based +models with LiDAR in term of accuracy, a significant reliability gap persists. +To addresses this concern, we propose ReliOcc, a method designed to enhance the +reliability of camera-based occupancy networks. ReliOcc provides a +plug-and-play scheme for existing models, which integrates hybrid uncertainty +from individual voxels with sampling-based noise and relative voxels through +mix-up learning. Besides, an uncertainty-aware calibration strategy is devised +to further enhance model reliability in offline mode. Extensive experiments +under various settings demonstrate that ReliOcc significantly enhances model +reliability while maintaining the accuracy of both geometric and semantic +predictions. Importantly, our proposed approach exhibits robustness to sensor +failures and out of domain noises during inference. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ Transferring disentangled representations: bridging the gap between + synthetic and real images + + +
+ Developing meaningful and efficient representations that separate the +fundamental structure of the data generation mechanism is crucial in +representation learning. However, Disentangled Representation Learning has not +fully shown its potential on real images, because of correlated generative +factors, their resolution and limited access to ground truth labels. +Specifically on the latter, we investigate the possibility of leveraging +synthetic data to learn general-purpose disentangled representations applicable +to real data, discussing the effect of fine-tuning and what properties of +disentanglement are preserved after the transfer. We provide an extensive +empirical study to address these issues. In addition, we propose a new +interpretable intervention-based metric, to measure the quality of factors +encoding in the representation. Our results indicate that some level of +disentanglement, transferring a representation from synthetic to real data, is +possible and effective. + +
+
+
+
+
+ + ☆ PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless + Imaging NeurIPS 2024 + + +
+ Lensless cameras offer significant advantages in size, weight, and cost +compared to traditional lens-based systems. Without a focusing lens, lensless +cameras rely on computational algorithms to recover the scenes from multiplexed +measurements. However, current algorithms struggle with inaccurate forward +imaging models and insufficient priors to reconstruct high-quality images. To +overcome these limitations, we introduce a novel two-stage approach for +consistent and photorealistic lensless image reconstruction. The first stage of +our approach ensures data consistency by focusing on accurately reconstructing +the low-frequency content with a spatially varying deconvolution method that +adjusts to changes in the Point Spread Function (PSF) across the camera's field +of view. The second stage enhances photorealism by incorporating a generative +prior from pre-trained diffusion models. By conditioning on the low-frequency +content retrieved in the first stage, the diffusion model effectively +reconstructs the high-frequency details that are typically lost in the lensless +imaging process, while also maintaining image fidelity. Our method achieves a +superior balance between data fidelity and visual quality compared to existing +methods, as demonstrated with two popular lensless systems, PhlatCam and +DiffuserCam. Project website: https://phocolens.github.io/. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ InterNet: Unsupervised Cross-modal Homography Estimation Based on + Interleaved Modality Transfer and Self-supervised Homography Prediction + + +
+ We propose a novel unsupervised cross-modal homography estimation framework, +based on interleaved modality transfer and self-supervised homography +prediction, named InterNet. InterNet integrates modality transfer and +self-supervised homography estimation, introducing an innovative interleaved +optimization framework to alternately promote both components. The modality +transfer gradually narrows the modality gaps, facilitating the self-supervised +homography estimation to fully leverage the synthetic intra-modal data. The +self-supervised homography estimation progressively achieves reliable +predictions, thereby providing robust cross-modal supervision for the modality +transfer. To further boost the estimation accuracy, we also formulate a +fine-grained homography feature loss to improve the connection between two +components. Furthermore, we employ a simple yet effective distillation training +technique to reduce model parameters and improve cross-domain generalization +ability while maintaining comparable performance. Experiments reveal that +InterNet achieves the state-of-the-art (SOTA) performance among unsupervised +methods, and even outperforms many supervised methods such as MHN and +LocalTrans. + +
+
+
+
+
+ + ☆ Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or + Low-light Conditions ECCV 2024 + + +
+ The stark contrast in the design philosophy of an event camera makes it +particularly ideal for operating under high-speed, high dynamic range and +low-light conditions, where standard cameras underperform. Nonetheless, event +cameras still suffer from some amount of motion blur, especially under these +challenging conditions, in contrary to what most think. This is attributed to +the limited bandwidth of the event sensor pixel, which is mostly proportional +to the light intensity. Thus, to ensure that event cameras can truly excel in +such conditions where it has an edge over standard cameras, it is crucial to +account for event motion blur in downstream applications, especially +reconstruction. However, none of the recent works on reconstructing Neural +Radiance Fields (NeRFs) from events, nor event simulators, have considered the +full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a +novel method to directly and effectively reconstruct blur-minimal NeRFs from +motion-blurred events generated under high-speed motion or low-light +conditions. The core component of this work is a physically-accurate pixel +bandwidth model proposed to account for event motion blur under arbitrary speed +and lighting conditions. We also introduce a novel threshold-normalized total +variation loss to improve the regularization of large textureless patches. +Experiments on real and novel realistically simulated sequences verify our +effectiveness. Our code, event simulator and synthetic event dataset will be +open-sourced. + +
+
+ comment: Accepted to ECCV 2024. Project website is accessible at + https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with + arXiv:2006.07722 by other authors +
+
+
+
+
+ + ☆ LLM4Brain: Training a Large Language Model for Brain Video Understanding ECCV2024 + + +
+ Decoding visual-semantic information from brain signals, such as functional +MRI (fMRI), across different subjects poses significant challenges, including +low signal-to-noise ratio, limited data availability, and cross-subject +variability. Recent advancements in large language models (LLMs) show +remarkable effectiveness in processing multimodal information. In this study, +we introduce an LLM-based approach for reconstructing visual-semantic +information from fMRI signals elicited by video stimuli. Specifically, we +employ fine-tuning techniques on an fMRI encoder equipped with adaptors to +transform brain responses into latent representations aligned with the video +stimuli. Subsequently, these representations are mapped to textual modality by +LLM. In particular, we integrate self-supervised domain adaptation methods to +enhance the alignment between visual-semantic information and brain responses. +Our proposed method achieves good results using various quantitative semantic +metrics, while yielding similarity with ground-truth information. + +
+
+ comment: ECCV2024 Workshop +
+
+
+
+
+ + ☆ BlinkTrack: Feature Tracking over 100 FPS via Events and Images + + +
+ Feature tracking is crucial for, structure from motion (SFM), simultaneous +localization and mapping (SLAM), object tracking and various computer vision +tasks. Event cameras, known for their high temporal resolution and ability to +capture asynchronous changes, have gained significant attention for their +potential in feature tracking, especially in challenging conditions. However, +event cameras lack the fine-grained texture information that conventional +cameras provide, leading to error accumulation in tracking. To address this, we +propose a novel framework, BlinkTrack, which integrates event data with RGB +images for high-frequency feature tracking. Our method extends the traditional +Kalman filter into a learning-based framework, utilizing differentiable Kalman +filters in both event and image branches. This approach improves +single-modality tracking, resolves ambiguities, and supports asynchronous data +fusion. We also introduce new synthetic and augmented datasets to better +evaluate our model. Experimental results indicate that BlinkTrack significantly +outperforms existing event-based methods, exceeding 100 FPS with preprocessed +event data and 80 FPS with multi-modality data. + +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ Cross-Modality Attack Boosted by Gradient-Evolutionary Multiform + Optimization + + +
+ In recent years, despite significant advancements in adversarial attack +research, the security challenges in cross-modal scenarios, such as the +transferability of adversarial attacks between infrared, thermal, and RGB +images, have been overlooked. These heterogeneous image modalities collected by +different hardware devices are widely prevalent in practical applications, and +the substantial differences between modalities pose significant challenges to +attack transferability. In this work, we explore a novel cross-modal +adversarial attack strategy, termed multiform attack. We propose a dual-layer +optimization framework based on gradient-evolution, facilitating efficient +perturbation transfer between modalities. In the first layer of optimization, +the framework utilizes image gradients to learn universal perturbations within +each modality and employs evolutionary algorithms to search for shared +perturbations with transferability across different modalities through +secondary optimization. Through extensive testing on multiple heterogeneous +datasets, we demonstrate the superiority and robustness of Multiform Attack +compared to existing techniques. This work not only enhances the +transferability of cross-modal adversarial attacks but also provides a new +perspective for understanding security vulnerabilities in cross-modal systems. + +
+
+
+
+
+ + ☆ CNCA: Toward Customizable and Natural Generation of Adversarial + Camouflage for Vehicle Detectors + + +
+ Prior works on physical adversarial camouflage against vehicle detectors +mainly focus on the effectiveness and robustness of the attack. The current +most successful methods optimize 3D vehicle texture at a pixel level. However, +this results in conspicuous and attention-grabbing patterns in the generated +camouflage, which humans can easily identify. To address this issue, we propose +a Customizable and Natural Camouflage Attack (CNCA) method by leveraging an +off-the-shelf pre-trained diffusion model. By sampling the optimal texture +image from the diffusion model with a user-specific text prompt, our method can +generate natural and customizable adversarial camouflage while maintaining high +attack performance. With extensive experiments on the digital and physical +worlds and user studies, the results demonstrate that our proposed method can +generate significantly more natural-looking camouflage than the +state-of-the-art baselines while achieving competitive attack performance. Our +code is available at +\href{https://anonymous.4open.science/r/CNCA-1D54}{https://anonymous.4open.science/r/CNCA-1D54} + +
+
+
+
+
+ + ☆ The Hard Positive Truth about Vision-Language Compositionality ECCV 2024 + + +
+ Several benchmarks have concluded that our best vision-language models (e.g., +CLIP) are lacking in compositionality. Given an image, these benchmarks probe a +model's ability to identify its associated caption amongst a set of +compositional distractors. In response, a surge of recent proposals show +improvements by finetuning CLIP with distractors as hard negatives. Our +investigations reveal that these improvements have, in fact, been significantly +overstated -- because existing benchmarks do not probe whether finetuned +vision-language models remain invariant to hard positives. By curating an +evaluation dataset with 112,382 hard negatives and hard positives, we uncover +that including hard positives decreases CLIP's performance by 12.9%, while +humans perform effortlessly at 99%. CLIP finetuned with hard negatives results +in an even larger decrease, up to 38.7%. With this finding, we then produce a +1,775,259 image-text training set with both hard negative and hard positive +captions. By training with both, we see improvements on existing benchmarks +while simultaneously improving performance on hard positives, indicating a more +robust improvement in compositionality. Our work suggests the need for future +research to rigorously test and improve CLIP's understanding of semantic +relationships between related "positive" concepts. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + Spatial Hierarchy and Temporal Attention Guided Cross Masking for + Self-supervised Skeleton-based Action Recognition + + +
+ In self-supervised skeleton-based action recognition, the mask reconstruction +paradigm is gaining interest in enhancing model refinement and robustness +through effective masking. However, previous works primarily relied on a single +masking criterion, resulting in the model overfitting specific features and +overlooking other effective information. In this paper, we introduce a +hierarchy and attention guided cross-masking framework (HA-CM) that applies +masking to skeleton sequences from both spatial and temporal perspectives. +Specifically, in spatial graphs, we utilize hyperbolic space to maintain joint +distinctions and effectively preserve the hierarchical structure of +high-dimensional skeletons, employing joint hierarchy as the masking criterion. +In temporal flows, we substitute traditional distance metrics with the global +attention of joints for masking, addressing the convergence of distances in +high-dimensional space and the lack of a global perspective. Additionally, we +incorporate cross-contrast loss based on the cross-masking framework into the +loss function to enhance the model's learning of instance-level features. HA-CM +shows efficiency and universality on three public large-scale datasets, NTU-60, +NTU-120, and PKU-MMD. The source code of our HA-CM is available at +https://github.com/YinxPeng/HA-CM-main. + +
+
+ comment: 12 pages,6 figures,IEEE Trans +
+
+
+
+
+ + ☆ Perturb, Attend, Detect and Localize (PADL): Robust Proactive Image + Defense + + +
+ Image manipulation detection and localization have received considerable +attention from the research community given the blooming of Generative Models +(GMs). Detection methods that follow a passive approach may overfit to specific +GMs, limiting their application in real-world scenarios, due to the growing +diversity of generative models. Recently, approaches based on a proactive +framework have shown the possibility of dealing with this limitation. However, +these methods suffer from two main limitations, which raises concerns about +potential vulnerabilities: i) the manipulation detector is not robust to noise +and hence can be easily fooled; ii) the fact that they rely on fixed +perturbations for image protection offers a predictable exploit for malicious +attackers, enabling them to reverse-engineer and evade detection. To overcome +this issue we propose PADL, a new solution able to generate image-specific +perturbations using a symmetric scheme of encoding and decoding based on +cross-attention, which drastically reduces the possibility of reverse +engineering, even when evaluated with adaptive attack [31]. Additionally, PADL +is able to pinpoint manipulated areas, facilitating the identification of +specific regions that have undergone alterations, and has more generalization +power than prior art on held-out generative models. Indeed, although being +trained only on an attribute manipulation GAN model [15], our method +generalizes to a range of unseen models with diverse architectural designs, +such as StarGANv2, BlendGAN, DiffAE, StableDiffusion and StableDiffusionXL. +Additionally, we introduce a novel evaluation protocol, which offers a fair +evaluation of localisation performance in function of detection accuracy and +better captures real-world scenarios. + +
+
+
+
+
+ + ☆ Neural Light Spheres for Implicit Image Stitching and View Synthesis + + +
+ Challenging to capture, and challenging to display on a cellphone screen, the +panorama paradoxically remains both a staple and underused feature of modern +mobile camera applications. In this work we address both of these challenges +with a spherical neural light field model for implicit panoramic image +stitching and re-rendering; able to accommodate for depth parallax, +view-dependent lighting, and local scene motion and color changes during +capture. Fit during test-time to an arbitrary path panoramic video capture -- +vertical, horizontal, random-walk -- these neural light spheres jointly +estimate the camera path and a high-resolution scene reconstruction to produce +novel wide field-of-view projections of the environment. Our single-layer model +avoids expensive volumetric sampling, and decomposes the scene into compact +view-dependent ray offset and color components, with a total model size of 80 +MB per scene, and real-time (50 FPS) rendering at 1080p resolution. We +demonstrate improved reconstruction quality over traditional image stitching +and radiance field methods, with significantly higher tolerance to scene motion +and non-ideal capture settings. + +
+
+ comment: Project site: https://light.princeton.edu/publication/neuls/ +
+
+
+
+
+ + ☆ Resolving Multi-Condition Confusion for Finetuning-Free Personalized + Image Generation + + +
+ Personalized text-to-image generation methods can generate customized images +based on the reference images, which have garnered wide research interest. +Recent methods propose a finetuning-free approach with a decoupled +cross-attention mechanism to generate personalized images requiring no +test-time finetuning. However, when multiple reference images are provided, the +current decoupled cross-attention mechanism encounters the object confusion +problem and fails to map each reference image to its corresponding object, +thereby seriously limiting its scope of application. To address the object +confusion problem, in this work we investigate the relevance of different +positions of the latent image features to the target object in diffusion model, +and accordingly propose a weighted-merge method to merge multiple reference +image features into the corresponding objects. Next, we integrate this +weighted-merge method into existing pre-trained models and continue to train +the model on a multi-object dataset constructed from the open-sourced SA-1B +dataset. To mitigate object confusion and reduce training costs, we propose an +object quality score to estimate the image quality for the selection of +high-quality training samples. Furthermore, our weighted-merge training +framework can be employed on single-object generation when a single object has +multiple reference images. The experiments verify that our method achieves +superior performance to the state-of-the-arts on the Concept101 dataset and +DreamBooth dataset of multi-object personalized image generation, and +remarkably improves the performance on single-object personalized image +generation. Our code is available at https://github.com/hqhQAQ/MIP-Adapter. + +
+
+
+
+
+ + ☆ WaSt-3D: Wasserstein-2 Distance for Scene-to-Scene Stylization on 3D + Gaussians + + +
+ While style transfer techniques have been well-developed for 2D image +stylization, the extension of these methods to 3D scenes remains relatively +unexplored. Existing approaches demonstrate proficiency in transferring colors +and textures but often struggle with replicating the geometry of the scenes. In +our work, we leverage an explicit Gaussian Splatting (GS) representation and +directly match the distributions of Gaussians between style and content scenes +using the Earth Mover's Distance (EMD). By employing the entropy-regularized +Wasserstein-2 distance, we ensure that the transformation maintains spatial +smoothness. Additionally, we decompose the scene stylization problem into +smaller chunks to enhance efficiency. This paradigm shift reframes stylization +from a pure generative process driven by latent space losses to an explicit +matching of distributions between two Gaussian representations. Our method +achieves high-resolution 3D stylization by faithfully transferring details from +3D style scenes onto the content scene. Furthermore, WaSt-3D consistently +delivers results across diverse content and style scenes without necessitating +any training, as it relies solely on optimization-based techniques. See our +project page for additional results and source code: +$\href{https://compvis.github.io/wast3d/}{https://compvis.github.io/wast3d/}$. + +
+
+
+
+
+ + ☆ LKA-ReID:Vehicle Re-Identification with Large Kernel Attention ICASSP 2025 + + +
+ With the rapid development of intelligent transportation systems and the +popularity of smart city infrastructure, Vehicle Re-ID technology has become an +important research field. The vehicle Re-ID task faces an important challenge, +which is the high similarity between different vehicles. Existing methods use +additional detection or segmentation models to extract differentiated local +features. However, these methods either rely on additional annotations or +greatly increase the computational cost. Using attention mechanism to capture +global and local features is crucial to solve the challenge of high similarity +between classes in vehicle Re-ID tasks. In this paper, we propose LKA-ReID with +large kernel attention. Specifically, the large kernel attention (LKA) utilizes +the advantages of self-attention and also benefits from the advantages of +convolution, which can extract the global and local features of the vehicle +more comprehensively. We also introduce hybrid channel attention (HCA) combines +channel attention with spatial information, so that the model can better focus +on channels and feature regions, and ignore background and other disturbing +information. Experiments on VeRi-776 dataset demonstrated the effectiveness of +LKA-ReID, with mAP reaches 86.65% and Rank-1 reaches 98.03%. + +
+
+ comment: The paper is under consideration at 2025 IEEE International + Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025) +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation with Large Kernel Attention ICASSP 2025 + + +
+ Self-supervised monocular depth estimation has emerged as a promising +approach since it does not rely on labeled training data. Most methods combine +convolution and Transformer to model long-distance dependencies to estimate +depth accurately. However, Transformer treats 2D image features as 1D +sequences, and positional encoding somewhat mitigates the loss of spatial +information between different feature blocks, tending to overlook channel +features, which limit the performance of depth estimation. In this paper, we +propose a self-supervised monocular depth estimation network to get finer +details. Specifically, we propose a decoder based on large kernel attention, +which can model long-distance dependencies without compromising the +two-dimension structure of features while maintaining feature channel +adaptivity. In addition, we introduce a up-sampling module to accurately +recover the fine details in the depth map. Our method achieves competitive +results on the KITTI dataset. + +
+
+ comment: The paper is under consideration at 2025 IEEE International + Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025) +
+
+
+
+
+ + ☆ Upper-Body Pose-based Gaze Estimation for Privacy-Preserving 3D Gaze + Target Detection ECCV 2024 + + +
+ Gaze Target Detection (GTD), i.e., determining where a person is looking +within a scene from an external viewpoint, is a challenging task, particularly +in 3D space. Existing approaches heavily rely on analyzing the person's +appearance, primarily focusing on their face to predict the gaze target. This +paper presents a novel approach to tackle this problem by utilizing the +person's upper-body pose and available depth maps to extract a 3D gaze +direction and employing a multi-stage or an end-to-end pipeline to predict the +gazed target. When predicted accurately, the human body pose can provide +valuable information about the head pose, which is a good approximation of the +gaze direction, as well as the position of the arms and hands, which are linked +to the activity the person is performing and the objects they are likely +focusing on. Consequently, in addition to performing gaze estimation in 3D, we +are also able to perform GTD simultaneously. We demonstrate state-of-the-art +results on the most comprehensive publicly accessible 3D gaze target detection +dataset without requiring images of the person's face, thus promoting privacy +preservation in various application contexts. The code is available at +https://github.com/intelligolabs/privacy-gtd-3D. + +
+
+ comment: Accepted in the T-CAP workshop at ECCV 2024 +
+
+
+
+
+ + ☆ Self-Distilled Depth Refinement with Noisy Poisson Fusion NeurIPS 2024 + + +
+ Depth refinement aims to infer high-resolution depth with fine-grained edges +and details, refining low-resolution results of depth estimation models. The +prevailing methods adopt tile-based manners by merging numerous patches, which +lacks efficiency and produces inconsistency. Besides, prior arts suffer from +fuzzy depth boundaries and limited generalizability. Analyzing the fundamental +reasons for these limitations, we model depth refinement as a noisy Poisson +fusion problem with local inconsistency and edge deformation noises. We propose +the Self-distilled Depth Refinement (SDDR) framework to enforce robustness +against the noises, which mainly consists of depth edge representation and +edge-based guidance. With noisy depth predictions as input, SDDR generates +low-noise depth edge representations as pseudo-labels by coarse-to-fine +self-distillation. Edge-based guidance with edge-guided gradient loss and +edge-based fusion loss serves as the optimization objective equivalent to +Poisson fusion. When depth maps are better refined, the labels also become more +noise-free. Our model can acquire strong robustness to the noises, achieving +significant improvements in accuracy, edge quality, efficiency, and +generalizability on five different benchmarks. Moreover, directly training +another model with edge labels produced by SDDR brings improvements, suggesting +that our method could help with training robust refinement models in future +works. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Visualization of Age Distributions as Elements of Medical Data-Stories + + +
+ In various fields, including medicine, age distributions are crucial. Despite +widespread media coverage of health topics, there remains a need to enhance +health communication. Narrative medical visualization is promising for +improving information comprehension and retention. This study explores the most +effective ways to present age distributions of diseases through narrative +visualizations. We conducted a thorough analysis of existing visualizations, +held workshops with a broad audience, and reviewed relevant literature. From +this, we identified design choices focusing on comprehension, aesthetics, +engagement, and memorability. We specifically tested three pictogram variants: +pictograms as bars, stacked pictograms, and annotations. After evaluating 18 +visualizations with 72 participants and three expert reviews, we determined +that annotations were most effective for comprehension and aesthetics. However, +traditional bar charts were preferred for engagement, and other variants were +more memorable. The study provides a set of design recommendations based on +these insights. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts ECCV 2024 + + +
+ Monocular depth estimation is a critical task for autonomous driving and many +other computer vision applications. While significant progress has been made in +this field, the effects of viewpoint shifts on depth estimation models remain +largely underexplored. This paper introduces a novel dataset and evaluation +methodology to quantify the impact of different camera positions and +orientations on monocular depth estimation performance. We propose a ground +truth strategy based on homography estimation and object detection, eliminating +the need for expensive lidar sensors. We collect a diverse dataset of road +scenes from multiple viewpoints and use it to assess the robustness of a modern +depth estimation model to geometric shifts. After assessing the validity of our +strategy on a public dataset, we provide valuable insights into the limitations +of current models and highlight the importance of considering viewpoint +variations in real-world applications. + +
+
+ comment: 17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on + Vision-Centric Autonomous Driving (VCAD) +
+
+
+
+
+ + ☆ Unsupervised Learning Based Multi-Scale Exposure Fusion + + +
+ Unsupervised learning based multi-scale exposure fusion (ULMEF) is efficient +for fusing differently exposed low dynamic range (LDR) images into a higher +quality LDR image for a high dynamic range (HDR) scene. Unlike supervised +learning, loss functions play a crucial role in the ULMEF. In this paper, novel +loss functions are proposed for the ULMEF and they are defined by using all the +images to be fused and other differently exposed images from the same HDR +scene. The proposed loss functions can guide the proposed ULMEF to learn more +reliable information from the HDR scene than existing loss functions which are +defined by only using the set of images to be fused. As such, the quality of +the fused image is significantly improved. The proposed ULMEF also adopts a +multi-scale strategy that includes a multi-scale attention module to +effectively preserve the scene depth and local contrast in the fused image. +Meanwhile, the proposed ULMEF can be adopted to achieve exposure interpolation +and exposure extrapolation. Extensive experiments show that the proposed ULMEF +algorithm outperforms state-of-the-art exposure fusion algorithms. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Kendall's $τ$ Coefficient for Logits Distillation + + +
+ Knowledge distillation typically employs the Kullback-Leibler (KL) divergence +to constrain the student model's output to match the soft labels provided by +the teacher model exactly. However, sometimes the optimization direction of the +KL divergence loss is not always aligned with the task loss, where a smaller KL +divergence could lead to erroneous predictions that diverge from the soft +labels. This limitation often results in suboptimal optimization for the +student. Moreover, even under temperature scaling, the KL divergence loss +function tends to overly focus on the larger-valued channels in the logits, +disregarding the rich inter-class information provided by the multitude of +smaller-valued channels. This hard constraint proves too challenging for +lightweight students, hindering further knowledge distillation. To address this +issue, we propose a plug-and-play ranking loss based on Kendall's $\tau$ +coefficient, called Rank-Kendall Knowledge Distillation (RKKD). RKKD balances +the attention to smaller-valued channels by constraining the order of channel +values in student logits, providing more inter-class relational information. +The rank constraint on the top-valued channels helps avoid suboptimal traps +during optimization. We also discuss different differentiable forms of +Kendall's $\tau$ coefficient and demonstrate that the proposed ranking loss +function shares a consistent optimization objective with the KL divergence. +Extensive experiments on the CIFAR-100 and ImageNet datasets show that our RKKD +can enhance the performance of various knowledge distillation baselines and +offer broad improvements across multiple teacher-student architecture +combinations. + +
+
+
+
+
+ + ☆ Cascade Prompt Learning for Vision-Language Model Adaptation ECCV2024 + + +
+ Prompt learning has surfaced as an effective approach to enhance the +performance of Vision-Language Models (VLMs) like CLIP when applied to +downstream tasks. However, current learnable prompt tokens are primarily used +for the single phase of adapting to tasks (i.e., adapting prompt), easily +leading to overfitting risks. In this work, we propose a novel Cascade Prompt +Learning CasPL framework to enable prompt learning to serve both generic and +specific expertise (i.e., boosting and adapting prompt) simultaneously. +Specifically, CasPL is a new learning paradigm comprising two distinct phases +of learnable prompts: the first boosting prompt is crafted to extract +domain-general knowledge from a senior larger CLIP teacher model by aligning +their predicted logits using extensive unlabeled domain images. The second +adapting prompt is then cascaded with the frozen first set to fine-tune the +downstream tasks, following the approaches employed in prior research. In this +manner, CasPL can effectively capture both domain-general and task-specific +representations into explicitly different gradual groups of prompts, thus +potentially alleviating overfitting issues in the target domain. It's worth +noting that CasPL serves as a plug-and-play module that can seamlessly +integrate into any existing prompt learning approach. CasPL achieves a +significantly better balance between performance and inference speed, which is +especially beneficial for deploying smaller VLM models in resource-constrained +environments. Compared to the previous state-of-the-art method PromptSRC, CasPL +shows an average improvement of 1.85% for base classes, 3.44% for novel +classes, and 2.72% for the harmonic mean over 11 image classification datasets. +Code is publicly available at: https://github.com/megvii-research/CasPL. + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ Reblurring-Guided Single Image Defocus Deblurring: A Learning Framework + with Misaligned Training Pairs + + +
+ For single image defocus deblurring, acquiring well-aligned training pairs +(or training triplets), i.e., a defocus blurry image, an all-in-focus sharp +image (and a defocus blur map), is an intricate task for the development of +deblurring models. Existing image defocus deblurring methods typically rely on +training data collected by specialized imaging equipment, presupposing that +these pairs or triplets are perfectly aligned. However, in practical scenarios +involving the collection of real-world data, direct acquisition of training +triplets is infeasible, and training pairs inevitably encounter spatial +misalignment issues. In this work, we introduce a reblurring-guided learning +framework for single image defocus deblurring, enabling the learning of a +deblurring network even with misaligned training pairs. Specifically, we first +propose a baseline defocus deblurring network that utilizes spatially varying +defocus blur map as degradation prior to enhance the deblurring performance. +Then, to effectively learn the baseline defocus deblurring network with +misaligned training pairs, our reblurring module ensures spatial consistency +between the deblurred image, the reblurred image and the input blurry image by +reconstructing spatially variant isotropic blur kernels. Moreover, the +spatially variant blur derived from the reblurring module can serve as pseudo +supervision for defocus blur map during training, interestingly transforming +training pairs into training triplets. Additionally, we have collected a new +dataset specifically for single image defocus deblurring (SDD) with typical +misalignments, which not only substantiates our proposed method but also serves +as a benchmark for future research. + +
+
+ comment: The source code and dataset are available at + https://github.com/ssscrystal/Reblurring-guided-JDRL +
+
+
+
+
+ + ☆ CASPFormer: Trajectory Prediction from BEV Images with Deformable + Attention ICPR 2024 + + +
+ Motion prediction is an important aspect for Autonomous Driving (AD) and +Advance Driver Assistance Systems (ADAS). Current state-of-the-art motion +prediction methods rely on High Definition (HD) maps for capturing the +surrounding context of the ego vehicle. Such systems lack scalability in +real-world deployment as HD maps are expensive to produce and update in +real-time. To overcome this issue, we propose Context Aware Scene Prediction +Transformer (CASPFormer), which can perform multi-modal motion prediction from +rasterized Bird-Eye-View (BEV) images. Our system can be integrated with any +upstream perception module that is capable of generating BEV images. Moreover, +CASPFormer directly decodes vectorized trajectories without any postprocessing. +Trajectories are decoded recurrently using deformable attention, as it is +computationally efficient and provides the network with the ability to focus +its attention on the important spatial locations of the BEV images. In +addition, we also address the issue of mode collapse for generating multiple +scene-consistent trajectories by incorporating learnable mode queries. We +evaluate our model on the nuScenes dataset and show that it reaches +state-of-the-art across multiple metrics + +
+
+ comment: Under Review at ICPR 2024, Kolkata +
+
+
+
+
+ + ☆ Taming Diffusion Prior for Image Super-Resolution with Domain Shift SDEs NeurIPS 2024 + + +
+ Diffusion-based image super-resolution (SR) models have attracted substantial +interest due to their powerful image restoration capabilities. However, +prevailing diffusion models often struggle to strike an optimal balance between +efficiency and performance. Typically, they either neglect to exploit the +potential of existing extensive pretrained models, limiting their generative +capacity, or they necessitate a dozens of forward passes starting from random +noises, compromising inference efficiency. In this paper, we present DoSSR, a +Domain Shift diffusion-based SR model that capitalizes on the generative powers +of pretrained diffusion models while significantly enhancing efficiency by +initiating the diffusion process with low-resolution (LR) images. At the core +of our approach is a domain shift equation that integrates seamlessly with +existing diffusion models. This integration not only improves the use of +diffusion prior but also boosts inference efficiency. Moreover, we advance our +method by transitioning the discrete shift process to a continuous formulation, +termed as DoS-SDEs. This advancement leads to the fast and customized solvers +that further enhance sampling efficiency. Empirical results demonstrate that +our proposed method achieves state-of-the-art performance on synthetic and +real-world datasets, while notably requiring only 5 sampling steps. Compared to +previous diffusion prior based methods, our approach achieves a remarkable +speedup of 5-7 times, demonstrating its superior efficiency. Code: +https://github.com/QinpengCui/DoSSR. + +
+
+ comment: This paper is accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Harnessing Shared Relations via Multimodal Mixup Contrastive Learning + for Multimodal Classification + + +
+ Deep multimodal learning has shown remarkable success by leveraging +contrastive learning to capture explicit one-to-one relations across +modalities. However, real-world data often exhibits shared relations beyond +simple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive +Learning approach to capture nuanced shared relations inherent in multimodal +data. Our key contribution is a Mixup-based contrastive loss that learns robust +representations by aligning mixed samples from one modality with their +corresponding samples from other modalities thereby capturing shared relations +between them. For multimodal classification tasks, we introduce a framework +that integrates a fusion module with unimodal prediction modules for auxiliary +supervision during training, complemented by our proposed Mixup-based +contrastive loss. Through extensive experiments on diverse datasets (N24News, +ROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures +shared multimodal relations and generalizes across domains. It outperforms +state-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving +comparable performance on Food-101. Our work highlights the significance of +learning shared relations for robust multimodal learning, opening up promising +avenues for future research. + +
+
+ comment: RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9 + Tables +
+
+
+
+
+ + ☆ UNICORN: A Deep Learning Model for Integrating Multi-Stain Data in + Histopathology + + +
+ Background: The integration of multi-stain histopathology images through deep +learning poses a significant challenge in digital histopathology. Current +multi-modal approaches struggle with data heterogeneity and missing data. This +study aims to overcome these limitations by developing a novel transformer +model for multi-stain integration that can handle missing data during training +as well as inference. Methods: We propose UNICORN (UNiversal modality +Integration Network for CORonary classificatioN) a multi-modal transformer +capable of processing multi-stain histopathology for atherosclerosis severity +class prediction. The architecture comprises a two-stage, end-to-end trainable +model with specialized modules utilizing transformer self-attention blocks. The +initial stage employs domain-specific expert modules to extract features from +each modality. In the subsequent stage, an aggregation expert module integrates +these features by learning the interactions between the different data +modalities. Results: Evaluation was performed using a multi-class dataset of +atherosclerotic lesions from the Munich Cardiovascular Studies Biobank +(MISSION), using over 4,000 paired multi-stain whole slide images (WSIs) from +170 deceased individuals on 7 prespecified segments of the coronary tree, each +stained according to four histopathological protocols. UNICORN achieved a +classification accuracy of 0.67, outperforming other state-of-the-art models. +The model effectively identifies relevant tissue phenotypes across stainings +and implicitly models disease progression. Conclusion: Our proposed multi-modal +transformer model addresses key challenges in medical data analysis, including +data heterogeneity and missing modalities. Explainability and the model's +effectiveness in predicting atherosclerosis progression underscores its +potential for broader applications in medical research. + +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ LGFN: Lightweight Light Field Image Super-Resolution using Local + Convolution Modulation and Global Attention Feature Extraction + + +
+ Capturing different intensity and directions of light rays at the same scene +Light field (LF) can encode the 3D scene cues into a 4D LF image which has a +wide range of applications (i.e. post-capture refocusing and depth sensing). LF +image super-resolution (SR) aims to improve the image resolution limited by the +performance of LF camera sensor. Although existing methods have achieved +promising results the practical application of these models is limited because +they are not lightweight enough. In this paper we propose a lightweight model +named LGFN which integrates the local and global features of different views +and the features of different channels for LF image SR. Specifically owing to +neighboring regions of the same pixel position in different sub-aperture images +exhibit similar structural relationships we design a lightweight CNN-based +feature extraction module (namely DGCE) to extract local features better +through feature modulation. Meanwhile as the position beyond the boundaries in +the LF image presents a large disparity we propose an efficient spatial +attention module (namely ESAM) which uses decomposable large-kernel convolution +to obtain an enlarged receptive field and an efficient channel attention module +(namely ECAM). Compared with the existing LF image SR models with large +parameter our model has a parameter of 0.45M and a FLOPs of 19.33G which has +achieved a competitive effect. Extensive experiments with ablation studies +demonstrate the effectiveness of our proposed method which ranked the second +place in the Track 2 Fidelity & Efficiency of NTIRE2024 Light Field Super +Resolution Challenge and the seventh place in the Track 1 Fidelity. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Text Image Generation for Low-Resource Languages with Dual Translation + Learning + + +
+ Scene text recognition in low-resource languages frequently faces challenges +due to the limited availability of training datasets derived from real-world +scenes. This study proposes a novel approach that generates text images in +low-resource languages by emulating the style of real text images from +high-resource languages. Our approach utilizes a diffusion model that is +conditioned on binary states: ``synthetic'' and ``real.'' The training of this +model involves dual translation tasks, where it transforms plain text images +into either synthetic or real text images, based on the binary states. This +approach not only effectively differentiates between the two domains but also +facilitates the model's explicit recognition of characters in the target +language. Furthermore, to enhance the accuracy and variety of generated text +images, we introduce two guidance techniques: Fidelity-Diversity Balancing +Guidance and Fidelity Enhancement Guidance. Our experimental results +demonstrate that the text images generated by our proposed framework can +significantly improve the performance of scene text recognition models for +low-resource languages. + +
+
+ comment: 23 pages, 11 figures +
+
+
+
+
+ + ☆ AnyLogo: Symbiotic Subject-Driven Diffusion System with Gemini Status + + +
+ Diffusion models have made compelling progress on facilitating +high-throughput daily production. Nevertheless, the appealing customized +requirements are remain suffered from instance-level finetuning for authentic +fidelity. Prior zero-shot customization works achieve the semantic consistence +through the condensed injection of identity features, while addressing detailed +low-level signatures through complex model configurations and subject-specific +fabrications, which significantly break the statistical coherence within the +overall system and limit the applicability across various scenarios. To +facilitate the generic signature concentration with rectified efficiency, we +present \textbf{AnyLogo}, a zero-shot region customizer with remarkable detail +consistency, building upon the symbiotic diffusion system with eliminated +cumbersome designs. Streamlined as vanilla image generation, we discern that +the rigorous signature extraction and creative content generation are +promisingly compatible and can be systematically recycled within a single +denoising model. In place of the external configurations, the gemini status of +the denoising model promote the reinforced subject transmission efficiency and +disentangled semantic-signature space with continuous signature decoration. +Moreover, the sparse recycling paradigm is adopted to prevent the duplicated +risk with compressed transmission quota for diversified signature stimulation. +Extensive experiments on constructed logo-level benchmarks demonstrate the +effectiveness and practicability of our methods. + +
+
+ comment: 13 pages, 12 figures +
+
+
+
+
+ + ☆ Neural Implicit Representation for Highly Dynamic LiDAR Mapping and + Odometry + + +
+ Recent advancements in Simultaneous Localization and Mapping (SLAM) have +increasingly highlighted the robustness of LiDAR-based techniques. At the same +time, Neural Radiance Fields (NeRF) have introduced new possibilities for 3D +scene reconstruction, exemplified by SLAM systems. Among these, NeRF-LOAM has +shown notable performance in NeRF-based SLAM applications. However, despite its +strengths, these systems often encounter difficulties in dynamic outdoor +environments due to their inherent static assumptions. To address these +limitations, this paper proposes a novel method designed to improve +reconstruction in highly dynamic outdoor scenes. Based on NeRF-LOAM, the +proposed approach consists of two primary components. First, we separate the +scene into static background and dynamic foreground. By identifying and +excluding dynamic elements from the mapping process, this segmentation enables +the creation of a dense 3D map that accurately represents the static background +only. The second component extends the octree structure to support +multi-resolution representation. This extension not only enhances +reconstruction quality but also aids in the removal of dynamic objects +identified by the first module. Additionally, Fourier feature encoding is +applied to the sampled points, capturing high-frequency information and leading +to more complete reconstruction results. Evaluations on various datasets +demonstrate that our method achieves more competitive results compared to +current state-of-the-art approaches. + +
+
+
+
+
+ + ☆ AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with + Alternative Modality Masking NeurIPS 2024 + + +
+ Camera-LiDAR fusion models significantly enhance perception performance in +autonomous driving. The fusion mechanism leverages the strengths of each +modality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR +fusion models utilize pre-trained backbones for efficient training. However, we +argue that directly loading single-modal pre-trained camera and LiDAR backbones +into camera-LiDAR fusion models introduces similar feature redundancy across +modalities due to the nature of the fusion mechanism. Unfortunately, existing +pruning methods are developed explicitly for single-modal models, and thus, +they struggle to effectively identify these specific redundant parameters in +camera-LiDAR fusion models. In this paper, to address the issue above on +camera-LiDAR fusion models, we propose a novelty pruning framework Alternative +Modality Masking Pruning (AlterMOMA), which employs alternative masking on each +modality and identifies the redundant parameters. Specifically, when one +modality parameters are masked (deactivated), the absence of features from the +masked backbone compels the model to reactivate previous redundant features of +the other modality backbone. Therefore, these redundant features and relevant +redundant parameters can be identified via the reactivation process. The +redundant parameters can be pruned by our proposed importance score evaluation +function, Alternative Evaluation (AlterEva), which is based on the observation +of the loss changes when certain modality parameters are activated and +deactivated. Extensive experiments on the nuScene and KITTI datasets +encompassing diverse tasks, baseline models, and pruning algorithms showcase +that AlterMOMA outperforms existing pruning methods, attaining state-of-the-art +performance. + +
+
+ comment: 17 pages, 3 figures, Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications + + +
+ Vision language models have played a key role in extracting meaningful +features for various robotic applications. Among these, Contrastive +Language-Image Pretraining (CLIP) is widely used in robotic tasks that require +both vision and natural language understanding. However, CLIP was trained +solely on static images paired with text prompts and has not yet been fully +adapted for robotic tasks involving dynamic actions. In this paper, we +introduce Robotic-CLIP to enhance robotic perception capabilities. We first +gather and label large-scale action data, and then build our Robotic-CLIP by +fine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using +contrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's +strong image performance while gaining the ability to understand actions in +robotic contexts. Intensive experiments show that our Robotic-CLIP outperforms +other CLIP-based models across various language-driven robotic tasks. +Additionally, we demonstrate the practical effectiveness of Robotic-CLIP in +real-world grasping applications. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations + Between Initial and Final Scenes + + +
+ With robots increasingly collaborating with humans in everyday tasks, it is +important to take steps toward robotic systems capable of understanding the +environment. This work focuses on scene understanding to detect pick and place +tasks given initial and final images from the scene. To this end, a dataset is +collected for object detection and pick and place task detection. A YOLOv5 +network is subsequently trained to detect the objects in the initial and final +scenes. Given the detected objects and their bounding boxes, two methods are +proposed to detect the pick and place tasks which transform the initial scene +into the final scene. A geometric method is proposed which tracks objects' +movements in the two scenes and works based on the intersection of the bounding +boxes which moved within scenes. Contrarily, the CNN-based method utilizes a +Convolutional Neural Network to classify objects with intersected bounding +boxes into 5 classes, showing the spatial relationship between the involved +objects. The performed pick and place tasks are then derived from analyzing the +experiments with both scenes. Results show that the CNN-based method, using a +VGG16 backbone, outscores the geometric method by roughly 12 percentage points +in certain scenarios, with an overall success rate of 84.3%. + +
+
+ comment: Conference Paper, ICEE 2024, 7 pages, 5 figures +
+
+
+
+
+ + ☆ Behaviour4All: in-the-wild Facial Behaviour Analysis Toolkit + + +
+ In this paper, we introduce Behavior4All, a comprehensive, open-source +toolkit for in-the-wild facial behavior analysis, integrating Face +Localization, Valence-Arousal Estimation, Basic Expression Recognition and +Action Unit Detection, all within a single framework. Available in both +CPU-only and GPU-accelerated versions, Behavior4All leverages 12 large-scale, +in-the-wild datasets consisting of over 5 million images from diverse +demographic groups. It introduces a novel framework that leverages distribution +matching and label co-annotation to address tasks with non-overlapping +annotations, encoding prior knowledge of their relatedness. In the largest +study of its kind, Behavior4All outperforms both state-of-the-art and toolkits +in overall performance as well as fairness across all databases and tasks. It +also demonstrates superior generalizability on unseen databases and on compound +expression recognition. Finally, Behavior4All is way times faster than other +toolkits. + +
+
+
+
+
+ + ☆ MoGenTS: Motion Generation based on Spatial-Temporal Joint Modeling NeurIPS 2024 + + +
+ Motion generation from discrete quantization offers many advantages over +continuous regression, but at the cost of inevitable approximation errors. +Previous methods usually quantize the entire body pose into one code, which not +only faces the difficulty in encoding all joints within one vector but also +loses the spatial relationship between different joints. Differently, in this +work we quantize each individual joint into one vector, which i) simplifies the +quantization process as the complexity associated with a single joint is +markedly lower than that of the entire pose; ii) maintains a spatial-temporal +structure that preserves both the spatial relationships among joints and the +temporal movement patterns; iii) yields a 2D token map, which enables the +application of various 2D operations widely used in 2D images. Grounded in the +2D motion quantization, we build a spatial-temporal modeling framework, where +2D joint VQVAE, temporal-spatial 2D masking technique, and spatial-temporal 2D +attention are proposed to take advantage of spatial-temporal signals among the +2D tokens. Extensive experiments demonstrate that our method significantly +outperforms previous methods across different datasets, with a $26.6\%$ +decrease of FID on HumanML3D and a $29.9\%$ decrease on KIT-ML. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Dark Miner: Defend against unsafe generation for text-to-image diffusion + models + + +
+ Text-to-image diffusion models have been demonstrated with unsafe generation +due to unfiltered large-scale training data, such as violent, sexual, and +shocking images, necessitating the erasure of unsafe concepts. Most existing +methods focus on modifying the generation probabilities conditioned on the +texts containing unsafe descriptions. However, they fail to guarantee safe +generation for unseen texts in the training phase, especially for the prompts +from adversarial attacks. In this paper, we re-analyze the erasure task and +point out that existing methods cannot guarantee the minimization of the total +probabilities of unsafe generation. To tackle this problem, we propose Dark +Miner. It entails a recurring three-stage process that comprises mining, +verifying, and circumventing. It greedily mines embeddings with maximum +generation probabilities of unsafe concepts and reduces unsafe generation more +effectively. In the experiments, we evaluate its performance on two +inappropriate concepts, two objects, and two styles. Compared with 6 previous +state-of-the-art methods, our method achieves better erasure and defense +results in most cases, especially under 4 state-of-the-art attacks, while +preserving the model's native generation capability. Our code will be available +on GitHub. + +
+
+
+
+
+ + ☆ Event-based Stereo Depth Estimation: A Survey + + +
+ Stereopsis has widespread appeal in robotics as it is the predominant way by +which living beings perceive depth to navigate our 3D world. Event cameras are +novel bio-inspired sensors that detect per-pixel brightness changes +asynchronously, with very high temporal resolution and high dynamic range, +enabling machine perception in high-speed motion and broad illumination +conditions. The high temporal precision also benefits stereo matching, making +disparity (depth) estimation a popular research area for event cameras ever +since its inception. Over the last 30 years, the field has evolved rapidly, +from low-latency, low-power circuit design to current deep learning (DL) +approaches driven by the computer vision community. The bibliography is vast +and difficult to navigate for non-experts due its highly interdisciplinary +nature. Past surveys have addressed distinct aspects of this topic, in the +context of applications, or focusing only on a specific class of techniques, +but have overlooked stereo datasets. This survey provides a comprehensive +overview, covering both instantaneous stereo and long-term methods suitable for +simultaneous localization and mapping (SLAM), along with theoretical and +empirical comparisons. It is the first to extensively review DL methods as well +as stereo datasets, even providing practical suggestions for creating new +benchmarks to advance the field. The main advantages and challenges faced by +event-based stereo depth estimation are also discussed. Despite significant +progress, challenges remain in achieving optimal performance in not only +accuracy but also efficiency, a cornerstone of event-based computing. We +identify several gaps and propose future research directions. We hope this +survey inspires future research in this area, by serving as an accessible entry +point for newcomers, as well as a practical guide for seasoned researchers in +the community. + +
+
+ comment: 28 pages, 20 figures, 7 tables +
+
+
+
+
+ + ☆ EM-Net: Efficient Channel and Frequency Learning with Mamba for 3D + Medical Image Segmentation MICCAI 2024 + + +
+ Convolutional neural networks have primarily led 3D medical image +segmentation but may be limited by small receptive fields. Transformer models +excel in capturing global relationships through self-attention but are +challenged by high computational costs at high resolutions. Recently, Mamba, a +state space model, has emerged as an effective approach for sequential +modeling. Inspired by its success, we introduce a novel Mamba-based 3D medical +image segmentation model called EM-Net. It not only efficiently captures +attentive interaction between regions by integrating and selecting channels, +but also effectively utilizes frequency domain to harmonize the learning of +features across varying scales, while accelerating training speed. +Comprehensive experiments on two challenging multi-organ datasets with other +state-of-the-art (SOTA) algorithms show that our method exhibits better +segmentation accuracy while requiring nearly half the parameter size of SOTA +models and 2x faster training speed. + +
+
+ comment: 10 pages, 3 figures, accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ Self-Supervised Learning of Deviation in Latent Representation for + Co-speech Gesture Video Generation + + +
+ Gestures are pivotal in enhancing co-speech communication. While recent works +have mostly focused on point-level motion transformation or fully supervised +motion representations through data-driven approaches, we explore the +representation of gestures in co-speech, with a focus on self-supervised +representation and pixel-level motion deviation, utilizing a diffusion model +which incorporates latent motion features. Our approach leverages +self-supervised deviation in latent representation to facilitate hand gestures +generation, which are crucial for generating realistic gesture videos. Results +of our first experiment demonstrate that our method enhances the quality of +generated videos, with an improvement from 2.7 to 4.5% for FGD, DIV, and FVD, +and 8.1% for PSNR, 2.5% for SSIM over the current state-of-the-art methods. + +
+
+ comment: 5 pages, 5 figures, conference +
+
+
+
+
+ + ☆ Leveraging Anthropometric Measurements to Improve Human Mesh Estimation + and Ensure Consistent Body Shapes + + +
+ The basic body shape of a person does not change within a single video. +However, most SOTA human mesh estimation (HME) models output a slightly +different body shape for each video frame, which results in inconsistent body +shapes for the same person. In contrast, we leverage anthropometric +measurements like tailors are already obtaining from humans for centuries. We +create a model called A2B that converts such anthropometric measurements to +body shape parameters of human mesh models. Moreover, we find that finetuned +SOTA 3D human pose estimation (HPE) models outperform HME models regarding the +precision of the estimated keypoints. We show that applying inverse kinematics +(IK) to the results of such a 3D HPE model and combining the resulting body +pose with the A2B body shape leads to superior and consistent human meshes for +challenging datasets like ASPset or fit3D, where we can lower the MPJPE by over +30 mm compared to SOTA HME models. Further, replacing HME models estimates of +the body shape parameters with A2B model results not only increases the +performance of these HME models, but also leads to consistent body shapes. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Provable Performance Guarantees of Copy Detection Patterns + + +
+ Copy Detection Patterns (CDPs) are crucial elements in modern security +applications, playing a vital role in safeguarding industries such as food, +pharmaceuticals, and cosmetics. Current performance evaluations of CDPs +predominantly rely on empirical setups using simplistic metrics like Hamming +distances or Pearson correlation. These methods are often inadequate due to +their sensitivity to distortions, degradation, and their limitations to +stationary statistics of printing and imaging. Additionally, machine +learning-based approaches suffer from distribution biases and fail to +generalize to unseen counterfeit samples. Given the critical importance of CDPs +in preventing counterfeiting, including the counterfeit vaccines issue +highlighted during the COVID-19 pandemic, there is an urgent need for provable +performance guarantees across various criteria. This paper aims to establish a +theoretical framework to derive optimal criteria for the analysis, +optimization, and future development of CDP authentication technologies, +ensuring their reliability and effectiveness in diverse security scenarios. + +
+
+
+
+
+ + ☆ MECD: Unlocking Multi-Event Causal Discovery in Video Reasoning NeurIPS 2024 + + +
+ Video causal reasoning aims to achieve a high-level understanding of video +content from a causal perspective. However, current video reasoning tasks are +limited in scope, primarily executed in a question-answering paradigm and +focusing on short videos containing only a single event and simple causal +relationships, lacking comprehensive and structured causality analysis for +videos with multiple events. To fill this gap, we introduce a new task and +dataset, Multi-Event Causal Discovery (MECD). It aims to uncover the causal +relationships between events distributed chronologically across long videos. +Given visual segments and textual descriptions of events, MECD requires +identifying the causal associations between these events to derive a +comprehensive, structured event-level video causal diagram explaining why and +how the final result event occurred. To address MECD, we devise a novel +framework inspired by the Granger Causality method, using an efficient +mask-based event prediction model to perform an Event Granger Test, which +estimates causality by comparing the predicted result event when premise events +are masked versus unmasked. Furthermore, we integrate causal inference +techniques such as front-door adjustment and counterfactual inference to +address challenges in MECD like causality confounding and illusory causality. +Experiments validate the effectiveness of our framework in providing causal +relationships in multi-event videos, outperforming GPT-4o and VideoLLaVA by +5.7% and 4.1%, respectively. + +
+
+ comment: Accepted at NeurIPS 2024 as a spotlight paper +
+
+
+
+
+ + ☆ P4Q: Learning to Prompt for Quantization in Visual-language Models + + +
+ Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence +in various visual and multimodal tasks, yet the deployment of VLMs on +downstream application platforms remains challenging due to their prohibitive +requirements of training samples and computing resources. Fine-tuning and +quantization of VLMs can substantially reduce the sample and computation costs, +which are in urgent need. There are two prevailing paradigms in quantization, +Quantization-Aware Training (QAT) can effectively quantize large-scale VLMs but +incur a huge training cost, while low-bit Post-Training Quantization (PTQ) +suffers from a notable performance drop. We propose a method that balances +fine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which +we design a lightweight architecture to leverage contrastive loss supervision +to enhance the recognition performance of a PTQ model. Our method can +effectively reduce the gap between image features and text features caused by +low-bit quantization, based on learnable prompts to reorganize textual +representations and a low-bit adapter to realign the distributions of image and +text features. We also introduce a distillation loss based on cosine similarity +predictions to distill the quantized model using a full-precision teacher. +Extensive experimental results demonstrate that our P4Q method outperforms +prior arts, even achieving comparable results to its full-precision +counterparts. For instance, our 8-bit P4Q can theoretically compress the +CLIP-ViT/B-32 by 4 $\times$ while achieving 66.94\% Top-1 accuracy, +outperforming the learnable prompt fine-tuned full-precision model by 2.24\% +with negligible additional parameters on the ImageNet dataset. + +
+
+
+
+
+ + ☆ Hand-object reconstruction via interaction-aware graph attention + mechanism ICIP 2024 + + +
+ Estimating the poses of both a hand and an object has become an important +area of research due to the growing need for advanced vision computing. The +primary challenge involves understanding and reconstructing how hands and +objects interact, such as contact and physical plausibility. Existing +approaches often adopt a graph neural network to incorporate spatial +information of hand and object meshes. However, these approaches have not fully +exploited the potential of graphs without modification of edges within and +between hand- and object-graphs. We propose a graph-based refinement method +that incorporates an interaction-aware graph-attention mechanism to account for +hand-object interactions. Using edges, we establish connections among closely +correlated nodes, both within individual graphs and across different graphs. +Experiments demonstrate the effectiveness of our proposed method with notable +improvements in the realm of physical plausibility. + +
+
+ comment: 7 pages, Accepted by ICIP 2024 +
+
+
+
+
+ + ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic dataset that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. + +
+
+
+
+
+ + ☆ ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context + Information in Multi-Turn Multimodal Medical Dialogue + + +
+ The rocketing prosperity of large language models (LLMs) in recent years has +boosted the prevalence of vision-language models (VLMs) in the medical sector. +In our online medical consultation scenario, a doctor responds to the texts and +images provided by a patient in multiple rounds to diagnose her/his health +condition, forming a multi-turn multimodal medical dialogue format. Unlike +high-quality images captured by professional equipment in traditional medical +visual question answering (Med-VQA), the images in our case are taken by +patients' mobile phones. These images have poor quality control, with issues +such as excessive background elements and the lesion area being significantly +off-center, leading to degradation of vision-language alignment in the model +training phase. In this paper, we propose ZALM3, a Zero-shot strategy to +improve vision-language ALignment in Multi-turn Multimodal Medical dialogue. +Since we observe that the preceding text conversations before an image can +infer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to +summarize the keywords from the preceding context and a visual grounding model +to extract the RoIs. The updated images eliminate unnecessary background noise +and provide more effective vision-language alignment. To better evaluate our +proposed method, we design a new subjective assessment metric for multi-turn +unimodal/multimodal medical dialogue to provide a fine-grained performance +comparison. Our experiments across three different clinical departments +remarkably demonstrate the efficacy of ZALM3 with statistical significance. + +
+
+
+
+
+ + ☆ Appearance Blur-driven AutoEncoder and Motion-guided Memory Module for + Video Anomaly Detection + + +
+ Video anomaly detection (VAD) often learns the distribution of normal samples +and detects the anomaly through measuring significant deviations, but the +undesired generalization may reconstruct a few anomalies thus suppressing the +deviations. Meanwhile, most VADs cannot cope with cross-dataset validation for +new target domains, and few-shot methods must laboriously rely on model-tuning +from the target domain to complete domain adaptation. To address these +problems, we propose a novel VAD method with a motion-guided memory module to +achieve cross-dataset validation with zero-shot. First, we add Gaussian blur to +the raw appearance images, thereby constructing the global pseudo-anomaly, +which serves as the input to the network. Then, we propose multi-scale residual +channel attention to deblur the pseudo-anomaly in normal samples. Next, memory +items are obtained by recording the motion features in the training phase, +which are used to retrieve the motion features from the raw information in the +testing phase. Lastly, our method can ignore the blurred real anomaly through +attention and rely on motion memory items to increase the normality gap between +normal and abnormal motion. Extensive experiments on three benchmark datasets +demonstrate the effectiveness of the proposed method. Compared with +cross-domain methods, our method achieves competitive performance without +adaptation during testing. + +
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+ + ☆ Good Data Is All Imitation Learning Needs + + +
+ In this paper, we address the limitations of traditional teacher-student +models, imitation learning, and behaviour cloning in the context of +Autonomous/Automated Driving Systems (ADS), where these methods often struggle +with incomplete coverage of real-world scenarios. To enhance the robustness of +such models, we introduce the use of Counterfactual Explanations (CFEs) as a +novel data augmentation technique for end-to-end ADS. CFEs, by generating +training samples near decision boundaries through minimal input modifications, +lead to a more comprehensive representation of expert driver strategies, +particularly in safety-critical scenarios. This approach can therefore help +improve the model's ability to handle rare and challenging driving events, such +as anticipating darting out pedestrians, ultimately leading to safer and more +trustworthy decision-making for ADS. Our experiments in the CARLA simulator +demonstrate that CF-Driver outperforms the current state-of-the-art method, +achieving a higher driving score and lower infraction rates. Specifically, +CF-Driver attains a driving score of 84.2, surpassing the previous best model +by 15.02 percentage points. These results highlight the effectiveness of +incorporating CFEs in training end-to-end ADS. To foster further research, the +CF-Driver code is made publicly available. + +
+
+
+
+
+ + ☆ TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for + Multimodal Contrastive Learning + + +
+ Pre-trained large models for multimodal contrastive learning, such as CLIP, +have been widely recognized in the industry as highly susceptible to +data-poisoned backdoor attacks. This poses significant risks to downstream +model training. In response to such potential threats, finetuning offers a +simpler and more efficient defense choice compared to retraining large models +with augmented data. In the supervised learning domain, fine-tuning defense +strategies can achieve excellent defense performance. However, in the +unsupervised and semi-supervised domain, we find that when CLIP faces some +complex attack techniques, the existing fine-tuning defense strategy, +CleanCLIP, has some limitations on defense performance. The synonym +substitution of its text-augmentation is insufficient to enhance the text +feature space. To compensate for this weakness, we improve it by proposing a +fine-grained \textbf{T}ext \textbf{A}lignment \textbf{C}leaner (TA-Cleaner) to +cut off feature connections of backdoor triggers. We randomly select a few +samples for positive and negative subtext generation at each epoch of +CleanCLIP, and align the subtexts to the images to strengthen the text +self-supervision. We evaluate the effectiveness of our TA-Cleaner against six +attack algorithms and conduct comprehensive zero-shot classification tests on +ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves +state-of-the-art defensiveness among finetuning-based defense techniques. Even +when faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms +CleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\% and 63.88\%, +respectively. + +
+
+
+
+
+ + ☆ Unifying Dimensions: A Linear Adaptive Approach to Lightweight Image + Super-Resolution + + +
+ Window-based transformers have demonstrated outstanding performance in +super-resolution tasks due to their adaptive modeling capabilities through +local self-attention (SA). However, they exhibit higher computational +complexity and inference latency than convolutional neural networks. In this +paper, we first identify that the adaptability of the Transformers is derived +from their adaptive spatial aggregation and advanced structural design, while +their high latency results from the computational costs and memory layout +transformations associated with the local SA. To simulate this aggregation +approach, we propose an effective convolution-based linear focal separable +attention (FSA), allowing for long-range dynamic modeling with linear +complexity. Additionally, we introduce an effective dual-branch structure +combined with an ultra-lightweight information exchange module (IEM) to enhance +the aggregation of information by the Token Mixer. Finally, with respect to the +structure, we modify the existing spatial-gate-based feedforward neural +networks by incorporating a self-gate mechanism to preserve high-dimensional +channel information, enabling the modeling of more complex relationships. With +these advancements, we construct a convolution-based Transformer framework +named the linear adaptive mixer network (LAMNet). Extensive experiments +demonstrate that LAMNet achieves better performance than existing SA-based +Transformer methods while maintaining the computational efficiency of +convolutional neural networks, which can achieve a \(3\times\) speedup of +inference time. The code will be publicly available at: +https://github.com/zononhzy/LAMNet. + +
+
+
+
+
+ + ☆ Improving Fast Adversarial Training via Self-Knowledge Guidance + + +
+ Adversarial training has achieved remarkable advancements in defending +against adversarial attacks. Among them, fast adversarial training (FAT) is +gaining attention for its ability to achieve competitive robustness with fewer +computing resources. Existing FAT methods typically employ a uniform strategy +that optimizes all training data equally without considering the influence of +different examples, which leads to an imbalanced optimization. However, this +imbalance remains unexplored in the field of FAT. In this paper, we conduct a +comprehensive study of the imbalance issue in FAT and observe an obvious class +disparity regarding their performances. This disparity could be embodied from a +perspective of alignment between clean and robust accuracy. Based on the +analysis, we mainly attribute the observed misalignment and disparity to the +imbalanced optimization in FAT, which motivates us to optimize different +training data adaptively to enhance robustness. Specifically, we take disparity +and misalignment into consideration. First, we introduce self-knowledge guided +regularization, which assigns differentiated regularization weights to each +class based on its training state, alleviating class disparity. Additionally, +we propose self-knowledge guided label relaxation, which adjusts label +relaxation according to the training accuracy, alleviating the misalignment and +improving robustness. By combining these methods, we formulate the +Self-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge +during training to enhance the adversarial robustness without compromising +training efficiency. Extensive experiments on four standard datasets +demonstrate that the SKG-FAT improves the robustness and preserves competitive +clean accuracy, outperforming the state-of-the-art methods. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ ID$^3$: Identity-Preserving-yet-Diversified Diffusion Models for + Synthetic Face Recognition NeurIPS 2024 + + +
+ Synthetic face recognition (SFR) aims to generate synthetic face datasets +that mimic the distribution of real face data, which allows for training face +recognition models in a privacy-preserving manner. Despite the remarkable +potential of diffusion models in image generation, current diffusion-based SFR +models struggle with generalization to real-world faces. To address this +limitation, we outline three key objectives for SFR: (1) promoting diversity +across identities (inter-class diversity), (2) ensuring diversity within each +identity by injecting various facial attributes (intra-class diversity), and +(3) maintaining identity consistency within each identity group (intra-class +identity preservation). Inspired by these goals, we introduce a +diffusion-fueled SFR model termed $\text{ID}^3$. $\text{ID}^3$ employs an +ID-preserving loss to generate diverse yet identity-consistent facial +appearances. Theoretically, we show that minimizing this loss is equivalent to +maximizing the lower bound of an adjusted conditional log-likelihood over +ID-preserving data. This equivalence motivates an ID-preserving sampling +algorithm, which operates over an adjusted gradient vector field, enabling the +generation of fake face recognition datasets that approximate the distribution +of real-world faces. Extensive experiments across five challenging benchmarks +validate the advantages of $\text{ID}^3$. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Flexiffusion: Segment-wise Neural Architecture Search for Flexible + Denoising Schedule + + +
+ Diffusion models are cutting-edge generative models adept at producing +diverse, high-quality images. Despite their effectiveness, these models often +require significant computational resources owing to their numerous sequential +denoising steps and the significant inference cost of each step. Recently, +Neural Architecture Search (NAS) techniques have been employed to automatically +search for faster generation processes. However, NAS for diffusion is +inherently time-consuming as it requires estimating thousands of diffusion +models to search for the optimal one. In this paper, we introduce Flexiffusion, +a novel training-free NAS paradigm designed to accelerate diffusion models by +concurrently optimizing generation steps and network structures. Specifically, +we partition the generation process into isometric step segments, each +sequentially composed of a full step, multiple partial steps, and several null +steps. The full step computes all network blocks, while the partial step +involves part of the blocks, and the null step entails no computation. +Flexiffusion autonomously explores flexible step combinations for each segment, +substantially reducing search costs and enabling greater acceleration compared +to the state-of-the-art (SOTA) method for diffusion models. Our searched models +reported speedup factors of $2.6\times$ and $1.5\times$ for the original +LDM-4-G and the SOTA, respectively. The factors for Stable Diffusion V1.5 and +the SOTA are $5.1\times$ and $2.0\times$. We also verified the performance of +Flexiffusion on multiple datasets, and positive experiment results indicate +that Flexiffusion can effectively reduce redundancy in diffusion models. + +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ General Compression Framework for Efficient Transformer Object Tracking + + +
+ Transformer-based trackers have established a dominant role in the field of +visual object tracking. While these trackers exhibit promising performance, +their deployment on resource-constrained devices remains challenging due to +inefficiencies. To improve the inference efficiency and reduce the computation +cost, prior approaches have aimed to either design lightweight trackers or +distill knowledge from larger teacher models into more compact student +trackers. However, these solutions often sacrifice accuracy for speed. Thus, we +propose a general model compression framework for efficient transformer object +tracking, named CompressTracker, to reduce the size of a pre-trained tracking +model into a lightweight tracker with minimal performance degradation. Our +approach features a novel stage division strategy that segments the transformer +layers of the teacher model into distinct stages, enabling the student model to +emulate each corresponding teacher stage more effectively. Additionally, we +also design a unique replacement training technique that involves randomly +substituting specific stages in the student model with those from the teacher +model, as opposed to training the student model in isolation. Replacement +training enhances the student model's ability to replicate the teacher model's +behavior. To further forcing student model to emulate teacher model, we +incorporate prediction guidance and stage-wise feature mimicking to provide +additional supervision during the teacher model's compression process. Our +framework CompressTracker is structurally agnostic, making it compatible with +any transformer architecture. We conduct a series of experiment to verify the +effectiveness and generalizability of CompressTracker. Our CompressTracker-4 +with 4 transformer layers, which is compressed from OSTrack, retains about 96% +performance on LaSOT (66.1% AUC) while achieves 2.17x speed up. + +
+
+
+
+
+ + ☆ Dynamic Subframe Splitting and Spatio-Temporal Motion Entangled Sparse + Attention for RGB-E Tracking + + +
+ Event-based bionic camera asynchronously captures dynamic scenes with high +temporal resolution and high dynamic range, offering potential for the +integration of events and RGB under conditions of illumination degradation and +fast motion. Existing RGB-E tracking methods model event characteristics +utilising attention mechanism of Transformer before integrating both +modalities. Nevertheless, these methods involve aggregating the event stream +into a single event frame, lacking the utilisation of the temporal information +inherent in the event stream.Moreover, the traditional attention mechanism is +well-suited for dense semantic features, while the attention mechanism for +sparse event features require revolution. In this paper, we propose a dynamic +event subframe splitting strategy to split the event stream into more +fine-grained event clusters, aiming to capture spatio-temporal features that +contain motion cues. Based on this, we design an event-based sparse attention +mechanism to enhance the interaction of event features in temporal and spatial +dimensions. The experimental results indicate that our method outperforms +existing state-of-the-art methods on the FE240 and COESOT datasets, providing +an effective processing manner for the event data. + +
+
+ comment: 15 pages, 8 figures, conference +
+
+
+
+
+ + ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new +variations of data appearance (domains) and open-set conditions, where both +known and novel categories are present at test time. The challenges of this +task arise from the dual need to generalize across diverse domains and +accurately quantify category novelty, which is critical for applications in +dynamic environments. Recently, meta-learning techniques have demonstrated +superior results in OSDG, effectively orchestrating the meta-train and -test +tasks by employing varied random categories and predefined domain partition +strategies. These approaches prioritize a well-designed training schedule over +traditional methods that focus primarily on data augmentation and the +enhancement of discriminative feature learning. The prevailing meta-learning +models in OSDG typically utilize a predefined sequential domain scheduler to +structure data partitions. However, a crucial aspect that remains inadequately +explored is the influence brought by strategies of domain schedulers during +training. In this paper, we observe that an adaptive domain scheduler benefits +more in OSDG compared with prefixed sequential and random domain schedulers. We +propose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve +an adaptive domain scheduler. This method strategically sequences domains by +assessing their reliabilities in utilizing a follower network, trained with +confidence scores learned in an evidential manner, regularized by max rebiasing +discrepancy, and optimized in a bi-level manner. The results show that our +method substantially improves OSDG performance and achieves more discriminative +embeddings for both the seen and unseen categories. The source code will be +available at https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code will be available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ☆ Triple Point Masking + + +
+ Existing 3D mask learning methods encounter performance bottlenecks under +limited data, and our objective is to overcome this limitation. In this paper, +we introduce a triple point masking scheme, named TPM, which serves as a +scalable framework for pre-training of masked autoencoders to achieve +multi-mask learning for 3D point clouds. Specifically, we augment the baselines +with two additional mask choices (i.e., medium mask and low mask) as our core +insight is that the recovery process of an object can manifest in diverse ways. +Previous high-masking schemes focus on capturing the global representation but +lack the fine-grained recovery capability, so that the generated pre-trained +weights tend to play a limited role in the fine-tuning process. With the +support of the proposed TPM, available methods can exhibit more flexible and +accurate completion capabilities, enabling the potential autoencoder in the +pre-training stage to consider multiple representations of a single 3D object. +In addition, an SVM-guided weight selection module is proposed to fill the +encoder parameters for downstream networks with the optimal weight during the +fine-tuning stage, maximizing linear accuracy and facilitating the acquisition +of intricate representations for new objects. Extensive experiments show that +the four baselines equipped with the proposed TPM achieve comprehensive +performance improvements on various downstream tasks. + +
+
+
+
+
+ + ☆ CAMOT: Camera Angle-aware Multi-Object Tracking + + +
+ This paper proposes CAMOT, a simple camera angle estimator for multi-object +tracking to tackle two problems: 1) occlusion and 2) inaccurate distance +estimation in the depth direction. Under the assumption that multiple objects +are located on a flat plane in each video frame, CAMOT estimates the camera +angle using object detection. In addition, it gives the depth of each object, +enabling pseudo-3D MOT. We evaluated its performance by adding it to various 2D +MOT methods on the MOT17 and MOT20 datasets and confirmed its effectiveness. +Applying CAMOT to ByteTrack, we obtained 63.8% HOTA, 80.6% MOTA, and 78.5% IDF1 +in MOT17, which are state-of-the-art results. Its computational cost is +significantly lower than the existing deep-learning-based depth estimators for +tracking. + +
+
+
+
+
+ + ☆ SimVG: A Simple Framework for Visual Grounding with Decoupled + Multi-modal Fusion NeurIPS2024 + + +
+ Visual grounding is a common vision task that involves grounding descriptive +sentences to the corresponding regions of an image. Most existing methods use +independent image-text encoding and apply complex hand-crafted modules or +encoder-decoder architectures for modal interaction and query reasoning. +However, their performance significantly drops when dealing with complex +textual expressions. This is because the former paradigm only utilizes limited +downstream data to fit the multi-modal feature fusion. Therefore, it is only +effective when the textual expressions are relatively simple. In contrast, +given the wide diversity of textual expressions and the uniqueness of +downstream training data, the existing fusion module, which extracts multimodal +content from a visual-linguistic context, has not been fully investigated. In +this paper, we present a simple yet robust transformer-based framework, SimVG, +for visual grounding. Specifically, we decouple visual-linguistic feature +fusion from downstream tasks by leveraging existing multimodal pre-trained +models and incorporating additional object tokens to facilitate deep +integration of downstream and pre-training tasks. Furthermore, we design a +dynamic weight-balance distillation method in the multi-branch synchronous +learning process to enhance the representation capability of the simpler +branch. This branch only consists of a lightweight MLP, which simplifies the +structure and improves reasoning speed. Experiments on six widely used VG +datasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the +superiority of SimVG. Finally, the proposed method not only achieves +improvements in efficiency and convergence speed but also attains new +state-of-the-art performance on these benchmarks. Codes and models will be +available at \url{https://github.com/Dmmm1997/SimVG}. + +
+
+ comment: 21pages, 11figures, NeurIPS2024 +
+
+
+
+
+ + ☆ Drone Stereo Vision for Radiata Pine Branch Detection and Distance + Measurement: Integrating SGBM and Segmentation Models + + +
+ Manual pruning of radiata pine trees presents significant safety risks due to +their substantial height and the challenging terrains in which they thrive. To +address these risks, this research proposes the development of a drone-based +pruning system equipped with specialized pruning tools and a stereo vision +camera, enabling precise detection and trimming of branches. Deep learning +algorithms, including YOLO and Mask R-CNN, are employed to ensure accurate +branch detection, while the Semi-Global Matching algorithm is integrated to +provide reliable distance estimation. The synergy between these techniques +facilitates the precise identification of branch locations and enables +efficient, targeted pruning. Experimental results demonstrate that the combined +implementation of YOLO and SGBM enables the drone to accurately detect branches +and measure their distances from the drone. This research not only improves the +safety and efficiency of pruning operations but also makes a significant +contribution to the advancement of drone technology in the automation of +agricultural and forestry practices, laying a foundational framework for +further innovations in environmental management. + +
+
+
+
+
+ + ☆ JoyType: A Robust Design for Multilingual Visual Text Creation AAAI 2025 + + +
+ Generating images with accurately represented text, especially in non-Latin +languages, poses a significant challenge for diffusion models. Existing +approaches, such as the integration of hint condition diagrams via auxiliary +networks (e.g., ControlNet), have made strides towards addressing this issue. +However, diffusion models often fall short in tasks requiring controlled text +generation, such as specifying particular fonts or producing text in small +fonts. In this paper, we introduce a novel approach for multilingual visual +text creation, named JoyType, designed to maintain the font style of text +during the image generation process. Our methodology begins with assembling a +training dataset, JoyType-1M, comprising 1 million pairs of data. Each pair +includes an image, its description, and glyph instructions corresponding to the +font style within the image. We then developed a text control network, Font +ControlNet, tasked with extracting font style information to steer the image +generation. To further enhance our model's ability to maintain font style, +notably in generating small-font text, we incorporated a multi-layer OCR-aware +loss into the diffusion process. This enhancement allows JoyType to direct text +rendering using low-level descriptors. Our evaluations, based on both visual +and accuracy metrics, demonstrate that JoyType significantly outperforms +existing state-of-the-art methods. Additionally, JoyType can function as a +plugin, facilitating the creation of varied image styles in conjunction with +other stable diffusion models on HuggingFace and CivitAI. Our project is +open-sourced on https://jdh-algo.github.io/JoyType/. + +
+
+ comment: Under Review at AAAI 2025 +
+
+
+
+
+ + ☆ EAGLE: Egocentric AGgregated Language-video Engine + + +
+ The rapid evolution of egocentric video analysis brings new insights into +understanding human activities and intentions from a first-person perspective. +Despite this progress, the fragmentation in tasks like action recognition, +procedure learning, and moment retrieval, \etc, coupled with inconsistent +annotations and isolated model development, hinders a holistic interpretation +of video content. In response, we introduce the EAGLE (Egocentric AGgregated +Language-video Engine) model and the EAGLE-400K dataset to provide a unified +framework that integrates various egocentric video understanding tasks. +EAGLE-400K, the \textit{first} large-scale instruction-tuning dataset tailored +for egocentric video, features 400K diverse samples to enhance a broad spectrum +of tasks from activity recognition to procedure knowledge learning. Moreover, +EAGLE, a strong video multimodal large language model (MLLM), is designed to +effectively capture both spatial and temporal information. In addition, we +propose a set of evaluation metrics designed to facilitate a thorough +assessment of MLLM for egocentric video understanding. Our extensive +experiments demonstrate EAGLE's superior performance over existing models, +highlighting its ability to balance task-specific understanding with holistic +video interpretation. With EAGLE, we aim to pave the way for research +opportunities and practical applications in real-world scenarios. + +
+
+ comment: Accepted by ACMMM 24 +
+
+
+
+
+ + ☆ Robotic Environmental State Recognition with Pre-Trained Vision-Language + Models and Black-Box Optimization + + +
+ In order for robots to autonomously navigate and operate in diverse +environments, it is essential for them to recognize the state of their +environment. On the other hand, the environmental state recognition has +traditionally involved distinct methods tailored to each state to be +recognized. In this study, we perform a unified environmental state recognition +for robots through the spoken language with pre-trained large-scale +vision-language models. We apply Visual Question Answering and Image-to-Text +Retrieval, which are tasks of Vision-Language Models. We show that with our +method, it is possible to recognize not only whether a room door is +open/closed, but also whether a transparent door is open/closed and whether +water is running in a sink, without training neural networks or manual +programming. In addition, the recognition accuracy can be improved by selecting +appropriate texts from the set of prepared texts based on black-box +optimization. For each state recognition, only the text set and its weighting +need to be changed, eliminating the need to prepare multiple different models +and programs, and facilitating the management of source code and computer +resource. We experimentally demonstrate the effectiveness of our method and +apply it to the recognition behavior on a mobile robot, Fetch. + +
+
+ comment: Accepted at Advanced Robotics, website - + https://haraduka.github.io/vlm-bbo/ +
+
+
+
+
+ + ☆ SCOMatch: Alleviating Overtrusting in Open-set Semi-supervised Learning ECCV 2024 + + +
+ Open-set semi-supervised learning (OSSL) leverages practical open-set +unlabeled data, comprising both in-distribution (ID) samples from seen classes +and out-of-distribution (OOD) samples from unseen classes, for semi-supervised +learning (SSL). Prior OSSL methods initially learned the decision boundary +between ID and OOD with labeled ID data, subsequently employing self-training +to refine this boundary. These methods, however, suffer from the tendency to +overtrust the labeled ID data: the scarcity of labeled data caused the +distribution bias between the labeled samples and the entire ID data, which +misleads the decision boundary to overfit. The subsequent self-training +process, based on the overfitted result, fails to rectify this problem. In this +paper, we address the overtrusting issue by treating OOD samples as an +additional class, forming a new SSL process. + Specifically, we propose SCOMatch, a novel OSSL method that 1) selects +reliable OOD samples as new labeled data with an OOD memory queue and a +corresponding update strategy and 2) integrates the new SSL process into the +original task through our Simultaneous Close-set and Open-set self-training. +SCOMatch refines the decision boundary of ID and OOD classes across the entire +dataset, thereby leading to improved results. Extensive experimental results +show that SCOMatch significantly outperforms the state-of-the-art methods on +various benchmarks. The effectiveness is further verified through ablation +studies and visualization. + +
+
+ comment: ECCV 2024 accepted +
+
+
+
+
+ + ☆ NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human + Connectomes NeurIPS 2024 + + +
+ Although modern imaging technologies allow us to study connectivity between +two distinct brain regions in-vivo, an in-depth understanding of how anatomical +structure supports brain function and how spontaneous functional fluctuations +emerge remarkable cognition is still elusive. Meanwhile, tremendous efforts +have been made in the realm of machine learning to establish the nonlinear +mapping between neuroimaging data and phenotypic traits. However, the absence +of neuroscience insight in the current approaches poses significant challenges +in understanding cognitive behavior from transient neural activities. To +address this challenge, we put the spotlight on the coupling mechanism of +structural connectivity (SC) and functional connectivity (FC) by formulating +such network neuroscience question into an expressive graph representation +learning problem for high-order topology. Specifically, we introduce the +concept of topological detour to characterize how a ubiquitous instance of FC +(direct link) is supported by neural pathways (detour) physically wired by SC, +which forms a cyclic loop interacted by brain structure and function. In the +clich\'e of machine learning, the multi-hop detour pathway underlying SC-FC +coupling allows us to devise a novel multi-head self-attention mechanism within +Transformer to capture multi-modal feature representation from paired graphs of +SC and FC. Taken together, we propose a biological-inspired deep model, coined +as NeuroPath, to find putative connectomic feature representations from the +unprecedented amount of neuroimages, which can be plugged into various +downstream applications such as task recognition and disease diagnosis. We have +evaluated NeuroPath on large-scale public datasets including HCP and UK Biobank +under supervised and zero-shot learning, where the state-of-the-art performance +by our NeuroPath indicates great potential in network neuroscience. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization, recent advances primarily focus on improving the LLM +components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector. Extensive ablation experiments +validate the effectiveness of introducing CMoE under any configuration, with up +to an average 8% performance gains. We further provide interpretation analysis +of the tug-of-war problem from the perspective of gradient optimization and +parameter statistics. Compared to previous state-of-the-art medical MLLMs, +Uni-Med achieves competitive or superior evaluation metrics on diverse tasks. +Code, data and model will be soon available at GitHub. + +
+
+
+
+
+ + ☆ Shape-intensity knowledge distillation for robust medical image + segmentation + + +
+ Many medical image segmentation methods have achieved impressive results. +Yet, most existing methods do not take into account the shape-intensity prior +information. This may lead to implausible segmentation results, in particular +for images of unseen datasets. In this paper, we propose a novel approach to +incorporate joint shape-intensity prior information into the segmentation +network. Specifically, we first train a segmentation network (regarded as the +teacher network) on class-wise averaged training images to extract valuable +shape-intensity information, which is then transferred to a student +segmentation network with the same network architecture as the teacher via +knowledge distillation. In this way, the student network regarded as the final +segmentation model can effectively integrate the shape-intensity prior +information, yielding more accurate segmentation results. Despite its +simplicity, experiments on five medical image segmentation tasks of different +modalities demonstrate that the proposed Shape-Intensity Knowledge Distillation +(SIKD) consistently improves several baseline models (including recent MaxStyle +and SAMed) under intra-dataset evaluation, and significantly improves the +cross-dataset generalization ability. The code is available at +https://github.com/whdong-whu/SIKD. + +
+
+
+
+
+ + ☆ Learning Quantized Adaptive Conditions for Diffusion Models + + +
+ The curvature of ODE trajectories in diffusion models hinders their ability +to generate high-quality images in a few number of function evaluations (NFE). +In this paper, we propose a novel and effective approach to reduce trajectory +curvature by utilizing adaptive conditions. By employing a extremely +light-weight quantized encoder, our method incurs only an additional 1% of +training parameters, eliminates the need for extra regularization terms, yet +achieves significantly better sample quality. Our approach accelerates ODE +sampling while preserving the downstream task image editing capabilities of SDE +techniques. Extensive experiments verify that our method can generate high +quality results under extremely limited sampling costs. With only 6 NFE, we +achieve 5.14 FID on CIFAR-10, 6.91 FID on FFHQ 64x64 and 3.10 FID on AFHQv2. + +
+
+
+
+
+ + ☆ Global-Local Medical SAM Adaptor Based on Full Adaption + + +
+ Emerging of visual language models, such as the segment anything model (SAM), +have made great breakthroughs in the field of universal semantic segmentation +and significantly aid the improvements of medical image segmentation, in +particular with the help of Medical SAM adaptor (Med-SA). However, Med-SA still +can be improved, as it fine-tunes SAM in a partial adaption manner. To resolve +this problem, we present a novel global medical SAM adaptor (GMed-SA) with full +adaption, which can adapt SAM globally. We further combine GMed-SA and Med-SA +to propose a global-local medical SAM adaptor (GLMed-SA) to adapt SAM both +globally and locally. Extensive experiments have been performed on the +challenging public 2D melanoma segmentation dataset. The results show that +GLMed-SA outperforms several state-of-the-art semantic segmentation methods on +various evaluation metrics, demonstrating the superiority of our methods. + +
+
+
+
+
+ + ☆ Revisiting Deep Ensemble Uncertainty for Enhanced Medical Anomaly + Detection MICCAI2024 + + +
+ Medical anomaly detection (AD) is crucial in pathological identification and +localization. Current methods typically rely on uncertainty estimation in deep +ensembles to detect anomalies, assuming that ensemble learners should agree on +normal samples while exhibiting disagreement on unseen anomalies in the output +space. However, these methods may suffer from inadequate disagreement on +anomalies or diminished agreement on normal samples. To tackle these issues, we +propose D2UE, a Diversified Dual-space Uncertainty Estimation framework for +medical anomaly detection. To effectively balance agreement and disagreement +for anomaly detection, we propose Redundancy-Aware Repulsion (RAR), which uses +a similarity kernel that remains invariant to both isotropic scaling and +orthogonal transformations, explicitly promoting diversity in learners' feature +space. Moreover, to accentuate anomalous regions, we develop Dual-Space +Uncertainty (DSU), which utilizes the ensemble's uncertainty in input and +output spaces. In input space, we first calculate gradients of reconstruction +error with respect to input images. The gradients are then integrated with +reconstruction outputs to estimate uncertainty for inputs, enabling effective +anomaly discrimination even when output space disagreement is minimal. We +conduct a comprehensive evaluation of five medical benchmarks with different +backbones. Experimental results demonstrate the superiority of our method to +state-of-the-art methods and the effectiveness of each component in our +framework. Our code is available at https://github.com/Rubiscol/D2UE. + +
+
+ comment: Early accepted by MICCAI2024 +
+
+
+
+
+ + ☆ MultiClimate: Multimodal Stance Detection on Climate Change Videos + + +
+ Climate change (CC) has attracted increasing attention in NLP in recent +years. However, detecting the stance on CC in multimodal data is understudied +and remains challenging due to a lack of reliable datasets. To improve the +understanding of public opinions and communication strategies, this paper +presents MultiClimate, the first open-source manually-annotated stance +detection dataset with $100$ CC-related YouTube videos and $4,209$ +frame-transcript pairs. We deploy state-of-the-art vision and language models, +as well as multimodal models for MultiClimate stance detection. Results show +that text-only BERT significantly outperforms image-only ResNet50 and ViT. +Combining both modalities achieves state-of-the-art, $0.747$/$0.749$ in +accuracy/F1. Our 100M-sized fusion models also beat CLIP and BLIP, as well as +the much larger 9B-sized multimodal IDEFICS and text-only Llama3 and Gemma2, +indicating that multimodal stance detection remains challenging for large +language models. Our code, dataset, as well as supplementary materials, are +available at https://github.com/werywjw/MultiClimate. + +
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ Does End-to-End Autonomous Driving Really Need Perception Tasks? + + +
+ End-to-End Autonomous Driving (E2EAD) methods typically rely on supervised +perception tasks to extract explicit scene information (e.g., objects, maps). +This reliance necessitates expensive annotations and constrains deployment and +data scalability in real-time applications. In this paper, we introduce SSR, a +novel framework that utilizes only 16 navigation-guided tokens as Sparse Scene +Representation, efficiently extracting crucial scene information for E2EAD. Our +method eliminates the need for supervised sub-tasks, allowing computational +resources to concentrate on essential elements directly related to navigation +intent. We further introduce a temporal enhancement module that employs a +Bird's-Eye View (BEV) world model, aligning predicted future scenes with actual +future scenes through self-supervision. SSR achieves state-of-the-art planning +performance on the nuScenes dataset, demonstrating a 27.2\% relative reduction +in L2 error and a 51.6\% decrease in collision rate to the leading E2EAD +method, UniAD. Moreover, SSR offers a 10.9$\times$ faster inference speed and +13$\times$ faster training time. This framework represents a significant leap +in real-time autonomous driving systems and paves the way for future scalable +deployment. Code will be released at \url{https://github.com/PeidongLi/SSR}. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ DRL-STNet: Unsupervised Domain Adaptation for Cross-modality Medical + Image Segmentation via Disentangled Representation Learning MICCAI 2024 + + +
+ Unsupervised domain adaptation (UDA) is essential for medical image +segmentation, especially in cross-modality data scenarios. UDA aims to transfer +knowledge from a labeled source domain to an unlabeled target domain, thereby +reducing the dependency on extensive manual annotations. This paper presents +DRL-STNet, a novel framework for cross-modality medical image segmentation that +leverages generative adversarial networks (GANs), disentangled representation +learning (DRL), and self-training (ST). Our method leverages DRL within a GAN +to translate images from the source to the target modality. Then, the +segmentation model is initially trained with these translated images and +corresponding source labels and then fine-tuned iteratively using a combination +of synthetic and real images with pseudo-labels and real labels. The proposed +framework exhibits superior performance in abdominal organ segmentation on the +FLARE challenge dataset, surpassing state-of-the-art methods by 11.4% in the +Dice similarity coefficient and by 13.1% in the Normalized Surface Dice metric, +achieving scores of 74.21% and 80.69%, respectively. The average running time +is 41 seconds, and the area under the GPU memory-time curve is 11,292 MB. These +results indicate the potential of DRL-STNet for enhancing cross-modality +medical image segmentation tasks. + +
+
+ comment: MICCAI 2024 Challenge, FLARE Challenge, Unsupervised domain + adaptation, Organ segmentation, Feature disentanglement, Self-training +
+
+
+
+
+ + ☆ Photon Inhibition for Energy-Efficient Single-Photon Imaging ECCV 2024 + + +
+ Single-photon cameras (SPCs) are emerging as sensors of choice for various +challenging imaging applications. One class of SPCs based on the single-photon +avalanche diode (SPAD) detects individual photons using an avalanche process; +the raw photon data can then be processed to extract scene information under +extremely low light, high dynamic range, and rapid motion. Yet, single-photon +sensitivity in SPADs comes at a cost -- each photon detection consumes more +energy than that of a CMOS camera. This avalanche power significantly limits +sensor resolution and could restrict widespread adoption of SPAD-based SPCs. We +propose a computational-imaging approach called \emph{photon inhibition} to +address this challenge. Photon inhibition strategically allocates detections in +space and time based on downstream inference task goals and resource +constraints. We develop lightweight, on-sensor computational inhibition +policies that use past photon data to disable SPAD pixels in real-time, to +select the most informative future photons. As case studies, we design policies +tailored for image reconstruction and edge detection, and demonstrate, both via +simulations and real SPC captured data, considerable reduction in photon +detections (over 90\% of photons) while maintaining task performance metrics. +Our work raises the question of ``which photons should be detected?'', and +paves the way for future energy-efficient single-photon imaging. + +
+
+ comment: Accepted for ECCV 2024. Supplementary material and code available at + https://wisionlab.com/project/inhibition +
+
+
+
+
+ + ☆ DeBaRA: Denoising-Based 3D Room Arrangement Generation NeurIPS 2024 + + +
+ Generating realistic and diverse layouts of furnished indoor 3D scenes +unlocks multiple interactive applications impacting a wide range of industries. +The inherent complexity of object interactions, the limited amount of available +data and the requirement to fulfill spatial constraints all make generative +modeling for 3D scene synthesis and arrangement challenging. Current methods +address these challenges autoregressively or by using off-the-shelf diffusion +objectives by simultaneously predicting all attributes without 3D reasoning +considerations. In this paper, we introduce DeBaRA, a score-based model +specifically tailored for precise, controllable and flexible arrangement +generation in a bounded environment. We argue that the most critical component +of a scene synthesis system is to accurately establish the size and position of +various objects within a restricted area. Based on this insight, we propose a +lightweight conditional score-based model designed with 3D spatial awareness at +its core. We demonstrate that by focusing on spatial attributes of objects, a +single trained DeBaRA model can be leveraged at test time to perform several +downstream applications such as scene synthesis, completion and re-arrangement. +Further, we introduce a novel Self Score Evaluation procedure so it can be +optimally employed alongside external LLM models. We evaluate our approach +through extensive experiments and demonstrate significant improvement upon +state-of-the-art approaches in a range of scenarios. + +
+
+ comment: Accepted at NeurIPS 2024. Preprint version +
+
+
+
+
+ + ☆ Automated Segmentation and Analysis of Microscopy Images of Laser Powder + Bed Fusion Melt Tracks + + +
+ With the increasing adoption of metal additive manufacturing (AM), +researchers and practitioners are turning to data-driven approaches to optimise +printing conditions. Cross-sectional images of melt tracks provide valuable +information for tuning process parameters, developing parameter scaling data, +and identifying defects. Here we present an image segmentation neural network +that automatically identifies and measures melt track dimensions from a +cross-section image. We use a U-Net architecture to train on a data set of 62 +pre-labelled images obtained from different labs, machines, and materials +coupled with image augmentation. When neural network hyperparameters such as +batch size and learning rate are properly tuned, the learned model shows an +accuracy for classification of over 99% and an F1 score over 90%. The neural +network exhibits robustness when tested on images captured by various users, +printed on different machines, and acquired using different microscopes. A +post-processing module extracts the height and width of the melt pool, and the +wetting angles. We discuss opportunities to improve model performance and +avenues for transfer learning, such as extension to other AM processes such as +directed energy deposition. + +
+
+ comment: 21 pages, 10 figures +
+
+
+
+
+ + ☆ Realistic Evaluation of Model Merging for Compositional Generalization + + +
+ Merging has become a widespread way to cheaply combine individual models into +a single model that inherits their capabilities and attains better performance. +This popularity has spurred rapid development of many new merging methods, +which are typically validated in disparate experimental settings and frequently +differ in the assumptions made about model architecture, data availability, and +computational budget. In this work, we characterize the relative merits of +different merging methods by evaluating them in a shared experimental setting +and precisely identifying the practical requirements of each method. +Specifically, our setting focuses on using merging for compositional +generalization of capabilities in image classification, image generation, and +natural language processing. Additionally, we measure the computational costs +of different merging methods as well as how they perform when scaling the +number of models being merged. Taken together, our results clarify the state of +the field of model merging and provide a comprehensive and rigorous +experimental setup to test new methods. + +
+
+
+
+
+ + ☆ Harnessing Wavelet Transformations for Generalizable Deepfake Forgery + Detection + + +
+ The evolution of digital image manipulation, particularly with the +advancement of deep generative models, significantly challenges existing +deepfake detection methods, especially when the origin of the deepfake is +obscure. To tackle the increasing complexity of these forgeries, we propose +\textbf{Wavelet-CLIP}, a deepfake detection framework that integrates wavelet +transforms with features derived from the ViT-L/14 architecture, pre-trained in +the CLIP fashion. Wavelet-CLIP utilizes Wavelet Transforms to deeply analyze +both spatial and frequency features from images, thus enhancing the model's +capability to detect sophisticated deepfakes. To verify the effectiveness of +our approach, we conducted extensive evaluations against existing +state-of-the-art methods for cross-dataset generalization and detection of +unseen images generated by standard diffusion models. Our method showcases +outstanding performance, achieving an average AUC of 0.749 for cross-data +generalization and 0.893 for robustness against unseen deepfakes, outperforming +all compared methods. The code can be reproduced from the repo: +\url{https://github.com/lalithbharadwajbaru/Wavelet-CLIP} + +
+
+
+
+
+ + ☆ SOAR: Self-supervision Optimized UAV Action Recognition with Efficient + Object-Aware Pretraining + + +
+ We introduce SOAR, a novel Self-supervised pretraining algorithm for aerial +footage captured by Unmanned Aerial Vehicles (UAVs). We incorporate human +object knowledge throughout the pretraining process to enhance UAV video +pretraining efficiency and downstream action recognition performance. This is +in contrast to prior works that primarily incorporate object information during +the fine-tuning stage. Specifically, we first propose a novel object-aware +masking strategy designed to retain the visibility of certain patches related +to objects throughout the pretraining phase. Second, we introduce an +object-aware loss function that utilizes object information to adjust the +reconstruction loss, preventing bias towards less informative background +patches. In practice, SOAR with a vanilla ViT backbone, outperforms best UAV +action recognition models, recording a 9.7% and 21.4% boost in top-1 accuracy +on the NEC-Drone and UAV-Human datasets, while delivering an inference speed of +18.7ms per video, making it 2x to 5x faster. Additionally, SOAR obtains +comparable accuracy to prior self-supervised learning (SSL) methods while +requiring 87.5% less pretraining time and 25% less memory usage + +
+
+
+
+
+ + ☆ Flat'n'Fold: A Diverse Multi-Modal Dataset for Garment Perception and + Manipulation + + +
+ We present Flat'n'Fold, a novel large-scale dataset for garment manipulation +that addresses critical gaps in existing datasets. Comprising 1,212 human and +887 robot demonstrations of flattening and folding 44 unique garments across 8 +categories, Flat'n'Fold surpasses prior datasets in size, scope, and diversity. +Our dataset uniquely captures the entire manipulation process from crumpled to +folded states, providing synchronized multi-view RGB-D images, point clouds, +and action data, including hand or gripper positions and rotations. We quantify +the dataset's diversity and complexity compared to existing benchmarks and show +that our dataset features natural and diverse manipulations of real-world +demonstrations of human and robot demonstrations in terms of visual and action +information. To showcase Flat'n'Fold's utility, we establish new benchmarks for +grasping point prediction and subtask decomposition. Our evaluation of +state-of-the-art models on these tasks reveals significant room for +improvement. This underscores Flat'n'Fold's potential to drive advances in +robotic perception and manipulation of deformable objects. Our dataset can be +downloaded at https://cvas-ug.github.io/flat-n-fold + +
+
+
+
+
+ + ☆ Efficient Microscopic Image Instance Segmentation for Food Crystal + Quality Control + + +
+ This paper is directed towards the food crystal quality control area for +manufacturing, focusing on efficiently predicting food crystal counts and size +distributions. Previously, manufacturers used the manual counting method on +microscopic images of food liquid products, which requires substantial human +effort and suffers from inconsistency issues. Food crystal segmentation is a +challenging problem due to the diverse shapes of crystals and their surrounding +hard mimics. To address this challenge, we propose an efficient instance +segmentation method based on object detection. Experimental results show that +the predicted crystal counting accuracy of our method is comparable with +existing segmentation methods, while being five times faster. Based on our +experiments, we also define objective criteria for separating hard mimics and +food crystals, which could benefit manual annotation tasks on similar dataset. + +
+
+
+
+
+ + ☆ Advancing Object Detection in Transportation with Multimodal Large + Language Models (MLLMs): A Comprehensive Review and Empirical Testing + + +
+ This study aims to comprehensively review and empirically evaluate the +application of multimodal large language models (MLLMs) and Large Vision Models +(VLMs) in object detection for transportation systems. In the first fold, we +provide a background about the potential benefits of MLLMs in transportation +applications and conduct a comprehensive review of current MLLM technologies in +previous studies. We highlight their effectiveness and limitations in object +detection within various transportation scenarios. The second fold involves +providing an overview of the taxonomy of end-to-end object detection in +transportation applications and future directions. Building on this, we +proposed empirical analysis for testing MLLMs on three real-world +transportation problems that include object detection tasks namely, road safety +attributes extraction, safety-critical event detection, and visual reasoning of +thermal images. Our findings provide a detailed assessment of MLLM performance, +uncovering both strengths and areas for improvement. Finally, we discuss +practical limitations and challenges of MLLMs in enhancing object detection in +transportation, thereby offering a roadmap for future research and development +in this critical area. + +
+
+
+
+
+ + ☆ Synthesizing beta-amyloid PET images from T1-weighted Structural MRI: A + Preliminary Study + + +
+ Beta-amyloid positron emission tomography (A$\beta$-PET) imaging has become a +critical tool in Alzheimer's disease (AD) research and diagnosis, providing +insights into the pathological accumulation of amyloid plaques, one of the +hallmarks of AD. However, the high cost, limited availability, and exposure to +radioactivity restrict the widespread use of A$\beta$-PET imaging, leading to a +scarcity of comprehensive datasets. Previous studies have suggested that +structural magnetic resonance imaging (MRI), which is more readily available, +may serve as a viable alternative for synthesizing A$\beta$-PET images. In this +study, we propose an approach to utilize 3D diffusion models to synthesize +A$\beta$-PET images from T1-weighted MRI scans, aiming to overcome the +limitations associated with direct PET imaging. Our method generates +high-quality A$\beta$-PET images for cognitive normal cases, although it is +less effective for mild cognitive impairment (MCI) patients due to the +variability in A$\beta$ deposition patterns among subjects. Our preliminary +results suggest that incorporating additional data, such as a larger sample of +MCI cases and multi-modality information including clinical and demographic +details, cognitive and functional assessments, and longitudinal data, may be +necessary to improve A$\beta$-PET image synthesis for MCI patients. + +
+
+
+
+
+ + ☆ Task-recency bias strikes back: Adapting covariances in Exemplar-Free + Class Incremental Learning NeurIPS 2024 + + +
+ Exemplar-Free Class Incremental Learning (EFCIL) tackles the problem of +training a model on a sequence of tasks without access to past data. Existing +state-of-the-art methods represent classes as Gaussian distributions in the +feature extractor's latent space, enabling Bayes classification or training the +classifier by replaying pseudo features. However, we identify two critical +issues that compromise their efficacy when the feature extractor is updated on +incremental tasks. First, they do not consider that classes' covariance +matrices change and must be adapted after each task. Second, they are +susceptible to a task-recency bias caused by dimensionality collapse occurring +during training. In this work, we propose AdaGauss -- a novel method that +adapts covariance matrices from task to task and mitigates the task-recency +bias owing to the additional anti-collapse loss function. AdaGauss yields +state-of-the-art results on popular EFCIL benchmarks and datasets when training +from scratch or starting from a pre-trained backbone. The code is available at: +https://github.com/grypesc/AdaGauss. + +
+
+ comment: Accepted for NeurIPS 2024 +
+
+
+
+
+ + ☆ Omni6D: Large-Vocabulary 3D Object Dataset for Category-Level 6D Object + Pose Estimation ECCV 2024 + + +
+ 6D object pose estimation aims at determining an object's translation, +rotation, and scale, typically from a single RGBD image. Recent advancements +have expanded this estimation from instance-level to category-level, allowing +models to generalize across unseen instances within the same category. However, +this generalization is limited by the narrow range of categories covered by +existing datasets, such as NOCS, which also tend to overlook common real-world +challenges like occlusion. To tackle these challenges, we introduce Omni6D, a +comprehensive RGBD dataset featuring a wide range of categories and varied +backgrounds, elevating the task to a more realistic context. 1) The dataset +comprises an extensive spectrum of 166 categories, 4688 instances adjusted to +the canonical pose, and over 0.8 million captures, significantly broadening the +scope for evaluation. 2) We introduce a symmetry-aware metric and conduct +systematic benchmarks of existing algorithms on Omni6D, offering a thorough +exploration of new challenges and insights. 3) Additionally, we propose an +effective fine-tuning approach that adapts models from previous datasets to our +extensive vocabulary setting. We believe this initiative will pave the way for +new insights and substantial progress in both the industrial and academic +fields, pushing forward the boundaries of general 6D pose estimation. + +
+
+ comment: ECCV 2024 (poster). Github page: https://github.com/3DTopia/Omni6D +
+
+
+
+
+ + ♻ ☆ Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with + Enhanced Generalization and Personalization Abilities WACV 2025 + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant +potential for modeling 3D head avatars, providing greater flexibility than +mesh-based methods and more efficient rendering compared to NeRF-based +approaches. Despite these advancements, the creation of controllable 3DGS-based +head avatars remains time-intensive, often requiring tens of minutes to hours. +To expedite this process, we here introduce the ``Gaussian D\'ej\`a-vu" +framework, which first obtains a generalized model of the head avatar and then +personalizes the result. The generalized model is trained on large 2D +(synthetic and real) image datasets. This model provides a well-initialized 3D +Gaussian head that is further refined using a monocular video to achieve the +personalized head avatar. For personalizing, we propose learnable +expression-aware rectification blendmaps to correct the initial 3D Gaussians, +ensuring rapid convergence without the reliance on neural networks. Experiments +demonstrate that the proposed method meets its objectives. It outperforms +state-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as +well as reduces training time consumption to at least a quarter of the existing +methods, producing the avatar in minutes. + +
+
+ comment: 11 pages, Accepted by WACV 2025 in Round 1 +
+
+
+
+
+ + ♻ ☆ Chat-Scene: Bridging 3D Scene and Large Language Models with Object + Identifiers + + +
+ Recent advancements in 3D Large Language Models (LLMs) have demonstrated +promising capabilities for 3D scene understanding. However, previous methods +exhibit deficiencies in general referencing and grounding capabilities for +intricate scene comprehension. In this paper, we introduce the use of object +identifiers and object-centric representations to interact with scenes at the +object level. Specifically, we decompose the input 3D scene into a set of +object proposals, each assigned a unique identifier token, which enables +efficient object referencing and grounding during user-assistant interactions. +Given the scarcity of scene-language data, we model the scene embeddings as a +sequence of explicit object-level embeddings, derived from semantic-rich 2D or +3D representations. By employing object identifiers, we transform diverse 3D +scene-language tasks into a unified question-answering format, facilitating +joint training without the need for additional task-specific heads. With +minimal fine-tuning on all downstream tasks, our model significantly +outperforms existing methods on benchmarks including ScanRefer, Multi3DRefer, +Scan2Cap, ScanQA, and SQA3D. + +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Synthesizing Environment-Specific People in Photographs ECCV 2024 + + +
+ We present ESP, a novel method for context-aware full-body generation, that +enables photo-realistic synthesis and inpainting of people wearing clothing +that is semantically appropriate for the scene depicted in an input photograph. +ESP is conditioned on a 2D pose and contextual cues that are extracted from the +photograph of the scene and integrated into the generation process, where the +clothing is modeled explicitly with human parsing masks (HPM). Generated HPMs +are used as tight guiding masks for inpainting, such that no changes are made +to the original background. Our models are trained on a dataset containing a +set of in-the-wild photographs of people covering a wide range of different +environments. The method is analyzed quantitatively and qualitatively, and we +show that ESP outperforms the state-of-the-art on the task of contextual +full-body generation. + +
+
+ comment: Accepted at ECCV 2024, Project: https://esp.is.tue.mpg.de +
+
+
+
+
+ + ♻ ☆ Valeo4Cast: A Modular Approach to End-to-End Forecasting ECCV + + +
+ Motion forecasting is crucial in autonomous driving systems to anticipate the +future trajectories of surrounding agents such as pedestrians, vehicles, and +traffic signals. In end-to-end forecasting, the model must jointly detect and +track from sensor data (cameras or LiDARs) the past trajectories of the +different elements of the scene and predict their future locations. We depart +from the current trend of tackling this task via end-to-end training from +perception to forecasting, and instead use a modular approach. We individually +build and train detection, tracking and forecasting modules. We then only use +consecutive finetuning steps to integrate the modules better and alleviate +compounding errors. We conduct an in-depth study on the finetuning strategies +and it reveals that our simple yet effective approach significantly improves +performance on the end-to-end forecasting benchmark. Consequently, our solution +ranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82 +mAPf. We surpass forecasting results by +17.1 points over last year's winner +and by +13.3 points over this year's runner-up. This remarkable performance in +forecasting can be explained by our modular paradigm, which integrates +finetuning strategies and significantly outperforms the end-to-end-trained +counterparts. The code, model weights and results are made available +https://github.com/valeoai/valeo4cast. + +
+
+ comment: Winning solution of the Argoverse 2 "Unified Detection, Tracking, and + Forecasting" challenge; work accepted at Road++ ECCVW 2024 +
+
+
+
+
+ + ♻ ☆ Disentangled Clothed Avatar Generation from Text Descriptions + + +
+ In this paper, we introduce a novel text-to-avatar generation method that +separately generates the human body and the clothes and allows high-quality +animation on the generated avatar. While recent advancements in text-to-avatar +generation have yielded diverse human avatars from text prompts, these methods +typically combine all elements-clothes, hair, and body-into a single 3D +representation. Such an entangled approach poses challenges for downstream +tasks like editing or animation. To overcome these limitations, we propose a +novel disentangled 3D avatar representation named Sequentially Offset-SMPL +(SO-SMPL), building upon the SMPL model. SO-SMPL represents the human body and +clothes with two separate meshes but associates them with offsets to ensure the +physical alignment between the body and the clothes. Then, we design a Score +Distillation Sampling (SDS)-based distillation framework to generate the +proposed SO-SMPL representation from text prompts. Our approach not only +achieves higher texture and geometry quality and better semantic alignment with +text prompts, but also significantly improves the visual quality of character +animation, virtual try-on, and avatar editing. Project page: +https://shanemankiw.github.io/SO-SMPL/. + +
+
+ comment: Project page: https://shanemankiw.github.io/SO-SMPL/ +
+
+
+
+
+ + ♻ ☆ Jumping through Local Minima: Quantization in the Loss Landscape of + Vision Transformers + + +
+ Quantization scale and bit-width are the most important parameters when +considering how to quantize a neural network. Prior work focuses on optimizing +quantization scales in a global manner through gradient methods (gradient +descent \& Hessian analysis). Yet, when applying perturbations to quantization +scales, we observe a very jagged, highly non-smooth test loss landscape. In +fact, small perturbations in quantization scale can greatly affect accuracy, +yielding a $0.5-0.8\%$ accuracy boost in 4-bit quantized vision transformers +(ViTs). In this regime, gradient methods break down, since they cannot reliably +reach local minima. In our work, dubbed Evol-Q, we use evolutionary search to +effectively traverse the non-smooth landscape. Additionally, we propose using +an infoNCE loss, which not only helps combat overfitting on the small +calibration dataset ($1,000$ images) but also makes traversing such a highly +non-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully +quantized ViT-Base by $10.30\%$, $0.78\%$, and $0.15\%$ for $3$-bit, $4$-bit, +and $8$-bit weight quantization levels. Extensive experiments on a variety of +CNN and ViT architectures further demonstrate its robustness in extreme +quantization scenarios. Our code is available at +https://github.com/enyac-group/evol-q + +
+
+ comment: arXiv admin note: text overlap with arXiv:2211.09643 +
+
+
+
+
+ + ♻ ☆ LingoQA: Visual Question Answering for Autonomous Driving ECCV 2024 + + +
+ We introduce LingoQA, a novel dataset and benchmark for visual question +answering in autonomous driving. The dataset contains 28K unique short video +scenarios, and 419K annotations. Evaluating state-of-the-art vision-language +models on our benchmark shows that their performance is below human +capabilities, with GPT-4V responding truthfully to 59.6% of the questions +compared to 96.6% for humans. For evaluation, we propose a truthfulness +classifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation +coefficient to human evaluations, surpassing existing techniques like METEOR, +BLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run +extensive ablation studies to understand its performance. We release our +dataset and benchmark as an evaluation platform for vision-language models in +autonomous driving. + +
+
+ comment: Accepted to ECCV 2024. Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ ☆ Manydepth2: Motion-Aware Self-Supervised Monocular Depth Estimation in + Dynamic Scenes + + +
+ Despite advancements in self-supervised monocular depth estimation, +challenges persist in dynamic scenarios due to the dependence on assumptions +about a static world. In this paper, we present Manydepth2, a Motion-Guided +Cost Volume Depth Net, to achieve precise depth estimation for both dynamic +objects and static backgrounds, all while maintaining computational efficiency. +To tackle the challenges posed by dynamic content, we incorporate optical flow +and coarse monocular depth to create a novel static reference frame. This frame +is then utilized to build a motion-guided cost volume in collaboration with the +target frame. Additionally, to enhance the accuracy and resilience of the +network structure, we introduce an attention-based depth net architecture to +effectively integrate information from feature maps with varying resolutions. +Compared to methods with similar computational costs, Manydepth2 achieves a +significant reduction of approximately five percent in root-mean-square error +for self-supervised monocular depth estimation on the KITTI-2015 dataset. The +code could be found: https://github.com/kaichen-z/Manydepth2 + +
+
+ comment: Monocular Depth Estimation, Self-Supervised, Optical Flow +
+
+
+
+
+ + ♻ ☆ CollaMamba: Efficient Collaborative Perception with Cross-Agent + Spatial-Temporal State Space Model AAAI 2025 + + +
+ By sharing complementary perceptual information, multi-agent collaborative +perception fosters a deeper understanding of the environment. Recent studies on +collaborative perception mostly utilize CNNs or Transformers to learn feature +representation and fusion in the spatial dimension, which struggle to handle +long-range spatial-temporal features under limited computing and communication +resources. Holistically modeling the dependencies over extensive spatial areas +and extended temporal frames is crucial to enhancing feature quality. To this +end, we propose a resource efficient cross-agent spatial-temporal collaborative +state space model (SSM), named CollaMamba. Initially, we construct a +foundational backbone network based on spatial SSM. This backbone adeptly +captures positional causal dependencies from both single-agent and cross-agent +views, yielding compact and comprehensive intermediate features while +maintaining linear complexity. Furthermore, we devise a history-aware feature +boosting module based on temporal SSM, extracting contextual cues from extended +historical frames to refine vague features while preserving low overhead. +Extensive experiments across several datasets demonstrate that CollaMamba +outperforms state-of-the-art methods, achieving higher model accuracy while +reducing computational and communication overhead by up to 71.9% and 1/64, +respectively. This work pioneers the exploration of the Mamba's potential in +collaborative perception. The source code will be made available. + +
+
+ comment: Submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Computational Trichromacy Reconstruction: Empowering the Color-Vision + Deficient to Recognize Colors Using Augmented Reality + + +
+ We propose an assistive technology that helps individuals with Color Vision +Deficiencies (CVD) to recognize/name colors. A dichromat's color perception is +a reduced two-dimensional (2D) subset of a normal trichromat's three +dimensional color (3D) perception, leading to confusion when visual stimuli +that appear identical to the dichromat are referred to by different color +names. Using our proposed system, CVD individuals can interactively induce +distinct perceptual changes to originally confusing colors via a computational +color space transformation. By combining their original 2D precepts for colors +with the discriminative changes, a three dimensional color space is +reconstructed, where the dichromat can learn to resolve color name confusions +and accurately recognize colors. Our system is implemented as an Augmented +Reality (AR) interface on smartphones, where users interactively control the +rotation through swipe gestures and observe the induced color shifts in the +camera view or in a displayed image. Through psychophysical experiments and a +longitudinal user study, we demonstrate that such rotational color shifts have +discriminative power (initially confusing colors become distinct under +rotation) and exhibit structured perceptual shifts dichromats can learn with +modest training. The AR App is also evaluated in two real-world scenarios +(building with lego blocks and interpreting artistic works); users all report +positive experience in using the App to recognize object colors that they +otherwise could not. + +
+
+
+
+
+ + ♻ ☆ EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS + + +
+ Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view +scene synthesis. It addresses the challenges of lengthy training times and slow +rendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid, +differentiable rasterization of 3D Gaussians, 3D-GS achieves real-time +rendering and accelerated training. They, however, demand substantial memory +resources for both training and storage, as they require millions of Gaussians +in their point cloud representation for each scene. We present a technique +utilizing quantized embeddings to significantly reduce per-point memory storage +requirements and a coarse-to-fine training strategy for a faster and more +stable optimization of the Gaussian point clouds. Our approach develops a +pruning stage which results in scene representations with fewer Gaussians, +leading to faster training times and rendering speeds for real-time rendering +of high resolution scenes. We reduce storage memory by more than an order of +magnitude all while preserving the reconstruction quality. We validate the +effectiveness of our approach on a variety of datasets and scenes preserving +the visual quality while consuming 10-20x lesser memory and faster +training/inference speed. Project page and code is available +https://efficientgaussian.github.io + +
+
+ comment: Website: https://efficientgaussian.github.io Code: + https://github.com/Sharath-girish/efficientgaussian +
+
+
+
+
+ + ♻ ☆ Low-Rank Interconnected Adaptation across Layers + + +
+ Low-rank adaptation (LoRA) is a powerful parameter-efficient fine-tuning +method that utilizes low-rank projectors $A$ and $B$ to learn weight updates +$\Delta W$ for adaptation targets $W$. Previous research has shown that LoRA is +essentially a gradient compressor, performing random projections on the +gradient using a fixed projection matrix $A_0$. However, this setup restricts +the overall weight update to be low-rank, which limits the adaptation +performance. In this paper, we propose low-rank interconnected adaptation +across layers (Lily). Specifically, we employ a hierarchical framework where +low-dimensional projectors (LPs) retained for downward projection at a +particular level, while globally-shared high-dimensional projector (HP) experts +perform upward projection across all levels of layers. Lily uniquely connects +each LP to all HP experts, therefore the gradient projections are no longer +dominated by fixed projection matrices, but rather by selective combinations of +all the projectors, thereby breaking the low-rank constraint of LoRA. +Furthermore, Lily's cross-layer connections facilitate the capture of intricate +information and dependencies across different layers, thereby enhancing the +model's representational capabilities. Experiments across various modalities, +architectures, and model sizes underscore Lily's great performance and +efficiency. Code is available on github https://github.com/yibozhong/lily. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds ICRA + + +
+ A Colored point cloud, as a simple and efficient 3D representation, has many +advantages in various fields, including robotic navigation and scene +reconstruction. This representation is now commonly used in 3D reconstruction +tasks relying on cameras and LiDARs. However, fusing data from these two types +of sensors is poorly performed in many existing frameworks, leading to +unsatisfactory mapping results, mainly due to inaccurate camera poses. This +paper presents OmniColor, a novel and efficient algorithm to colorize point +clouds using an independent 360-degree camera. Given a LiDAR-based point cloud +and a sequence of panorama images with initial coarse camera poses, our +objective is to jointly optimize the poses of all frames for mapping images +onto geometric reconstructions. Our pipeline works in an off-the-shelf manner +that does not require any feature extraction or matching process. Instead, we +find optimal poses by directly maximizing the photometric consistency of LiDAR +maps. In experiments, we show that our method can overcome the severe visual +distortion of omnidirectional images and greatly benefit from the wide field of +view (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy +and stability. The code will be released at +https://github.com/liubonan123/OmniColor/. + +
+
+ comment: 2024 IEEE International Conference on Robotics and Automation (ICRA) +
+
+
+
+
+ + ♻ ☆ SF-MMCN: Low-Power Sever Flow Multi-Mode Diffusion Model Accelerator + + +
+ Generative Artificial Intelligence (AI) has become incredibly popular in +recent years, and the significance of traditional accelerators in dealing with +large-scale parameters is urgent. With the diffusion model's parallel +structure, the hardware design challenge has skyrocketed because of the +multiple layers operating simultaneously. Convolution Neural Network (CNN) +accelerators have been designed and developed rapidly, especially for +high-speed inference. Often, CNN models with parallel structures are deployed. +In these CNN accelerators, many Processing Elements (PE) are required to +perform parallel computations, mainly the multiply and accumulation (MAC) +operation, resulting in high power consumption and a large silicon area. In +this work, a Server Flow Multi-Mode CNN Unit (SF-MMCN) is proposed to reduce +the number of PE while improving the operation efficiency of the CNN +accelerator. The pipelining technique is introduced into Server Flow to process +parallel computations. The proposed SF-MMCN is implemented with TSMC 90-nm CMOS +technology. It is evaluated with VGG-16, ResNet-18, and U-net. The evaluation +results show that the proposed SF-MMCN can reduce the power consumption by 92%, +and the silicon area by 70%, while improving the efficiency of operation by +nearly 81 times. A new FoM, area efficiency (GOPs/mm^2) is also introduced to +evaluate the performance of the accelerator in terms of the ratio throughput +(GOPs) and silicon area (mm^2). In this FoM, SF-MMCN improves area efficiency +by 18 times (18.42). + +
+
+ comment: 16 pages, 16 figures; extend the CNN to process Diffusion Model + (possible this is the first reported hardware Diffusion Model implementation) +
+
+
+
+
+ + ♻ ☆ 2D and 3D Deep Learning Models for MRI-based Parkinson's Disease + Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold + Networks, Convolutional Neural Networks, and Graph Convolutional Networks + + +
+ Parkinson's Disease (PD) diagnosis remains challenging. This study applies +Convolutional Kolmogorov-Arnold Networks (ConvKANs), integrating learnable +spline-based activation functions into convolutional layers, for PD +classification using structural MRI. The first 3D implementation of ConvKANs +for medical imaging is presented, comparing their performance to Convolutional +Neural Networks (CNNs) and Graph Convolutional Networks (GCNs) across three +open-source datasets. Isolated analyses assessed performance within individual +datasets, using cross-validation techniques. Holdout analyses evaluated +cross-dataset generalizability by training models on two datasets and testing +on the third, mirroring real-world clinical scenarios. In isolated analyses, 2D +ConvKANs achieved the highest AUC of 0.99 (95% CI: 0.98-0.99) on the PPMI +dataset, outperforming 2D CNNs (AUC: 0.97, p = 0.0092). 3D models showed +promise, with 3D CNN and 3D ConvKAN reaching an AUC of 0.85 on PPMI. In holdout +analyses, 3D ConvKAN demonstrated superior generalization, achieving an AUC of +0.85 on early-stage PD data. GCNs underperformed in 2D but improved in 3D +implementations. These findings highlight ConvKANs' potential for PD detection, +emphasize the importance of 3D analysis in capturing subtle brain changes, and +underscore cross-dataset generalization challenges. This study advances +AI-assisted PD diagnosis using structural MRI and emphasizes the need for +larger-scale validation. + +
+
+ comment: 7 figures +
+
+
+
+
+ + ♻ ☆ Diffusion-based Generative Image Outpainting for Recovery of + FOV-Truncated CT Images + + +
+ Field-of-view (FOV) recovery of truncated chest CT scans is crucial for +accurate body composition analysis, which involves quantifying skeletal muscle +and subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables +disease prognostication. Here, we present a method for recovering truncated CT +slices using generative image outpainting. We train a diffusion model and apply +it to truncated CT slices generated by simulating a small FOV. Our model +reliably recovers the truncated anatomy and outperforms the previous +state-of-the-art despite being trained on 87% less data. + +
+
+ comment: Shared last authorship: Florian J. Fintelmann and Philip M\"uller +
+
+
+
+
+ + ♻ ☆ Enhanced Unsupervised Image-to-Image Translation Using Contrastive + Learning and Histogram of Oriented Gradients + + +
+ Image-to-Image Translation is a vital area of computer vision that focuses on +transforming images from one visual domain to another while preserving their +core content and structure. However, this field faces two major challenges: +first, the data from the two domains are often unpaired, making it difficult to +train generative adversarial networks effectively; second, existing methods +tend to produce artifacts or hallucinations during image generation, leading to +a decline in image quality. To address these issues, this paper proposes an +enhanced unsupervised image-to-image translation method based on the +Contrastive Unpaired Translation (CUT) model, incorporating Histogram of +Oriented Gradients (HOG) features. This novel approach ensures the preservation +of the semantic structure of images, even without semantic labels, by +minimizing the loss between the HOG features of input and generated images. The +method was tested on translating synthetic game environments from GTA5 dataset +to realistic urban scenes in cityscapes dataset, demonstrating significant +improvements in reducing hallucinations and enhancing image quality. + +
+
+ comment: Critical Errors in Data or Analysis +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous + Driving ECCV 2024 + + +
+ The scale-up of autonomous vehicles depends heavily on their ability to deal +with anomalies, such as rare objects on the road. In order to handle such +situations, it is necessary to detect anomalies in the first place. Anomaly +detection for autonomous driving has made great progress in the past years but +suffers from poorly designed benchmarks with a strong focus on camera data. In +this work, we propose AnoVox, the largest benchmark for ANOmaly detection in +autonomous driving to date. AnoVox incorporates large-scale multimodal sensor +data and spatial VOXel ground truth, allowing for the comparison of methods +independent of their used sensor. We propose a formal definition of normality +and provide a compliant training dataset. AnoVox is the first benchmark to +contain both content and temporal anomalies. + +
+
+ comment: Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\"o{\ss}ler + contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop +
+
+
+
+
+ + ♻ ☆ Interpretable Vision-Language Survival Analysis with Ordinal Inductive + Bias for Computational Pathology + + +
+ Histopathology Whole-Slide Images (WSIs) provide an important tool to assess +cancer prognosis in computational pathology (CPATH). While existing survival +analysis (SA) approaches have made exciting progress, they are generally +limited to adopting highly-expressive architectures and only coarse-grained +patient-level labels to learn prognostic visual representations from gigapixel +WSIs. Such learning paradigm suffers from important performance bottlenecks, +when facing present scarce training data and standard multi-instance learning +(MIL) framework in CPATH. To overcome it, this paper, for the first time, +proposes a new Vision-Language-based SA (VLSA) paradigm. Concretely, (1) VLSA +is driven by pathology VL foundation models. It no longer relies on +high-capability networks and shows the advantage of data efficiency. (2) In +vision-end, VLSA encodes prognostic language prior and then employs it as +auxiliary signals to guide the aggregating of prognostic visual features at +instance level, thereby compensating for the weak supervision in MIL. Moreover, +given the characteristics of SA, we propose i) ordinal survival prompt learning +to transform continuous survival labels into textual prompts; and ii) ordinal +incidence function as prediction target to make SA compatible with VL-based +prediction. Notably, VLSA's predictions can be interpreted intuitively by our +Shapley values-based method. The extensive experiments on five datasets confirm +the effectiveness of our scheme. Our VLSA could pave a new way for SA in CPATH +by offering weakly-supervised MIL an effective means to learn valuable +prognostic clues from gigapixel WSIs. Our source code is available at +https://github.com/liupei101/VLSA. + +
+
+ comment: 24 pages, 11 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models + + +
+ Text-to-image diffusion models have demonstrated unprecedented capabilities +for flexible and realistic image synthesis. Nevertheless, these models rely on +a time-consuming sampling procedure, which has motivated attempts to reduce +their latency. When improving efficiency, researchers often use the original +diffusion model to train an additional network designed specifically for fast +image generation. In contrast, our approach seeks to reduce latency directly, +without any retraining, fine-tuning, or knowledge distillation. In particular, +we find the repeated calculation of attention maps to be costly yet redundant, +and instead suggest reusing them during sampling. Our specific reuse strategies +are based on ODE theory, which implies that the later a map is reused, the +smaller the distortion in the final image. We empirically compare these reuse +strategies with few-step sampling procedures of comparable latency, finding +that reuse generates images that are closer to those produced by the original +high-latency diffusion model. + +
+
+
+
+
+ + ♻ ☆ ICON: Improving Inter-Report Consistency in Radiology Report Generation + via Lesion-aware Mixup Augmentation + + +
+ Previous research on radiology report generation has made significant +progress in terms of increasing the clinical accuracy of generated reports. In +this paper, we emphasize another crucial quality that it should possess, i.e., +inter-report consistency, which refers to the capability of generating +consistent reports for semantically equivalent radiographs. This quality is +even of greater significance than the overall report accuracy in terms of +ensuring the system's credibility, as a system prone to providing conflicting +results would severely erode users' trust. Regrettably, existing approaches +struggle to maintain inter-report consistency, exhibiting biases towards common +patterns and susceptibility to lesion variants. To address this issue, we +propose ICON, which improves the inter-report consistency of radiology report +generation. Aiming to enhance the system's ability to capture similarities in +semantically equivalent lesions, our approach first involves extracting lesions +from input images and examining their characteristics. Then, we introduce a +lesion-aware mixup technique to ensure that the representations of the +semantically equivalent lesions align with the same attributes, achieved +through a linear combination during the training phase. Extensive experiments +on three publicly available chest X-ray datasets verify the effectiveness of +our approach, both in terms of improving the consistency and accuracy of the +generated reports. + +
+
+
+
+
+ + ♻ ☆ Direct Learning of Mesh and Appearance via 3D Gaussian Splatting + + +
+ Accurately reconstructing a 3D scene including explicit geometry information +is both attractive and challenging. Geometry reconstruction can benefit from +incorporating differentiable appearance models, such as Neural Radiance Fields +and 3D Gaussian Splatting (3DGS). However, existing methods encounter +efficiency issues due to indirect geometry learning and the paradigm of +separately modeling geometry and surface appearance. In this work, we propose a +learnable scene model that incorporates 3DGS with an explicit geometry +representation, namely a mesh. Our model learns the mesh and appearance in an +end-to-end manner, where we bind 3D Gaussians to the mesh faces and perform +differentiable rendering of 3DGS to obtain photometric supervision. The model +creates an effective information pathway to supervise the learning of both 3DGS +and mesh. Experimental results demonstrate that the learned scene model not +only achieves state-of-the-art efficiency and rendering quality but also +supports manipulation using the explicit mesh. In addition, our model has a +unique advantage in adapting to scene updates, thanks to the end-to-end +learning of both mesh and appearance. + +
+
+
+
+
+ + ♻ ☆ Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space + + +
+ Watermarking is a tool for actively identifying and attributing the images +generated by latent diffusion models. Existing methods face the dilemma of +image quality and watermark robustness. Watermarks with superior image quality +usually have inferior robustness against attacks such as blurring and JPEG +compression, while watermarks with superior robustness usually significantly +damage image quality. This dilemma stems from the traditional paradigm where +watermarks are injected and detected in pixel space, relying on pixel +perturbation for watermark detection and resilience against attacks. In this +paper, we highlight that an effective solution to the problem is to both inject +and detect watermarks in the latent diffusion space, and propose Latent +Watermark with a progressive training strategy. It weakens the direct +connection between quality and robustness and thus alleviates their +contradiction. We conduct evaluations on two datasets and against 10 watermark +attacks. Six metrics measure the image quality and watermark robustness. +Results show that compared to the recently proposed methods such as +StableSignature, StegaStamp, RoSteALS, LaWa, TreeRing, and DiffuseTrace, LW not +only surpasses them in terms of robustness but also offers superior image +quality. Our code will be available at +https://github.com/RichardSunnyMeng/LatentWatermark. + +
+
+
+
+
+ + ♻ ☆ Deep Self-Cleansing for Medical Image Segmentation with Noisy Labels + + +
+ Medical image segmentation is crucial in the field of medical imaging, aiding +in disease diagnosis and surgical planning. Most established segmentation +methods rely on supervised deep learning, in which clean and precise labels are +essential for supervision and significantly impact the performance of models. +However, manually delineated labels often contain noise, such as missing labels +and inaccurate boundary delineation, which can hinder networks from correctly +modeling target characteristics. In this paper, we propose a deep +self-cleansing segmentation framework that can preserve clean labels while +cleansing noisy ones in the training phase. To achieve this, we devise a +gaussian mixture model-based label filtering module that distinguishes noisy +labels from clean labels. Additionally, we develop a label cleansing module to +generate pseudo low-noise labels for identified noisy samples. The preserved +clean labels and pseudo-labels are then used jointly to supervise the network. +Validated on a clinical liver tumor dataset and a public cardiac diagnosis +dataset, our method can effectively suppress the interference from noisy labels +and achieve prominent segmentation performance. + +
+
+ comment: 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ MMCode: Benchmarking Multimodal Large Language Models for Code + Generation with Visually Rich Programming Problems EMNLP 2024 + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/likaixin2000/MMCode. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal + Transport + + +
+ Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images +sharing the same category across diverse domains without relying on labeled +data. Prior approaches have typically decomposed the UCIR problem into two +distinct tasks: intra-domain representation learning and cross-domain feature +alignment. However, these segregated strategies overlook the potential +synergies between these tasks. This paper introduces ProtoOT, a novel Optimal +Transport formulation explicitly tailored for UCIR, which integrates +intra-domain feature representation learning and cross-domain alignment into a +unified framework. ProtoOT leverages the strengths of the K-means clustering +method to effectively manage distribution imbalances inherent in UCIR. By +utilizing K-means for generating initial prototypes and approximating class +marginal distributions, we modify the constraints in Optimal Transport +accordingly, significantly enhancing its performance in UCIR scenarios. +Furthermore, we incorporate contrastive learning into the ProtoOT framework to +further improve representation learning. This encourages local semantic +consistency among features with similar semantics, while also explicitly +enforcing separation between features and unmatched prototypes, thereby +enhancing global discriminativeness. ProtoOT surpasses existing +state-of-the-art methods by a notable margin across benchmark datasets. +Notably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 18.17%, +and on Office-Home, it demonstrates a P@15 improvement of 3.83%. + +
+
+
+
+
+ + ♻ ☆ Fixed-length Dense Descriptor for Efficient Fingerprint Matching + + +
+ In fingerprint matching, fixed-length descriptors generally offer greater +efficiency compared to minutiae set, but the recognition accuracy is not as +good as that of the latter. Although much progress has been made in deep +learning based fixed-length descriptors recently, they often fall short when +dealing with incomplete or partial fingerprints, diverse fingerprint poses, and +significant background noise. In this paper, we propose a three-dimensional +representation called Fixed-length Dense Descriptor (FDD) for efficient +fingerprint matching. FDD features great spatial properties, enabling it to +capture the spatial relationships of the original fingerprints, thereby +enhancing interpretability and robustness. Our experiments on various +fingerprint datasets reveal that FDD outperforms other fixed-length +descriptors, especially in matching fingerprints of different areas, +cross-modal fingerprint matching, and fingerprint matching with background +noise. + +
+
+ comment: Accepted by WIFS 2024 +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems play a crucial role in achieving Sustainable +Development Goals by collecting and analyzing vital global data through +satellite networks. These systems are essential for tasks like mapping, +disaster monitoring, and resource management, but they face challenges in +processing and transmitting large volumes of EO data, especially in specialized +fields such as agriculture and real-time disaster response. Domain-adapted +Large Language Models (LLMs) provide a promising solution by facilitating data +fusion between extensive EO data and semantic EO data. By improving integration +and interpretation of diverse datasets, LLMs address the challenges of +processing specialized information in agriculture and disaster response +applications. This fusion enhances the accuracy and relevance of transmitted +data. This paper presents a framework for semantic communication in EO +satellite networks, aimed at improving data transmission efficiency and overall +system performance through cognitive processing techniques. The proposed system +employs Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic +Data Augmentation (SA) to focus on relevant information while minimizing +communication overhead. By integrating cognitive semantic processing and +inter-satellite links, the framework enhances the analysis and transmission of +multispectral satellite imagery, improving object detection, pattern +recognition, and real-time decision-making. The introduction of Cognitive +Semantic Augmentation (CSA) allows satellites to process and transmit semantic +information, boosting adaptability to changing environments and application +needs. This end-to-end architecture is tailored for next-generation satellite +networks, such as those supporting 6G, and demonstrates significant +improvements in efficiency and accuracy. + +
+
+ comment: 18 pages, 10 figures, magazine +
+
+
+
+
+ + ♻ ☆ EAGLE: Towards Efficient Arbitrary Referring Visual Prompts + Comprehension for Multimodal Large Language Models + + +
+ Recently, Multimodal Large Language Models (MLLMs) have sparked great +research interests owing to their exceptional content-reasoning and +instruction-following capabilities. To effectively instruct an MLLM, in +addition to conventional language expressions, the practice of referring to +objects by painting with brushes on images has emerged as a prevalent tool +(referred to as "referring visual prompts") due to its efficacy in aligning the +user's intention with specific image regions. To accommodate the most common +referring visual prompts, namely points, boxes, and masks, existing approaches +initially utilize specialized feature encoding modules to capture the semantics +of the highlighted areas indicated by these prompts. Subsequently, these +encoded region features are adapted to MLLMs through fine-tuning on a +meticulously curated multimodal instruction dataset. However, such designs +suffer from redundancy in architecture. Moreover, they face challenges in +effectively generalizing when encountering a diverse range of arbitrary +referring visual prompts in real-life scenarios. To address the above issues, +we propose EAGLE, a novel MLLM that empowers comprehension of arbitrary +referring visual prompts with less training efforts than existing approaches. +Specifically, our EAGLE maintains the innate format of the referring visual +prompts as colored patches rendered on the given image for conducting the +instruction tuning. Our approach embeds referring visual prompts as spatial +concepts conveying specific spatial areas comprehensible to the MLLM, with the +semantic comprehension of these regions originating from the MLLM itself. +Besides, we also propose a Geometry-Agnostic Learning paradigm (GAL) to further +disentangle the MLLM's region-level comprehension with the specific formats of +referring visual prompts. Extensive experiments are conducted to prove the +effectiveness of our proposed method. + +
+
+
+
+
+ + ♻ ☆ GenWarp: Single Image to Novel Views with Semantic-Preserving Generative + Warping NeurIPS 2024 + + +
+ Generating novel views from a single image remains a challenging task due to +the complexity of 3D scenes and the limited diversity in the existing +multi-view datasets to train a model on. Recent research combining large-scale +text-to-image (T2I) models with monocular depth estimation (MDE) has shown +promise in handling in-the-wild images. In these methods, an input view is +geometrically warped to novel views with estimated depth maps, then the warped +image is inpainted by T2I models. However, they struggle with noisy depth maps +and loss of semantic details when warping an input view to novel viewpoints. In +this paper, we propose a novel approach for single-shot novel view synthesis, a +semantic-preserving generative warping framework that enables T2I generative +models to learn where to warp and where to generate, through augmenting +cross-view attention with self-attention. Our approach addresses the +limitations of existing methods by conditioning the generative model on source +view images and incorporating geometric warping signals. Qualitative and +quantitative evaluations demonstrate that our model outperforms existing +methods in both in-domain and out-of-domain scenarios. Project page is +available at https://GenWarp-NVS.github.io/. + +
+
+ comment: Accepted to NeurIPS 2024 / Project page: + https://GenWarp-NVS.github.io +
+
+
+
+
+ + ♻ ☆ EPTQ: Enhanced Post-Training Quantization via Hessian-guided + Network-wise Optimization + + +
+ Quantization is a key method for deploying deep neural networks on edge +devices with limited memory and computation resources. Recent improvements in +Post-Training Quantization (PTQ) methods were achieved by an additional local +optimization process for learning the weight quantization rounding policy. +However, a gap exists when employing network-wise optimization with small +representative datasets. In this paper, we propose a new method for enhanced +PTQ (EPTQ) that employs a network-wise quantization optimization process, which +benefits from considering cross-layer dependencies during optimization. EPTQ +enables network-wise optimization with a small representative dataset using a +novel sample-layer attention score based on a label-free Hessian matrix upper +bound. The label-free approach makes our method suitable for the PTQ scheme. We +give a theoretical analysis for the said bound and use it to construct a +knowledge distillation loss that guides the optimization to focus on the more +sensitive layers and samples. In addition, we leverage the Hessian upper bound +to improve the weight quantization parameters selection by focusing on the more +sensitive elements in the weight tensors. Empirically, by employing EPTQ we +achieve state-of-the-art results on various models, tasks, and datasets, +including ImageNet classification, COCO object detection, and Pascal-VOC for +semantic segmentation. + +
+
+
+
+
+ + ♻ ☆ Masks and Boxes: Combining the Best of Both Worlds for Multi-Object + Tracking + + +
+ Multi-object tracking (MOT) involves identifying and consistently tracking +objects across video sequences. Traditional tracking-by-detection methods, +while effective, often require extensive tuning and lack generalizability. On +the other hand, segmentation mask-based methods are more generic but struggle +with tracking management, making them unsuitable for MOT. We propose a novel +approach, McByte, which incorporates a temporally propagated segmentation mask +as a strong association cue within a tracking-by-detection framework. By +combining bounding box and mask information, McByte enhances robustness and +generalizability without per-sequence tuning. Evaluated on four benchmark +datasets - DanceTrack, MOT17, SoccerNet-tracking 2022, and KITTI-tracking - +McByte demonstrates performance gain in all cases examined. At the same time, +it outperforms existing mask-based methods. Implementation code will be +provided upon acceptance. + +
+
+
+
+
+ + ♻ ☆ HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images + Using Deep Learning + + +
+ The current standard for detecting human epidermal growth factor receptor 2 +(HER2) status in breast cancer patients relies on HER2 amplification, +identified through fluorescence in situ hybridization (FISH) or +immunohistochemistry (IHC). However, hematoxylin and eosin (H\&E) tumor stains +are more widely available, and accurately predicting HER2 status using H\&E +could reduce costs and expedite treatment selection. Deep Learning algorithms +for H&E have shown effectiveness in predicting various cancer features and +clinical outcomes, including moderate success in HER2 status prediction. In +this work, we employed a customized weak supervision classification technique +combined with MoCo-v2 contrastive learning to predict HER2 status. We trained +our pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The +Cancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale +School of Medicine are publicly available. Our pipeline achieved an Area Under +the Curve (AUC) of 0.85 across four different test folds. Additionally, we +tested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2 +score of 2+ and included corresponding HER2 status and FISH test results. These +cases are considered equivocal for IHC, requiring an expensive FISH test on +their IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81 +on these challenging H&E slides. Reducing the need for FISH test can have +significant implications in cancer treatment equity for underserved +populations. + +
+
+
+
+
+ + ♻ ☆ FruitNeRF: A Unified Neural Radiance Field based Fruit Counting + Framework + + +
+ We introduce FruitNeRF, a unified novel fruit counting framework that +leverages state-of-the-art view synthesis methods to count any fruit type +directly in 3D. Our framework takes an unordered set of posed images captured +by a monocular camera and segments fruit in each image. To make our system +independent of the fruit type, we employ a foundation model that generates +binary segmentation masks for any fruit. Utilizing both modalities, RGB and +semantic, we train a semantic neural radiance field. Through uniform volume +sampling of the implicit Fruit Field, we obtain fruit-only point clouds. By +applying cascaded clustering on the extracted point cloud, our approach +achieves precise fruit count.The use of neural radiance fields provides +significant advantages over conventional methods such as object tracking or +optical flow, as the counting itself is lifted into 3D. Our method prevents +double counting fruit and avoids counting irrelevant fruit.We evaluate our +methodology using both real-world and synthetic datasets. The real-world +dataset consists of three apple trees with manually counted ground truths, a +benchmark apple dataset with one row and ground truth fruit location, while the +synthetic dataset comprises various fruit types including apple, plum, lemon, +pear, peach, and mango.Additionally, we assess the performance of fruit +counting using the foundation model compared to a U-Net. + +
+
+ comment: Project Page: https://meyerls.github.io/fruit_nerf/ +
+
+
+
+
+ + ♻ ☆ Improving Fast Adversarial Training Paradigm: An Example Taxonomy + Perspective + + +
+ While adversarial training is an effective defense method against adversarial +attacks, it notably increases the training cost. To this end, fast adversarial +training (FAT) is presented for efficient training and has become a hot +research topic. However, FAT suffers from catastrophic overfitting, which leads +to a performance drop compared with multi-step adversarial training. However, +the cause of catastrophic overfitting remains unclear and lacks exploration. In +this paper, we present an example taxonomy in FAT, which identifies that +catastrophic overfitting is caused by the imbalance between the inner and outer +optimization in FAT. Furthermore, we investigated the impact of varying degrees +of training loss, revealing a correlation between training loss and +catastrophic overfitting. Based on these observations, we redesign the loss +function in FAT with the proposed dynamic label relaxation to concentrate the +loss range and reduce the impact of misclassified examples. Meanwhile, we +introduce batch momentum initialization to enhance the diversity to prevent +catastrophic overfitting in an efficient manner. Furthermore, we also propose +Catastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate +training strategy for examples based on their loss degree. Our proposed method, +named example taxonomy aware FAT (ETA), establishes an improved paradigm for +FAT. Experiment results demonstrate our ETA achieves state-of-the-art +performance. Comprehensive experiments on four standard datasets demonstrate +the competitiveness of our proposed method. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Efficient Video Object Segmentation via Modulated Cross-Attention Memory WACV 2025 + + +
+ Recently, transformer-based approaches have shown promising results for +semi-supervised video object segmentation. However, these approaches typically +struggle on long videos due to increased GPU memory demands, as they frequently +expand the memory bank every few frames. We propose a transformer-based +approach, named MAVOS, that introduces an optimized and dynamic long-term +modulated cross-attention (MCA) memory to model temporal smoothness without +requiring frequent memory expansion. The proposed MCA effectively encodes both +local and global features at various levels of granularity while efficiently +maintaining consistent speed regardless of the video length. Extensive +experiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017, +demonstrate the effectiveness of our proposed contributions leading to +real-time inference and markedly reduced memory demands without any degradation +in segmentation accuracy on long videos. Compared to the best existing +transformer-based approach, our MAVOS increases the speed by 7.6x, while +significantly reducing the GPU memory by 87% with comparable segmentation +performance on short and long video datasets. Notably on the LVOS dataset, our +MAVOS achieves a J&F score of 63.3% while operating at 37 frames per second +(FPS) on a single V100 GPU. Our code and models will be publicly available at: +https://github.com/Amshaker/MAVOS. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor + Reconstruction + + +
+ Neural implicit reconstruction via volume rendering has demonstrated its +effectiveness in recovering dense 3D surfaces. However, it is non-trivial to +simultaneously recover meticulous geometry and preserve smoothness across +regions with differing characteristics. To address this issue, previous methods +typically employ geometric priors, which are often constrained by the +performance of the prior models. In this paper, we propose ND-SDF, which learns +a Normal Deflection field to represent the angular deviation between the scene +normal and the prior normal. Unlike previous methods that uniformly apply +geometric priors on all samples, introducing significant bias in accuracy, our +proposed normal deflection field dynamically learns and adapts the utilization +of samples based on their specific characteristics, thereby improving both the +accuracy and effectiveness of the model. Our method not only obtains smooth +weakly textured regions such as walls and floors but also preserves the +geometric details of complex structures. In addition, we introduce a novel ray +sampling strategy based on the deflection angle to facilitate the unbiased +rendering process, which significantly improves the quality and accuracy of +intricate surfaces, especially on thin structures. Consistent improvements on +various challenging datasets demonstrate the superiority of our method. + +
+
+
+
+
+ + ♻ ☆ VideoPatchCore: An Effective Method to Memorize Normality for Video + Anomaly Detection ACCV 2024 + + +
+ Video anomaly detection (VAD) is a crucial task in video analysis and +surveillance within computer vision. Currently, VAD is gaining attention with +memory techniques that store the features of normal frames. The stored features +are utilized for frame reconstruction, identifying an abnormality when a +significant difference exists between the reconstructed and input frames. +However, this approach faces several challenges due to the simultaneous +optimization required for both the memory and encoder-decoder model. These +challenges include increased optimization difficulty, complexity of +implementation, and performance variability depending on the memory size. To +address these challenges,we propose an effective memory method for VAD, called +VideoPatchCore. Inspired by PatchCore, our approach introduces a structure that +prioritizes memory optimization and configures three types of memory tailored +to the characteristics of video data. This method effectively addresses the +limitations of existing memory-based methods, achieving good performance +comparable to state-of-the-art methods. Furthermore, our method requires no +training and is straightforward to implement, making VAD tasks more accessible. +Our code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore. + +
+
+ comment: Accepted to ACCV 2024 +
+
+
+
+
+ + ♻ ☆ AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising NeurIPS 2024 + + +
+ Diffusion models have garnered significant interest from the community for +their great generative ability across various applications. However, their +typical multi-step sequential-denoising nature gives rise to high cumulative +latency, thereby precluding the possibilities of parallel computation. To +address this, we introduce AsyncDiff, a universal and plug-and-play +acceleration scheme that enables model parallelism across multiple devices. Our +approach divides the cumbersome noise prediction model into multiple +components, assigning each to a different device. To break the dependency chain +between these components, it transforms the conventional sequential denoising +into an asynchronous process by exploiting the high similarity between hidden +states in consecutive diffusion steps. Consequently, each component is +facilitated to compute in parallel on separate devices. The proposed strategy +significantly reduces inference latency while minimally impacting the +generative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff +achieves a 2.7x speedup with negligible degradation and a 4.0x speedup with +only a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our +experiments also demonstrate that AsyncDiff can be readily applied to video +diffusion models with encouraging performances. The code is available at +https://github.com/czg1225/AsyncDiff. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SlimSAM: 0.1% Data Makes Segment Anything Slim NeurIPS 2024 + + +
+ Current approaches for compressing the Segment Anything Model (SAM) yield +commendable results, yet necessitate extensive data to train a new network from +scratch. Employing conventional pruning techniques can remarkably reduce data +requirements but would suffer from a degradation in performance. To address +this challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM +compression method that achieves superior performance with extremely less +training data. The essence of SlimSAM is encapsulated in the alternate slimming +framework which effectively enhances knowledge inheritance under severely +limited training data availability and exceptional pruning ratio. Diverging +from prior techniques, our framework progressively compresses the model by +alternately pruning and distilling distinct, decoupled sub-structures. +Disturbed Taylor pruning is also proposed to address the misalignment between +the pruning objective and training target, thereby boosting the +post-distillation after pruning. SlimSAM yields significant performance +improvements while demanding over 10 times less training data than any other +existing compression methods. Even when compared to the original SAM, SlimSAM +achieves approaching performance while reducing parameter counts to merely 1.4% +(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training +data. The code is available at http://github.com/czg1225/SlimSAM. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Text-Guided Single Image Editing for Remote Sensing Images + + +
+ Artificial intelligence generative content (AIGC) has significantly impacted +image generation in the field of remote sensing. However, the equally important +area of remote sensing image (RSI) editing has not received sufficient +attention. Deep learning based editing methods generally involve two sequential +stages: generation and editing. During the generation stage, consistency in +content and details between the original and edited images must be maintained, +while in the editing stage, controllability and accuracy of the edits should be +ensured. For natural images, these challenges can be tackled by training +generative backbones on large-scale benchmark datasets and using text guidance +based on vision-language models (VLMs). However, these previously effective +approaches become less viable for RSIs due to two reasons: First, existing +generative RSI benchmark datasets do not fully capture the diversity of remote +sensing scenarios, particularly in terms of variations in sensors, object +types, and resolutions. Consequently, the generalization capacity of the +trained backbone model is often inadequate for universal editing tasks on RSIs. +Second, the large spatial resolution of RSIs exacerbates the problem in VLMs +where a single text semantic corresponds to multiple image semantics, leading +to the introduction of incorrect semantics when using text to guide RSI +editing. To solve above problems, this paper proposes a text-guided RSI editing +method that is controllable but stable, and can be trained using only a single +image. It adopts a multi-scale training approach to preserve consistency +without the need for training on extensive benchmark datasets, while leveraging +RSI pre-trained VLMs and prompt ensembling (PE) to ensure accuracy and +controllability in the text-guided editing process. + +
+
+ comment: 14 pages, 14 figures, submitted to IEEE Transactions on Geoscience + and Remote Sensing +
+
+
+
+
+ + ♻ ☆ Learning 3D-Aware GANs from Unposed Images with Template Feature Field + + +
+ Collecting accurate camera poses of training images has been shown to well +serve the learning of 3D-aware generative adversarial networks (GANs) yet can +be quite expensive in practice. This work targets learning 3D-aware GANs from +unposed images, for which we propose to perform on-the-fly pose estimation of +training images with a learned template feature field (TeFF). Concretely, in +addition to a generative radiance field as in previous approaches, we ask the +generator to also learn a field from 2D semantic features while sharing the +density from the radiance field. Such a framework allows us to acquire a +canonical 3D feature template leveraging the dataset mean discovered by the +generative model, and further efficiently estimate the pose parameters on real +data. Experimental results on various challenging datasets demonstrate the +superiority of our approach over state-of-the-art alternatives from both the +qualitative and the quantitative perspectives. + +
+
+ comment: https://XDimlab.github.io/TeFF +
+
+
+
+
+ + ♻ ☆ Regional quality estimation for echocardiography using deep learning + + +
+ Automatic estimation of cardiac ultrasound image quality can be beneficial +for guiding operators and ensuring the accuracy of clinical measurements. +Previous work often fails to distinguish the view correctness of the +echocardiogram from the image quality. Additionally, previous studies only +provide a global image quality value, which limits their practical utility. In +this work, we developed and compared three methods to estimate image quality: +1) classic pixel-based metrics like the generalized contrast-to-noise ratio +(gCNR) on myocardial segments as region of interest and left ventricle lumen as +background, obtained using a U-Net segmentation 2) local image coherence +derived from a U-Net model that predicts coherence from B-Mode images 3) a deep +convolutional network that predicts the quality of each region directly in an +end-to-end fashion. We evaluate each method against manual regional image +quality annotations by three experienced cardiologists. The results indicate +poor performance of the gCNR metric, with Spearman correlation to the +annotations of rho = 0.24. The end-to-end learning model obtains the best +result, rho = 0.69, comparable to the inter-observer correlation, rho = 0.63. +Finally, the coherence-based method, with rho = 0.58, outperformed the +classical metrics and is more generic than the end-to-end approach. The image +quality prediction tool is available as an open source Python library at +https://github.com/GillesVanDeVyver/arqee. + +
+
+
+
+
+ + ♻ ☆ High-throughput 3D shape completion of potato tubers on a harvester + + +
+ Potato yield is an important metric for farmers to further optimize their +cultivation practices. Potato yield can be estimated on a harvester using an +RGB-D camera that can estimate the three-dimensional (3D) volume of individual +potato tubers. A challenge, however, is that the 3D shape derived from RGB-D +images is only partially completed, underestimating the actual volume. To +address this issue, we developed a 3D shape completion network, called CoRe++, +which can complete the 3D shape from RGB-D images. CoRe++ is a deep learning +network that consists of a convolutional encoder and a decoder. The encoder +compresses RGB-D images into latent vectors that are used by the decoder to +complete the 3D shape using the deep signed distance field network (DeepSDF). +To evaluate our CoRe++ network, we collected partial and complete 3D point +clouds of 339 potato tubers on an operational harvester in Japan. On the 1425 +RGB-D images in the test set (representing 51 unique potato tubers), our +network achieved a completion accuracy of 2.8 mm on average. For volumetric +estimation, the root mean squared error (RMSE) was 22.6 ml, and this was better +than the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml). +We found that the RMSE can be further reduced to 18.2 ml when performing the 3D +shape completion in the center of the RGB-D image. With an average 3D shape +completion time of 10 milliseconds per tuber, we can conclude that CoRe++ is +both fast and accurate enough to be implemented on an operational harvester for +high-throughput potato yield estimation. Our method can also be applied to +other tuber, fruit and vegetable crops, thereby enabling versatile, accurate +and real-time yield monitoring in precision agriculture. Our code, network +weights and dataset are publicly available at +https://github.com/UTokyo-FieldPhenomics-Lab/corepp.git. + +
+
+ comment: 20 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization + of Diffusion Models + + +
+ Diffusion models have achieved great success in image generation tasks +through iterative noise estimation. However, the heavy denoising process and +complex neural networks hinder their low-latency applications in real-world +scenarios. Quantization can effectively reduce model complexity, and +post-training quantization (PTQ), which does not require fine-tuning, is highly +promising for compressing and accelerating diffusion models. Unfortunately, we +find that due to the highly dynamic distribution of activations in different +denoising steps, existing PTQ methods for diffusion models suffer from +distribution mismatch issues at both calibration sample level and +reconstruction output level, which makes the performance far from satisfactory, +especially in low-bit cases. In this paper, we propose Enhanced Distribution +Alignment for Post-Training Quantization of Diffusion Models (EDA-DM) to +address the above issues. Specifically, at the calibration sample level, we +select calibration samples based on the density and variety in the latent +space, thus facilitating the alignment of their distribution with the overall +samples; and at the reconstruction output level, we modify the loss of block +reconstruction with the losses of layers, aligning the outputs of quantized +model and full-precision model at different network granularity. Extensive +experiments demonstrate that EDA-DM significantly outperforms the existing PTQ +methods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and +different datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO). + +
+
+ comment: Code: http://github.com/BienLuky/EDA-DM +
+
+
+
+
+ + ♻ ☆ Fast ODE-based Sampling for Diffusion Models in Around 5 Steps CVPR 2024 + + +
+ Sampling from diffusion models can be treated as solving the corresponding +ordinary differential equations (ODEs), with the aim of obtaining an accurate +solution with as few number of function evaluations (NFE) as possible. +Recently, various fast samplers utilizing higher-order ODE solvers have emerged +and achieved better performance than the initial first-order one. However, +these numerical methods inherently result in certain approximation errors, +which significantly degrades sample quality with extremely small NFE (e.g., +around 5). In contrast, based on the geometric observation that each sampling +trajectory almost lies in a two-dimensional subspace embedded in the ambient +space, we propose Approximate MEan-Direction Solver (AMED-Solver) that +eliminates truncation errors by directly learning the mean direction for fast +diffusion sampling. Besides, our method can be easily used as a plugin to +further improve existing ODE-based samplers. Extensive experiments on image +synthesis with the resolution ranging from 32 to 512 demonstrate the +effectiveness of our method. With only 5 NFE, we achieve 6.61 FID on CIFAR-10, +10.74 FID on ImageNet 64$\times$64, and 13.20 FID on LSUN Bedroom. Our code is +available at https://github.com/zju-pi/diff-sampler. + +
+
+ comment: Accepted by CVPR 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Decision Support System to triage of liver trauma + + +
+ Trauma significantly impacts global health, accounting for over 5 million +deaths annually, which is comparable to mortality rates from diseases such as +tuberculosis, AIDS, and malaria. In Iran, the financial repercussions of road +traffic accidents represent approximately 2% of the nation's Gross National +Product each year. Bleeding is the leading cause of mortality in trauma +patients within the first 24 hours following an injury, making rapid diagnosis +and assessment of severity crucial. Trauma patients require comprehensive scans +of all organs, generating a large volume of data. Evaluating CT images for the +entire body is time-consuming and requires significant expertise, underscoring +the need for efficient time management in diagnosis. Efficient diagnostic +processes can significantly reduce treatment costs and decrease the likelihood +of secondary complications. In this context, the development of a reliable +Decision Support System (DSS) for trauma triage, particularly focused on the +abdominal area, is vital. This paper presents a novel method for detecting +liver bleeding and lacerations using CT scans, utilising the GAN Pix2Pix +translation model. The effectiveness of the method is quantified by Dice score +metrics, with the model achieving an accuracy of 97% for liver bleeding and 93% +for liver laceration detection. These results represent a notable improvement +over current state-of-the-art technologies. The system's design integrates +seamlessly with existing medical imaging technologies, making it a practical +addition to emergency medical services. This research underscores the potential +of advanced image translation models like GAN Pix2Pix in improving the +precision and speed of medical diagnostics in critical care scenarios. + +
+
+
+
+
+ + ♻ ☆ Improvements to SDXL in NovelAI Diffusion V3 + + +
+ In this technical report, we document the changes we made to SDXL in the +process of training NovelAI Diffusion V3, our state of the art anime image +generation model. + +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ DeNetDM: Debiasing by Network Depth Modulation NeurIPS 2024 + + +
+ When neural networks are trained on biased datasets, they tend to +inadvertently learn spurious correlations, leading to challenges in achieving +strong generalization and robustness. Current approaches to address such biases +typically involve utilizing bias annotations, reweighting based on pseudo-bias +labels, or enhancing diversity within bias-conflicting data points through +augmentation techniques. We introduce DeNetDM, a novel debiasing method based +on the observation that shallow neural networks prioritize learning core +attributes, while deeper ones emphasize biases when tasked with acquiring +distinct information. Using a training paradigm derived from Product of +Experts, we create both biased and debiased branches with deep and shallow +architectures and then distill knowledge to produce the target debiased model. +Extensive experiments and analyses demonstrate that our approach outperforms +current debiasing techniques, achieving a notable improvement of around 5% in +three datasets, encompassing both synthetic and real-world data. Remarkably, +DeNetDM accomplishes this without requiring annotations pertaining to bias +labels or bias types, while still delivering performance on par with supervised +counterparts. Furthermore, our approach effectively harnesses the diversity of +bias-conflicting points within the data, surpassing previous methods and +obviating the need for explicit augmentation-based methods to enhance the +diversity of such bias-conflicting points. The source code will be available +upon acceptance. + +
+
+ comment: Accepted to NeurIPS 2024, * indicates these authors contributed + equally +
+
+
+
+
+ + ♻ ☆ A Distributed Privacy Preserving Model for the Detection of Alzheimer's + Disease + + +
+ In the era of rapidly advancing medical technologies, the segmentation of +medical data has become inevitable, necessitating the development of privacy +preserving machine learning algorithms that can train on distributed data. +Consolidating sensitive medical data is not always an option particularly due +to the stringent privacy regulations imposed by the Health Insurance +Portability and Accountability Act (HIPAA). In this paper, I introduce a HIPAA +compliant framework that can train from distributed data. I then propose a +multimodal vertical federated model for Alzheimer's Disease (AD) detection, a +serious neurodegenerative condition that can cause dementia, severely impairing +brain function and hindering simple tasks, especially without preventative +care. This vertical federated learning (VFL) model offers a distributed +architecture that enables collaborative learning across diverse sources of +medical data while respecting privacy constraints imposed by HIPAA. The VFL +architecture proposed herein offers a novel distributed architecture, enabling +collaborative learning across diverse sources of medical data while respecting +statutory privacy constraints. By leveraging multiple modalities of data, the +robustness and accuracy of AD detection can be enhanced. This model not only +contributes to the advancement of federated learning techniques but also holds +promise for overcoming the hurdles posed by data segmentation in medical +research. + +
+
+ comment: 15 pages, 7 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Boundless: Generating Photorealistic Synthetic Data for Object Detection + in Urban Streetscapes + + +
+ We introduce Boundless, a photo-realistic synthetic data generation system +for enabling highly accurate object detection in dense urban streetscapes. +Boundless can replace massive real-world data collection and manual +ground-truth object annotation (labeling) with an automated and configurable +process. Boundless is based on the Unreal Engine 5 (UE5) City Sample project +with improvements enabling accurate collection of 3D bounding boxes across +different lighting and scene variability conditions. + We evaluate the performance of object detection models trained on the dataset +generated by Boundless when used for inference on a real-world dataset acquired +from medium-altitude cameras. We compare the performance of the +Boundless-trained model against the CARLA-trained model and observe an +improvement of 7.8 mAP. The results we achieved support the premise that +synthetic data generation is a credible methodology for training/fine-tuning +scalable object detection models for urban scenes. + +
+
+
+
+
+ + ♻ ☆ RGB2Point: 3D Point Cloud Generation from Single RGB Images + + +
+ We introduce RGB2Point, an unposed single-view RGB image to a 3D point cloud +generation based on Transformer. RGB2Point takes an input image of an object +and generates a dense 3D point cloud. Contrary to prior works based on CNN +layers and diffusion denoising approaches, we use pre-trained Transformer +layers that are fast and generate high-quality point clouds with consistent +quality over available categories. Our generated point clouds demonstrate high +quality on a real-world dataset, as evidenced by improved Chamfer distance +(51.15%) and Earth Mover's distance (45.96%) metrics compared to the current +state-of-the-art. Additionally, our approach shows a better quality on a +synthetic dataset, achieving better Chamfer distance (39.26%), Earth Mover's +distance (26.95%), and F-score (47.16%). Moreover, our method produces 63.1% +more consistent high-quality results across various object categories compared +to prior works. Furthermore, RGB2Point is computationally efficient, requiring +only 2.3GB of VRAM to reconstruct a 3D point cloud from a single RGB image, and +our implementation generates the results 15,133x faster than a SOTA +diffusion-based model. + +
+
+
+
+
+ + ♻ ☆ PTQ4DiT: Post-training Quantization for Diffusion Transformers NeurIPS 2024 + + +
+ The recent introduction of Diffusion Transformers (DiTs) has demonstrated +exceptional capabilities in image generation by using a different backbone +architecture, departing from traditional U-Nets and embracing the scalable +nature of transformers. Despite their advanced capabilities, the wide +deployment of DiTs, particularly for real-time applications, is currently +hampered by considerable computational demands at the inference stage. +Post-training Quantization (PTQ) has emerged as a fast and data-efficient +solution that can significantly reduce computation and memory footprint by +using low-bit weights and activations. However, its applicability to DiTs has +not yet been explored and faces non-trivial difficulties due to the unique +design of DiTs. In this paper, we propose PTQ4DiT, a specifically designed PTQ +method for DiTs. We discover two primary quantization challenges inherent in +DiTs, notably the presence of salient channels with extreme magnitudes and the +temporal variability in distributions of salient activation over multiple +timesteps. To tackle these challenges, we propose Channel-wise Salience +Balancing (CSB) and Spearmen's $\rho$-guided Salience Calibration (SSC). CSB +leverages the complementarity property of channel magnitudes to redistribute +the extremes, alleviating quantization errors for both activations and weights. +SSC extends this approach by dynamically adjusting the balanced salience to +capture the temporal variations in activation. Additionally, to eliminate extra +computational costs caused by PTQ4DiT during inference, we design an offline +re-parameterization strategy for DiTs. Experiments demonstrate that our PTQ4DiT +successfully quantizes DiTs to 8-bit precision (W8A8) while preserving +comparable generation ability and further enables effective quantization to +4-bit weight precision (W4A8) for the first time. + +
+
+ comment: NeurIPS 2024. Code is available at + https://github.com/adreamwu/PTQ4DiT +
+
+
+
+
+ + ♻ ☆ Splat-MOVER: Multi-Stage, Open-Vocabulary Robotic Manipulation via + Editable Gaussian Splatting + + +
+ We present Splat-MOVER, a modular robotics stack for open-vocabulary robotic +manipulation, which leverages the editability of Gaussian Splatting (GSplat) +scene representations to enable multi-stage manipulation tasks. Splat-MOVER +consists of: (i) ASK-Splat, a GSplat representation that distills semantic and +grasp affordance features into the 3D scene. ASK-Splat enables geometric, +semantic, and affordance understanding of 3D scenes, which is critical in many +robotics tasks; (ii) SEE-Splat, a real-time scene-editing module using 3D +semantic masking and infilling to visualize the motions of objects that result +from robot interactions in the real-world. SEE-Splat creates a "digital twin" +of the evolving environment throughout the manipulation task; and (iii) +Grasp-Splat, a grasp generation module that uses ASK-Splat and SEE-Splat to +propose affordance-aligned candidate grasps for open-world objects. ASK-Splat +is trained in real-time from RGB images in a brief scanning phase prior to +operation, while SEE-Splat and Grasp-Splat run in real-time during operation. +We demonstrate the superior performance of Splat-MOVER in hardware experiments +on a Kinova robot compared to two recent baselines in four single-stage, +open-vocabulary manipulation tasks and in four multi-stage manipulation tasks, +using the edited scene to reflect changes due to prior manipulation stages, +which is not possible with existing baselines. Video demonstrations and the +code for the project are available at https://splatmover.github.io. + +
+
+ comment: https://splatmover.github.io +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Open-World Evaluation for Retrieving Diverse Perspectives + + +
+ We study retrieving a set of documents that covers various perspectives on a +complex and contentious question (e.g., will ChatGPT do more harm than good?). +We curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS), +where each example consists of a question and diverse perspectives associated +with the question, sourced from survey questions and debate websites. On this +data, retrievers paired with a corpus are evaluated to surface a document set +that contains diverse perspectives. Our framing diverges from most retrieval +tasks in that document relevancy cannot be decided by simple string matches to +references. Instead, we build a language model based automatic evaluator that +decides whether each retrieved document contains a perspective. This allows us +to evaluate the performance of three different types of corpus (Wikipedia, web +snapshot, and corpus constructed on the fly with retrieved pages from the +search engine) paired with retrievers. Retrieving diverse documents remains +challenging, with the outputs from existing retrievers covering all +perspectives on only 33.74% of the examples. We further study the impact of +query expansion and diversity-focused reranking approaches and analyze +retriever sycophancy. Together, we lay the foundation for future studies in +retrieval diversity handling complex queries. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ Report on the Workshop on Simulations for Information Access (Sim4IA + 2024) at SIGIR 2024 + + +
+ This paper is a report of the Workshop on Simulations for Information Access +(Sim4IA) workshop at SIGIR 2024. The workshop had two keynotes, a panel +discussion, nine lightning talks, and two breakout sessions. Key takeaways were +user simulation's importance in academia and industry, the possible bridging of +online and offline evaluation, and the issues of organizing a companion shared +task around user simulations for information access. We report on how we +organized the workshop, provide a brief overview of what happened at the +workshop, and summarize the main topics and findings of the workshop and future +work. + +
+
+ comment: Preprint of a SIGIR Forum submission for Vol. 58 No. 2 - December + 2024 +
+
+
+
+
+ + ☆ Enhancing Tourism Recommender Systems for Sustainable City Trips Using + Retrieval-Augmented Generation + + +
+ Tourism Recommender Systems (TRS) have traditionally focused on providing +personalized travel suggestions, often prioritizing user preferences without +considering broader sustainability goals. Integrating sustainability into TRS +has become essential with the increasing need to balance environmental impact, +local community interests, and visitor satisfaction. This paper proposes a +novel approach to enhancing TRS for sustainable city trips using Large Language +Models (LLMs) and a modified Retrieval-Augmented Generation (RAG) pipeline. We +enhance the traditional RAG system by incorporating a sustainability metric +based on a city's popularity and seasonal demand during the prompt augmentation +phase. This modification, called Sustainability Augmented Reranking (SAR), +ensures the system's recommendations align with sustainability goals. +Evaluations using popular open-source LLMs, such as Llama-3.1-Instruct-8B and +Mistral-Instruct-7B, demonstrate that the SAR-enhanced approach consistently +matches or outperforms the baseline (without SAR) across most metrics, +highlighting the benefits of incorporating sustainability into TRS. + +
+
+ comment: Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM + Conference on Recommender Systems (RecSys 2024) +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Value Identification in Multistakeholder Recommender Systems for + Humanities and Historical Research: The Case of the Digital Archive + Monasterium.net + + +
+ Recommender systems remain underutilized in humanities and historical +research, despite their potential to enhance the discovery of cultural records. +This paper offers an initial value identification of the multiple stakeholders +that might be impacted by recommendations in Monasterium.net, a digital archive +for historical legal documents. Specifically, we discuss the diverse values and +objectives of its stakeholders, such as editors, aggregators, platform owners, +researchers, publishers, and funding agencies. These in-depth insights into the +potentially conflicting values of stakeholder groups allow designing and +adapting recommender systems to enhance their usefulness for humanities and +historical research. Additionally, our findings will support deeper engagement +with additional stakeholders to refine value models and evaluation metrics for +recommender systems in the given domains. Our conclusions are embedded in and +applicable to other digital archives and a broader cultural heritage context. + +
+
+ comment: To be presented at: NORMalize 2024: The Second Workshop on the + Normative Design and Evaluation of Recommender Systems, October 18, 2024, + co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024), + Bari, Italy +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Autoregressive Generation Strategies for Top-K Sequential + Recommendations + + +
+ The goal of modern sequential recommender systems is often formulated in +terms of next-item prediction. In this paper, we explore the applicability of +generative transformer-based models for the Top-K sequential recommendation +task, where the goal is to predict items a user is likely to interact with in +the "near future". + We explore commonly used autoregressive generation strategies, including +greedy decoding, beam search, and temperature sampling, to evaluate their +performance for the Top-K sequential recommendation task. In addition, we +propose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA) +generation strategies based on multi-sequence generation with temperature +sampling and subsequent aggregation. + Experiments on diverse datasets give valuable insights regarding commonly +used strategies' applicability and show that suggested approaches improve +performance on longer time horizons compared to widely-used Top-K prediction +approach and single-sequence autoregressive generation strategies. + +
+
+
+
+
+ + ☆ Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation + + +
+ News recommendation is a challenging task that involves personalization based +on the interaction history and preferences of each user. Recent works have +leveraged the power of pretrained language models (PLMs) to directly rank news +items by using inference approaches that predominately fall into three +categories: pointwise, pairwise, and listwise learning-to-rank. While pointwise +methods offer linear inference complexity, they fail to capture crucial +comparative information between items that is more effective for ranking tasks. +Conversely, pairwise and listwise approaches excel at incorporating these +comparisons but suffer from practical limitations: pairwise approaches are +either computationally expensive or lack theoretical guarantees, and listwise +methods often perform poorly in practice. In this paper, we propose a novel +framework for PLM-based news recommendation that integrates both pointwise +relevance prediction and pairwise comparisons in a scalable manner. We present +a rigorous theoretical analysis of our framework, establishing conditions under +which our approach guarantees improved performance. Extensive experiments show +that our approach outperforms the state-of-the-art methods on the MIND and +Adressa news recommendation datasets. + +
+
+
+
+
+ + ☆ Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case + Study + + +
+ Extracting meaningful insights from large and complex datasets poses +significant challenges, particularly in ensuring the accuracy and relevance of +retrieved information. Traditional data retrieval methods such as sequential +search and index-based retrieval often fail when handling intricate and +interconnected data structures, resulting in incomplete or misleading outputs. +To overcome these limitations, we introduce Structured-GraphRAG, a versatile +framework designed to enhance information retrieval across structured datasets +in natural language queries. Structured-GraphRAG utilizes multiple knowledge +graphs, which represent data in a structured format and capture complex +relationships between entities, enabling a more nuanced and comprehensive +retrieval of information. This graph-based approach reduces the risk of errors +in language model outputs by grounding responses in a structured format, +thereby enhancing the reliability of results. We demonstrate the effectiveness +of Structured-GraphRAG by comparing its performance with that of a recently +published method using traditional retrieval-augmented generation. Our findings +show that Structured-GraphRAG significantly improves query processing +efficiency and reduces response times. While our case study focuses on soccer +data, the framework's design is broadly applicable, offering a powerful tool +for data analysis and enhancing language model applications across various +structured domains. + +
+
+
+
+
+ + ☆ Improving the Shortest Plank: Vulnerability-Aware Adversarial Training + for Robust Recommender System + + +
+ Recommender systems play a pivotal role in mitigating information overload in +various fields. Nonetheless, the inherent openness of these systems introduces +vulnerabilities, allowing attackers to insert fake users into the system's +training data to skew the exposure of certain items, known as poisoning +attacks. Adversarial training has emerged as a notable defense mechanism +against such poisoning attacks within recommender systems. Existing adversarial +training methods apply perturbations of the same magnitude across all users to +enhance system robustness against attacks. Yet, in reality, we find that +attacks often affect only a subset of users who are vulnerable. These +perturbations of indiscriminate magnitude make it difficult to balance +effective protection for vulnerable users without degrading recommendation +quality for those who are not affected. To address this issue, our research +delves into understanding user vulnerability. Considering that poisoning +attacks pollute the training data, we note that the higher degree to which a +recommender system fits users' training data correlates with an increased +likelihood of users incorporating attack information, indicating their +vulnerability. Leveraging these insights, we introduce the Vulnerability-aware +Adversarial Training (VAT), designed to defend against poisoning attacks in +recommender systems. VAT employs a novel vulnerability-aware function to +estimate users' vulnerability based on the degree to which the system fits +them. Guided by this estimation, VAT applies perturbations of adaptive +magnitude to each user, not only reducing the success ratio of attacks but also +preserving, and potentially enhancing, the quality of recommendations. +Comprehensive experiments confirm VAT's superior defensive capabilities across +different recommendation models and against various types of attacks. + +
+
+
+
+
+ + ☆ Towards More Relevant Product Search Ranking Via Large Language Models: + An Empirical Study + + +
+ Training Learning-to-Rank models for e-commerce product search ranking can be +challenging due to the lack of a gold standard of ranking relevance. In this +paper, we decompose ranking relevance into content-based and engagement-based +aspects, and we propose to leverage Large Language Models (LLMs) for both label +and feature generation in model training, primarily aiming to improve the +model's predictive capability for content-based relevance. Additionally, we +introduce different sigmoid transformations on the LLM outputs to polarize +relevance scores in labeling, enhancing the model's ability to balance +content-based and engagement-based relevances and thus prioritize highly +relevant items overall. Comprehensive online tests and offline evaluations are +also conducted for the proposed design. Our work sheds light on advanced +strategies for integrating LLMs into e-commerce product search ranking model +training, offering a pathway to more effective and balanced models with +improved ranking relevance. + +
+
+ comment: To be published in CIKM 2024 GenAIECommerce Workshop +
+
+
+
+
+ + ☆ Long or Short or Both? An Exploration on Lookback Time Windows of + Behavioral Features in Product Search Ranking + + +
+ Customer shopping behavioral features are core to product search ranking +models in eCommerce. In this paper, we investigate the effect of lookback time +windows when aggregating these features at the (query, product) level over +history. By studying the pros and cons of using long and short time windows, we +propose a novel approach to integrating these historical behavioral features of +different time windows. In particular, we address the criticality of using +query-level vertical signals in ranking models to effectively aggregate all +information from different behavioral features. Anecdotal evidence for the +proposed approach is also provided using live product search traffic on +Walmart.com. + +
+
+ comment: Published in ACM SIGIR Workshop on eCommerce 2024 +
+
+
+
+
+ + ☆ Minimizing Live Experiments in Recommender Systems: User Simulation to + Evaluate Preference Elicitation Policies + + +
+ Evaluation of policies in recommender systems typically involves A/B testing +using live experiments on real users to assess a new policy's impact on +relevant metrics. This ``gold standard'' comes at a high cost, however, in +terms of cycle time, user cost, and potential user retention. In developing +policies for ``onboarding'' new users, these costs can be especially +problematic, since on-boarding occurs only once. In this work, we describe a +simulation methodology used to augment (and reduce) the use of live +experiments. We illustrate its deployment for the evaluation of ``preference +elicitation'' algorithms used to onboard new users of the YouTube Music +platform. By developing counterfactually robust user behavior models, and a +simulation service that couples such models with production infrastructure, we +are able to test new algorithms in a way that reliably predicts their +performance on key metrics when deployed live. We describe our domain, our +simulation models and platform, results of experiments and deployment, and +suggest future steps needed to further realistic simulation as a powerful +complement to live experiments. + +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing sequences of interactions between users and items, sequential +recommendation models can learn user intent and make predictions about the next +item. Next to item interactions, most systems also have interactions with what +we call non-item pages: these pages are not related to specific items but still +can provide insights of the user's interests, as, for example, navigation +pages. + We therefore propose a general way to include these non-item pages in +sequential recommendation models to enhance next-item prediction. First, we +demonstrate the influence of non-item pages on following interactions with the +hypotheses testing framework HypTrails and propose methods for representing +non-item pages in sequential recommendation models. Subsequently, we adapt +popular sequential recommender models to integrate non-item pages and +investigate their performance with different item representation strategies as +well as their ability to handle noisy data. To show the general capabilities of +the models to integrate non-item pages, we create a synthetic dataset for a +controlled setting and then evaluate the improvements from including non-item +pages on two real-world datasets. + Our results show that non-item pages are a valuable source of information, +and incorporating them in sequential recommendation models increases the +performance of next-item prediction across all analyzed model architectures. + +
+
+ comment: 37 pages, 19 figures; Submitted to ACM TORS +
+
+
+
+
+ + ♻ ☆ CHIQ: Contextual History Enhancement for Improving Query Rewriting in + Conversational Search EMNLP 2024 + + +
+ In this paper, we study how open-source large language models (LLMs) can be +effectively deployed for improving query rewriting in conversational search, +especially for ambiguous queries. We introduce CHIQ, a two-step method that +leverages the capabilities of LLMs to resolve ambiguities in the conversation +history before query rewriting. This approach contrasts with prior studies that +predominantly use closed-source LLMs to directly generate search queries from +conversation history. We demonstrate on five well-established benchmarks that +CHIQ leads to state-of-the-art results across most settings, showing highly +competitive performances with systems leveraging closed-source LLMs. Our study +provides a first step towards leveraging open-source LLMs in conversational +search, as a competitive alternative to the prevailing reliance on commercial +LLMs. Data, models, and source code will be publicly available upon acceptance +at https://github.com/fengranMark/CHIQ. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through + Semantic Comprehension in Retrieval-Augmented Generation Scenarios + + +
+ In Retrieval-Augmented Generation (RAG) tasks using Large Language Models +(LLMs), the quality of retrieved information is critical to the final output. +This paper introduces the IRSC benchmark for evaluating the performance of +embedding models in multilingual RAG tasks. The benchmark encompasses five +retrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval, +keyword retrieval, and summary retrieval. Our research addresses the current +lack of comprehensive testing and effective comparison methods for embedding +models in RAG scenarios. We introduced new metrics: the Similarity of Semantic +Comprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI), +and evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our +contributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and +3) insights into the cross-lingual limitations of embedding models. The IRSC +benchmark aims to enhance the understanding and development of accurate +retrieval systems in RAG tasks. All code and datasets are available at: +https://github.com/Jasaxion/IRSC_Benchmark + +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Multi-Domain CTR Prediction via Large Language + Models + + +
+ Click-Through Rate (CTR) prediction is a crucial task in online +recommendation platforms as it involves estimating the probability of user +engagement with advertisements or items by clicking on them. Given the +availability of various services like online shopping, ride-sharing, food +delivery, and professional services on commercial platforms, recommendation +systems in these platforms are required to make CTR predictions across multiple +domains rather than just a single domain. However, multi-domain click-through +rate (MDCTR) prediction remains a challenging task in online recommendation due +to the complex mutual influence between domains. Traditional MDCTR models +typically encode domains as discrete identifiers, ignoring rich semantic +information underlying. Consequently, they can hardly generalize to new +domains. Besides, existing models can be easily dominated by some specific +domains, which results in significant performance drops in the other domains +(i.e. the "seesaw phenomenon"). In this paper, we propose a novel solution +Uni-CTR to address the above challenges. Uni-CTR leverages a backbone Large +Language Model (LLM) to learn layer-wise semantic representations that capture +commonalities between domains. Uni-CTR also uses several domain-specific +networks to capture the characteristics of each domain. Note that we design a +masked loss strategy so that these domain-specific networks are decoupled from +backbone LLM. This allows domain-specific networks to remain unchanged when +incorporating new or removing domains, thereby enhancing the flexibility and +scalability of the system significantly. Experimental results on three public +datasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models +significantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in +zero-shot prediction. We have applied Uni-CTR in industrial scenarios, +confirming its efficiency. + +
+
+ comment: Accept By ACM TRANSACTIONS ON INFORMATION SYSTEMS(TOIS) +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ MALPOLON: A Framework for Deep Species Distribution Modeling + + +
+ This paper describes a deep-SDM framework, MALPOLON. Written in Python and +built upon the PyTorch library, this framework aims to facilitate training and +inferences of deep species distribution models (deep-SDM) and sharing for users +with only general Python language skills (e.g., modeling ecologists) who are +interested in testing deep learning approaches to build new SDMs. More advanced +users can also benefit from the framework's modularity to run more specific +experiments by overriding existing classes while taking advantage of +press-button examples to train neural networks on multiple classification tasks +using custom or provided raw and pre-processed datasets. The framework is +open-sourced on GitHub and PyPi along with extensive documentation and examples +of use in various scenarios. MALPOLON offers straightforward installation, +YAML-based configuration, parallel computing, multi-GPU utilization, baseline +and foundational models for benchmarking, and extensive +tutorials/documentation, aiming to enhance accessibility and performance +scalability for ecologists and researchers. + +
+
+
+
+
+ + ☆ Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine + Segmentation MICCAI 2024 + + +
+ Self-supervised pretraining (SSP) has shown promising results in learning +from large unlabeled datasets and, thus, could be useful for automated +cardiovascular magnetic resonance (CMR) short-axis cine segmentation. However, +inconsistent reports of the benefits of SSP for segmentation have made it +difficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP +methods for CMR cine segmentation. + To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were +used for unlabeled pretraining with four SSP methods; SimCLR, positional +contrastive learning, DINO, and masked image modeling (MIM). Subsets of varying +numbers of subjects were used for supervised fine-tuning of 2D models for each +SSP method, as well as to train a 2D baseline model from scratch. The +fine-tuned models were compared to the baseline using the 3D Dice similarity +coefficient (DSC) in a test dataset of 140 subjects. + The SSP methods showed no performance gains with the largest supervised +fine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects +(231 2D slices) are available for supervised training, SSP using MIM (DSC = +0.86) improves over training from scratch (DSC = 0.82). + This study found that SSP is valuable for CMR cine segmentation when labeled +training data is scarce, but does not aid state-of-the-art deep learning +methods when ample labeled data is available. Moreover, the choice of SSP +method is important. The code is publicly available at: +https://github.com/q-cardIA/ssp-cmr-cine-segmentation + +
+
+ comment: Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at + MICCAI 2024 +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ Optimal Protocols for Continual Learning via Statistical Physics and + Control Theory + + +
+ Artificial neural networks often struggle with catastrophic forgetting when +learning multiple tasks sequentially, as training on new tasks degrades the +performance on previously learned ones. Recent theoretical work has addressed +this issue by analysing learning curves in synthetic frameworks under +predefined training protocols. However, these protocols relied on heuristics +and lacked a solid theoretical foundation assessing their optimality. In this +paper, we fill this gap combining exact equations for training dynamics, +derived using statistical physics techniques, with optimal control methods. We +apply this approach to teacher-student models for continual learning and +multi-task problems, obtaining a theory for task-selection protocols maximising +performance while minimising forgetting. Our theoretical analysis offers +non-trivial yet interpretable strategies for mitigating catastrophic +forgetting, shedding light on how optimal learning protocols can modulate +established effects, such as the influence of task similarity on forgetting. +Finally, we validate our theoretical findings on real-world data. + +
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ☆ Inverse Reinforcement Learning with Multiple Planning Horizons + + +
+ In this work, we study an inverse reinforcement learning (IRL) problem where +the experts are planning under a shared reward function but with different, +unknown planning horizons. Without the knowledge of discount factors, the +reward function has a larger feasible solution set, which makes it harder for +existing IRL approaches to identify a reward function. To overcome this +challenge, we develop algorithms that can learn a global multi-agent reward +function with agent-specific discount factors that reconstruct the expert +policies. We characterize the feasible solution space of the reward function +and discount factors for both algorithms and demonstrate the generalizability +of the learned reward function across multiple domains. + +
+
+ comment: Accepted at RLC 2024 +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ FlowBench: A Large Scale Benchmark for Flow Simulation over Complex + Geometries + + +
+ Simulating fluid flow around arbitrary shapes is key to solving various +engineering problems. However, simulating flow physics across complex +geometries remains numerically challenging and computationally +resource-intensive, particularly when using conventional PDE solvers. Machine +learning methods offer attractive opportunities to create fast and adaptable +PDE solvers. However, benchmark datasets to measure the performance of such +methods are scarce, especially for flow physics across complex geometries. We +introduce FlowBench, a dataset for neural simulators with over 10K samples, +which is currently larger than any publicly available flow physics dataset. +FlowBench contains flow simulation data across complex geometries +(\textit{parametric vs. non-parametric}), spanning a range of flow conditions +(\textit{Reynolds number and Grashoff number}), capturing a diverse array of +flow phenomena (\textit{steady vs. transient; forced vs. free convection}), and +for both 2D and 3D. FlowBench contains over 10K data samples, with each sample +the outcome of a fully resolved, direct numerical simulation using a +well-validated simulator framework designed for modeling transport phenomena in +complex geometries. For each sample, we include velocity, pressure, and +temperature field data at 3 different resolutions and several summary +statistics features of engineering relevance (such as coefficients of lift and +drag, and Nusselt numbers). %Additionally, we include masks and signed distance +fields for each shape. We envision that FlowBench will enable evaluating the +interplay between complex geometry, coupled flow phenomena, and data +sufficiency on the performance of current, and future, neural PDE solvers. We +enumerate several evaluation metrics to help rank order the performance of +neural PDE solvers. We benchmark the performance of several baseline methods +including FNO, CNO, WNO, and DeepONet. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ Spatiotemporal Learning on Cell-embedded Graphs + + +
+ Data-driven simulation of physical systems has recently kindled significant +attention, where many neural models have been developed. In particular, +mesh-based graph neural networks (GNNs) have demonstrated significant potential +in predicting spatiotemporal dynamics across arbitrary geometric domains. +However, the existing node-edge message passing mechanism in GNNs limits the +model's representation learning ability. In this paper, we proposed a +cell-embedded GNN model (aka CeGNN) to learn spatiotemporal dynamics with +lifted performance. Specifically, we introduce a learnable cell attribution to +the node-edge message passing process, which better captures the spatial +dependency of regional features. Such a strategy essentially upgrades the local +aggregation scheme from the first order (e.g., from edge to node) to a higher +order (e.g., from volume to edge and then to node), which takes advantage of +volumetric information in message passing. Meanwhile, a novel feature-enhanced +block is designed to further improve the performance of CeGNN and relieve the +over-smoothness problem, via treating the latent features as basis functions. +The extensive experiments on various PDE systems and one real-world dataset +demonstrate that CeGNN achieves superior performance compared with other +baseline models, particularly reducing the prediction error with up to 1 orders +of magnitude on several PDE systems. + +
+
+
+
+
+ + ☆ Safe Time-Varying Optimization based on Gaussian Processes with + Spatio-Temporal Kernel NeurIPS 2024 + + +
+ Ensuring safety is a key aspect in sequential decision making problems, such +as robotics or process control. The complexity of the underlying systems often +makes finding the optimal decision challenging, especially when the +safety-critical system is time-varying. Overcoming the problem of optimizing an +unknown time-varying reward subject to unknown time-varying safety constraints, +we propose TVSafeOpt, a new algorithm built on Bayesian optimization with a +spatio-temporal kernel. The algorithm is capable of safely tracking a +time-varying safe region without the need for explicit change detection. +Optimality guarantees are also provided for the algorithm when the optimization +problem becomes stationary. We show that TVSafeOpt compares favorably against +SafeOpt on synthetic data, both regarding safety and optimality. Evaluation on +a realistic case study with gas compressors confirms that TVSafeOpt ensures +safety when solving time-varying optimization problems with unknown reward and +safety functions. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless + Imaging NeurIPS 2024 + + +
+ Lensless cameras offer significant advantages in size, weight, and cost +compared to traditional lens-based systems. Without a focusing lens, lensless +cameras rely on computational algorithms to recover the scenes from multiplexed +measurements. However, current algorithms struggle with inaccurate forward +imaging models and insufficient priors to reconstruct high-quality images. To +overcome these limitations, we introduce a novel two-stage approach for +consistent and photorealistic lensless image reconstruction. The first stage of +our approach ensures data consistency by focusing on accurately reconstructing +the low-frequency content with a spatially varying deconvolution method that +adjusts to changes in the Point Spread Function (PSF) across the camera's field +of view. The second stage enhances photorealism by incorporating a generative +prior from pre-trained diffusion models. By conditioning on the low-frequency +content retrieved in the first stage, the diffusion model effectively +reconstructs the high-frequency details that are typically lost in the lensless +imaging process, while also maintaining image fidelity. Our method achieves a +superior balance between data fidelity and visual quality compared to existing +methods, as demonstrated with two popular lensless systems, PhlatCam and +DiffuserCam. Project website: https://phocolens.github.io/. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Joint Localization and Planning using Diffusion ICRA 2025 + + +
+ Diffusion models have been successfully applied to robotics problems such as +manipulation and vehicle path planning. In this work, we explore their +application to end-to-end navigation -- including both perception and planning +-- by considering the problem of jointly performing global localization and +path planning in known but arbitrary 2D environments. In particular, we +introduce a diffusion model which produces collision-free paths in a global +reference frame given an egocentric LIDAR scan, an arbitrary map, and a desired +goal position. To this end, we implement diffusion in the space of paths in +SE(2), and describe how to condition the denoising process on both obstacles +and sensor observations. In our evaluation, we show that the proposed +conditioning techniques enable generalization to realistic maps of considerably +different appearance than the training environment, demonstrate our model's +ability to accurately describe ambiguous solutions, and run extensive +simulation experiments showcasing our model's use as a real-time, end-to-end +localization and planning stack. + +
+
+ comment: 7 pages, 9 figures. Submitted to ICRA 2025, under review +
+
+
+
+
+ + ☆ LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged + Robots + + +
+ Reinforcement Learning (RL) has shown its remarkable and generalizable +capability in legged locomotion through sim-to-real transfer. However, while +adaptive methods like domain randomization are expected to make policy more +robust to diverse environments, such comprehensiveness potentially detracts +from the policy's performance in any specific environment according to the No +Free Lunch theorem, leading to a suboptimal solution once deployed in the real +world. To address this issue, we propose a lifelong policy adaptation framework +named LoopSR, which utilizes a transformer-based encoder to project real-world +trajectories into a latent space, and accordingly reconstruct the real-world +environments back in simulation for further improvement. Autoencoder +architecture and contrastive learning methods are adopted to better extract the +characteristics of real-world dynamics. The simulation parameters for continual +training are derived by combining predicted parameters from the decoder with +retrieved parameters from the simulation trajectory dataset. By leveraging the +continual training, LoopSR achieves superior data efficiency compared with +strong baselines, with only a limited amount of data to yield eminent +performance in both sim-to-sim and sim-to-real experiments. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Dimension-independent learning rates for high-dimensional classification + problems + + +
+ We study the problem of approximating and estimating classification functions +that have their decision boundary in the $RBV^2$ space. Functions of $RBV^2$ +type arise naturally as solutions of regularized neural network learning +problems and neural networks can approximate these functions without the curse +of dimensionality. We modify existing results to show that every $RBV^2$ +function can be approximated by a neural network with bounded weights. +Thereafter, we prove the existence of a neural network with bounded weights +approximating a classification function. And we leverage these bounds to +quantify the estimation rates. Finally, we present a numerical study that +analyzes the effect of different regularity conditions on the decision +boundaries. + +
+
+
+
+
+ + ☆ Supra-Laplacian Encoding for Transformer on Dynamic Graphs + + +
+ Fully connected Graph Transformers (GT) have rapidly become prominent in the +static graph community as an alternative to Message-Passing models, which +suffer from a lack of expressivity, oversquashing, and under-reaching. However, +in a dynamic context, by interconnecting all nodes at multiple snapshots with +self-attention, GT loose both structural and temporal information. In this +work, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs +(SLATE), a new spatio-temporal encoding to leverage the GT architecture while +keeping spatio-temporal information. Specifically, we transform Discrete Time +Dynamic Graphs into multi-layer graphs and take advantage of the spectral +properties of their associated supra-Laplacian matrix. Our second contribution +explicitly model nodes' pairwise relationships with a cross-attention +mechanism, providing an accurate edge representation for dynamic link +prediction. SLATE outperforms numerous state-of-the-art methods based on +Message-Passing Graph Neural Networks combined with recurrent models (e.g +LSTM), and Dynamic Graph Transformers, on 9 datasets. Code and instructions to +reproduce our results will be open-sourced. + +
+
+
+
+
+ + ☆ Hypergame Theory for Decentralized Resource Allocation in Multi-user + Semantic Communications + + +
+ Semantic communications (SC) is an emerging communication paradigm in which +wireless devices can send only relevant information from a source of data while +relying on computing resources to regenerate missing data points. However, the +design of a multi-user SC system becomes more challenging because of the +computing and communication overhead required for coordination. Existing +solutions for learning the semantic language and performing resource allocation +often fail to capture the computing and communication tradeoffs involved in +multiuser SC. To address this gap, a novel framework for decentralized +computing and communication resource allocation in multiuser SC systems is +proposed. The challenge of efficiently allocating communication and computing +resources (for reasoning) in a decentralized manner to maximize the quality of +task experience for the end users is addressed through the application of +Stackelberg hyper game theory. Leveraging the concept of second-level hyper +games, novel analytical formulations are developed to model misperceptions of +the users about each other's communication and control strategies. Further, +equilibrium analysis of the learned resource allocation protocols examines the +convergence of the computing and communication strategies to a local +Stackelberg equilibria, considering misperceptions. Simulation results show +that the proposed Stackelberg hyper game results in efficient usage of +communication and computing resources while maintaining a high quality of +experience for the users compared to state-of-the-art that does not account for +the misperceptions. + +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and + Adaptive Disambiguate based Efficient Tree Search + + +
+ Large Language Models (LLMs) have exhibited exceptional performance across a +broad range of tasks and domains. However, they still encounter difficulties in +solving mathematical problems due to the rigorous and logical nature of +mathematics. Previous studies have employed techniques such as supervised +fine-tuning (SFT), prompt engineering, and search-based methods to improve the +mathematical problem-solving abilities of LLMs. Despite these efforts, their +performance remains suboptimal and demands substantial computational resources. +To address this issue, we propose a novel approach, BEATS, to enhance +mathematical problem-solving abilities. Our method leverages newly designed +prompts that guide the model to iteratively rewrite, advance by one step, and +generate answers based on previous steps. Additionally, we introduce a new +back-verification technique that uses LLMs to validate the correctness of the +generated answers. Furthermore, we employ a pruning tree search to optimize +search time while achieving strong performance. Notably, our method improves +Qwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the +MATH benchmark. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Adaptive Stream Processing on Edge Devices through Active Inference + + +
+ The current scenario of IoT is witnessing a constant increase on the volume +of data, which is generated in constant stream, calling for novel architectural +and logical solutions for processing it. Moving the data handling towards the +edge of the computing spectrum guarantees better distribution of load and, in +principle, lower latency and better privacy. However, managing such a structure +is complex, especially when requirements, also referred to Service Level +Objectives (SLOs), specified by applications' owners and infrastructure +managers need to be ensured. Despite the rich number of proposals of Machine +Learning (ML) based management solutions, researchers and practitioners yet +struggle to guarantee long-term prediction and control, and accurate +troubleshooting. Therefore, we present a novel ML paradigm based on Active +Inference (AIF) -- a concept from neuroscience that describes how the brain +constantly predicts and evaluates sensory information to decrease long-term +surprise. We implement it and evaluate it in a heterogeneous real stream +processing use case, where an AIF-based agent continuously optimizes the +fulfillment of three SLOs for three autonomous driving services running on +multiple devices. The agent used causal knowledge to gradually develop an +understanding of how its actions are related to requirements fulfillment, and +which configurations to favor. Through this approach, our agent requires up to +thirty iterations to converge to the optimal solution, showing the capability +of offering accurate results in a short amount of time. Furthermore, thanks to +AIF and its causal structures, our method guarantees full transparency on the +decision making, making the interpretation of the results and the +troubleshooting effortless. + +
+
+
+
+
+ + ☆ Sample compression unleashed : New generalization bounds for real valued + losses + + +
+ The sample compression theory provides generalization guarantees for +predictors that can be fully defined using a subset of the training dataset and +a (short) message string, generally defined as a binary sequence. Previous +works provided generalization bounds for the zero-one loss, which is +restrictive, notably when applied to deep learning approaches. In this paper, +we present a general framework for deriving new sample compression bounds that +hold for real-valued losses. We empirically demonstrate the tightness of the +bounds and their versatility by evaluating them on different types of models, +e.g., neural networks and decision forests, trained with the Pick-To-Learn +(P2L) meta-algorithm, which transforms the training method of any +machine-learning predictor to yield sample-compressed predictors. In contrast +to existing P2L bounds, ours are valid in the non-consistent case. + +
+
+
+
+
+ + ☆ Intelligent Energy Management: Remaining Useful Life Prediction and + Charging Automation System Comprised of Deep Learning and the Internet of + Things + + +
+ Remaining Useful Life (RUL) of battery is an important parameter to know the +battery's remaining life and need for recharge. The goal of this research +project is to develop machine learning-based models for the battery RUL +dataset. Different ML models are developed to classify the RUL of the vehicle, +and the IoT (Internet of Things) concept is simulated for automating the +charging system and managing any faults aligning. The graphs plotted depict the +relationship between various vehicle parameters using the Blynk IoT platform. +Results show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent +Unit (GRU), and hybrid model developed could classify RUL into three classes +with 99% more accuracy. The data is fed using the tkinter GUI for simulating +artificial intelligence (AI)-based charging, and with a pyserial backend, data +can be entered into the Esp-32 microcontroller for making charge discharge +possible with the model's predictions. Also, with an IoT system, the charging +can be disconnected, monitored, and analyzed for automation. The results show +that an accuracy of 99% can be obtained on models MLP, catboost model and +similar accuracy on GRU model can be obtained, and finally relay-based +triggering can be made by prediction through the model used for automating the +charging and energy-saving mechanism. By showcasing an exemplary Blynk +platform-based monitoring and automation phenomenon, we further present +innovative ways of monitoring parameters and automating the system. + +
+
+
+
+
+ + ☆ Graph Reasoning with Large Language Models via Pseudo-code Prompting + + +
+ Large language models (LLMs) have recently achieved remarkable success in +various reasoning tasks in the field of natural language processing. This +success of LLMs has also motivated their use in graph-related tasks. Among +others, recent work has explored whether LLMs can solve graph problems such as +counting the number of connected components of a graph or computing the +shortest path distance between two nodes. Although LLMs possess preliminary +graph reasoning abilities, they might still struggle to solve some seemingly +simple problems. In this paper, we investigate whether prompting via +pseudo-code instructions can improve the performance of LLMs in solving graph +problems. Our experiments demonstrate that using pseudo-code instructions +generally improves the performance of all considered LLMs. The graphs, +pseudo-code prompts, and evaluation code are publicly available. + +
+
+
+
+
+ + ☆ Designing Short-Stage CDC-XPUFs: Balancing Reliability, Cost, and + Security in IoT Devices + + +
+ The rapid expansion of Internet of Things (IoT) devices demands robust and +resource-efficient security solutions. Physically Unclonable Functions (PUFs), +which generate unique cryptographic keys from inherent hardware variations, +offer a promising approach. However, traditional PUFs like Arbiter PUFs (APUFs) +and XOR Arbiter PUFs (XOR-PUFs) are susceptible to machine learning (ML) and +reliability-based attacks. In this study, we investigate +Component-Differentially Challenged XOR-PUFs (CDC-XPUFs), a less explored +variant, to address these vulnerabilities. We propose an optimized CDC-XPUF +design that incorporates a pre-selection strategy to enhance reliability and +introduces a novel lightweight architecture to reduce hardware overhead. +Rigorous testing demonstrates that our design significantly lowers resource +consumption, maintains strong resistance to ML attacks, and improves +reliability, effectively mitigating reliability-based attacks. These results +highlight the potential of CDC-XPUFs as a secure and efficient candidate for +widespread deployment in resource-constrained IoT systems. + +
+
+
+
+
+ + ☆ Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV + Attitude Control Under Varying Wind Conditions + + +
+ This paper evaluates and compares the performance of model-free and +model-based reinforcement learning for the attitude control of fixed-wing +unmanned aerial vehicles using PID as a reference point. The comparison focuses +on their ability to handle varying flight dynamics and wind disturbances in a +simulated environment. Our results show that the Temporal Difference Model +Predictive Control agent outperforms both the PID controller and other +model-free reinforcement learning methods in terms of tracking accuracy and +robustness over different reference difficulties, particularly in nonlinear +flight regimes. Furthermore, we introduce actuation fluctuation as a key metric +to assess energy efficiency and actuator wear, and we test two different +approaches from the literature: action variation penalty and conditioning for +action policy smoothness. We also evaluate all control methods when subject to +stochastic turbulence and gusts separately, so as to measure their effects on +tracking performance, observe their limitations and outline their implications +on the Markov decision process formalism. + +
+
+ comment: Published at ICINCO 2024 +
+
+
+
+
+ + ☆ A multi-source data power load forecasting method using attention + mechanism-based parallel cnn-gru + + +
+ Accurate power load forecasting is crucial for improving energy efficiency +and ensuring power supply quality. Considering the power load forecasting +problem involves not only dynamic factors like historical load variations but +also static factors such as climate conditions that remain constant over +specific periods. From the model-agnostic perspective, this paper proposes a +parallel structure network to extract important information from both dynamic +and static data. Firstly, based on complexity learning theory, it is +demonstrated that models integrated through parallel structures exhibit +superior generalization abilities compared to individual base learners. +Additionally, the higher the independence between base learners, the stronger +the generalization ability of the parallel structure model. This suggests that +the structure of machine learning models inherently contains significant +information. Building on this theoretical foundation, a parallel convolutional +neural network (CNN)-gate recurrent unit (GRU) attention model (PCGA) is +employed to address the power load forecasting issue, aiming to effectively +integrate the influences of dynamic and static features. The CNN module is +responsible for capturing spatial characteristics from static data, while the +GRU module captures long-term dependencies in dynamic time series data. The +attention layer is designed to focus on key information from the +spatial-temporal features extracted by the parallel CNN-GRU. To substantiate +the advantages of the parallel structure model in extracting and integrating +multi-source information, a series of experiments are conducted. + +
+
+
+
+
+ + ☆ A method for identifying causality in the response of nonlinear + dynamical systems + + +
+ Predicting the response of nonlinear dynamical systems subject to random, +broadband excitation is important across a range of scientific disciplines, +such as structural dynamics and neuroscience. Building data-driven models +requires experimental measurements of the system input and output, but it can +be difficult to determine whether inaccuracies in the model stem from modelling +errors or noise. This paper presents a novel method to identify the causal +component of the input-output data from measurements of a system in the +presence of output noise, as a function of frequency, without needing a high +fidelity model. An output prediction, calculated using an available model, is +optimally combined with noisy measurements of the output to predict the input +to the system. The parameters of the algorithm balance the two output signals +and are utilised to calculate a nonlinear coherence metric as a measure of +causality. This method is applicable to a broad class of nonlinear dynamical +systems. There are currently no solutions to this problem in the absence of a +complete benchmark model. + +
+
+
+
+
+ + ☆ Efficient Arbitrary Precision Acceleration for Large Language Models on + GPU Tensor Cores + + +
+ Large language models (LLMs) have been widely applied but face challenges in +efficient inference. While quantization methods reduce computational demands, +ultra-low bit quantization with arbitrary precision is hindered by limited GPU +Tensor Core support and inefficient memory management, leading to suboptimal +acceleration. To address these challenges, we propose a comprehensive +acceleration scheme for arbitrary precision LLMs. At its core, we introduce a +novel bipolar-INT data format that facilitates parallel computing and supports +symmetric quantization, effectively reducing data redundancy. Building on this, +we implement an arbitrary precision matrix multiplication scheme that +decomposes and recovers matrices at the bit level, enabling flexible precision +while maximizing GPU Tensor Core utilization. Furthermore, we develop an +efficient matrix preprocessing method that optimizes data layout for subsequent +computations. Finally, we design a data recovery-oriented memory management +system that strategically utilizes fast shared memory, significantly enhancing +kernel execution speed and minimizing memory access latency. Experimental +results demonstrate our approach's effectiveness, with up to 13\times speedup +in matrix multiplication compared to NVIDIA's CUTLASS. When integrated into +LLMs, we achieve up to 6.7\times inference acceleration. These improvements +significantly enhance LLM inference efficiency, enabling broader and more +responsive applications of LLMs. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ How Feature Learning Can Improve Neural Scaling Laws + + +
+ We develop a solvable model of neural scaling laws beyond the kernel limit. +Theoretical analysis of this model shows how performance scales with model +size, training time, and the total amount of available data. We identify three +scaling regimes corresponding to varying task difficulties: hard, easy, and +super easy tasks. For easy and super-easy target functions, which lie in the +reproducing kernel Hilbert space (RKHS) defined by the initial infinite-width +Neural Tangent Kernel (NTK), the scaling exponents remain unchanged between +feature learning and kernel regime models. For hard tasks, defined as those +outside the RKHS of the initial NTK, we demonstrate both analytically and +empirically that feature learning can improve scaling with training time and +compute, nearly doubling the exponent for hard tasks. This leads to a different +compute optimal strategy to scale parameters and training time in the feature +learning regime. We support our finding that feature learning improves the +scaling law for hard tasks but not for easy and super-easy tasks with +experiments of nonlinear MLPs fitting functions with power-law Fourier spectra +on the circle and CNNs learning vision tasks. + +
+
+
+
+
+ + ☆ AMARO: All Heavy-Atom Transferable Neural Network Potentials of Protein + Thermodynamics + + +
+ All-atom molecular simulations offer detailed insights into macromolecular +phenomena, but their substantial computational cost hinders the exploration of +complex biological processes. We introduce Advanced Machine-learning Atomic +Representation Omni-force-field (AMARO), a new neural network potential (NNP) +that combines an O(3)-equivariant message-passing neural network architecture, +TensorNet, with a coarse-graining map that excludes hydrogen atoms. AMARO +demonstrates the feasibility of training coarser NNP, without prior energy +terms, to run stable protein dynamics with scalability and generalization +capabilities. + +
+
+
+
+
+ + ☆ Machine Learning-based vs Deep Learning-based Anomaly Detection in + Multivariate Time Series for Spacecraft Attitude Sensors + + +
+ In the framework of Failure Detection, Isolation and Recovery (FDIR) on +spacecraft, new AI-based approaches are emerging in the state of the art to +overcome the limitations commonly imposed by traditional threshold checking. + The present research aims at characterizing two different approaches to the +problem of stuck values detection in multivariate time series coming from +spacecraft attitude sensors. The analysis reveals the performance differences +in the two approaches, while commenting on their interpretability and +generalization to different scenarios. + +
+
+ comment: Accepted for the ESA SPAICE Conference 2024 +
+
+
+
+
+ + ☆ Language Models as Zero-shot Lossless Gradient Compressors: Towards + General Neural Parameter Prior Models NeurIPS 2024 + + +
+ Despite the widespread use of statistical prior models in various fields, +such models for neural network gradients have long been overlooked. The +inherent challenge stems from their high-dimensional structures and complex +interdependencies, which complicate effective modeling. In this work, we +demonstrate the potential of large language models (LLMs) to act as gradient +priors in a zero-shot setting. We examine the property by considering lossless +gradient compression -- a critical application in distributed learning -- that +depends heavily on precise probability modeling. To achieve this, we introduce +LM-GC, a novel method that integrates LLMs with arithmetic coding. Our +technique converts plain gradients into text-like formats, enhancing token +efficiency by up to 38 times compared to their plain representations. We ensure +that this data conversion maintains a close alignment with the structure of +plain gradients and the symbols commonly recognized by LLMs. Our experiments +indicate that LM-GC surpasses existing state-of-the-art lossless compression +methods, improving compression rates by 10\% up to 17.2\% across various +datasets and architectures. Additionally, our approach shows promising +compatibility with lossy compression techniques such as quantization and +sparsification. These findings highlight the significant potential of LLMs as a +model for effectively handling gradients. We will release the source code upon +publication. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ Ordinary Differential Equations for Enhanced 12-Lead ECG Generation + + +
+ In the realm of artificial intelligence, the generation of realistic training +data for supervised learning tasks presents a significant challenge. This is +particularly true in the synthesis of electrocardiograms (ECGs), where the +objective is to develop a synthetic 12-lead ECG model. The primary complexity +of this task stems from accurately modeling the intricate biological and +physiological interactions among different ECG leads. Although mathematical +process simulators have shed light on these dynamics, effectively incorporating +this understanding into generative models is not straightforward. In this work, +we introduce an innovative method that employs ordinary differential equations +(ODEs) to enhance the fidelity of generating 12-lead ECG data. This approach +integrates a system of ODEs that represent cardiac dynamics directly into the +generative model's optimization process, allowing for the production of +biologically plausible ECG training data that authentically reflects real-world +variability and inter-lead dependencies. We conducted an empirical analysis of +thousands of ECGs and found that incorporating cardiac simulation insights into +the data generation process significantly improves the accuracy of heart +abnormality classifiers trained on this synthetic 12-lead ECG data. + +
+
+
+
+
+ + ☆ Physics-aligned Schrödinger bridge + + +
+ The reconstruction of physical fields from sparse measurements is pivotal in +both scientific research and engineering applications. Traditional methods are +increasingly supplemented by deep learning models due to their efficacy in +extracting features from data. However, except for the low accuracy on complex +physical systems, these models often fail to comply with essential physical +constraints, such as governing equations and boundary conditions. To overcome +this limitation, we introduce a novel data-driven field reconstruction +framework, termed the Physics-aligned Schr\"{o}dinger Bridge (PalSB). This +framework leverages a diffusion Schr\"{o}dinger bridge mechanism that is +specifically tailored to align with physical constraints. The PalSB approach +incorporates a dual-stage training process designed to address both local +reconstruction mapping and global physical principles. Additionally, a +boundary-aware sampling technique is implemented to ensure adherence to +physical boundary conditions. We demonstrate the effectiveness of PalSB through +its application to three complex nonlinear systems: cylinder flow from Particle +Image Velocimetry experiments, two-dimensional turbulence, and a +reaction-diffusion system. The results reveal that PalSB not only achieves +higher accuracy but also exhibits enhanced compliance with physical constraints +compared to existing methods. This highlights PalSB's capability to generate +high-quality representations of intricate physical interactions, showcasing its +potential for advancing field reconstruction techniques. + +
+
+
+
+
+ + ☆ Generative Modeling of Molecular Dynamics Trajectories NeurIPS 2024 + + +
+ Molecular dynamics (MD) is a powerful technique for studying microscopic +phenomena, but its computational cost has driven significant interest in the +development of deep learning-based surrogate models. We introduce generative +modeling of molecular trajectories as a paradigm for learning flexible +multi-task surrogate models of MD from data. By conditioning on appropriately +chosen frames of the trajectory, we show such generative models can be adapted +to diverse tasks such as forward simulation, transition path sampling, and +trajectory upsampling. By alternatively conditioning on part of the molecular +system and inpainting the rest, we also demonstrate the first steps towards +dynamics-conditioned molecular design. We validate the full set of these +capabilities on tetrapeptide simulations and show that our model can produce +reasonable ensembles of protein monomers. Altogether, our work illustrates how +generative modeling can unlock value from MD data towards diverse downstream +tasks that are not straightforward to address with existing methods or even MD +itself. Code is available at https://github.com/bjing2016/mdgen. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Continual learning with task specialist + + +
+ Continual learning (CL) adapt the deep learning scenarios with timely updated +datasets. However, existing CL models suffer from the catastrophic forgetting +issue, where new knowledge replaces past learning. In this paper, we propose +Continual Learning with Task Specialists (CLTS) to address the issues of +catastrophic forgetting and limited labelled data in real-world datasets by +performing class incremental learning of the incoming stream of data. The model +consists of Task Specialists (T S) and Task Predictor (T P ) with pre-trained +Stable Diffusion (SD) module. Here, we introduce a new specialist to handle a +new task sequence and each T S has three blocks; i) a variational autoencoder +(V AE) to learn the task distribution in a low dimensional latent space, ii) a +K-Means block to perform data clustering and iii) Bootstrapping Language-Image +Pre-training (BLIP ) model to generate a small batch of captions from the input +data. These captions are fed as input to the pre-trained stable diffusion model +(SD) for the generation of task samples. The proposed model does not store any +task samples for replay, instead uses generated samples from SD to train the T +P module. A comparison study with four SOTA models conducted on three +real-world datasets shows that the proposed model outperforms all the selected +baselines + +
+
+
+
+
+ + ☆ Enriched Functional Tree-Based Classifiers: A Novel Approach Leveraging + Derivatives and Geometric Features + + +
+ The positioning of this research falls within the scalar-on-function +classification literature, a field of significant interest across various +domains, particularly in statistics, mathematics, and computer science. This +study introduces an advanced methodology for supervised classification by +integrating Functional Data Analysis (FDA) with tree-based ensemble techniques +for classifying high-dimensional time series. The proposed framework, Enriched +Functional Tree-Based Classifiers (EFTCs), leverages derivative and geometric +features, benefiting from the diversity inherent in ensemble methods to further +enhance predictive performance and reduce variance. While our approach has been +tested on the enrichment of Functional Classification Trees (FCTs), Functional +K-NN (FKNN), Functional Random Forest (FRF), Functional XGBoost (FXGB), and +Functional LightGBM (FLGBM), it could be extended to other tree-based and +non-tree-based classifiers, with appropriate considerations emerging from this +investigation. Through extensive experimental evaluations on seven real-world +datasets and six simulated scenarios, this proposal demonstrates fascinating +improvements over traditional approaches, providing new insights into the +application of FDA in complex, high-dimensional learning problems. + +
+
+
+
+
+ + ☆ CASPFormer: Trajectory Prediction from BEV Images with Deformable + Attention ICPR 2024 + + +
+ Motion prediction is an important aspect for Autonomous Driving (AD) and +Advance Driver Assistance Systems (ADAS). Current state-of-the-art motion +prediction methods rely on High Definition (HD) maps for capturing the +surrounding context of the ego vehicle. Such systems lack scalability in +real-world deployment as HD maps are expensive to produce and update in +real-time. To overcome this issue, we propose Context Aware Scene Prediction +Transformer (CASPFormer), which can perform multi-modal motion prediction from +rasterized Bird-Eye-View (BEV) images. Our system can be integrated with any +upstream perception module that is capable of generating BEV images. Moreover, +CASPFormer directly decodes vectorized trajectories without any postprocessing. +Trajectories are decoded recurrently using deformable attention, as it is +computationally efficient and provides the network with the ability to focus +its attention on the important spatial locations of the BEV images. In +addition, we also address the issue of mode collapse for generating multiple +scene-consistent trajectories by incorporating learnable mode queries. We +evaluate our model on the nuScenes dataset and show that it reaches +state-of-the-art across multiple metrics + +
+
+ comment: Under Review at ICPR 2024, Kolkata +
+
+
+
+
+ + ☆ Predicting the Stay Length of Patients in Hospitals using Convolutional + Gated Recurrent Deep Learning Model + + +
+ Predicting hospital length of stay (LoS) stands as a critical factor in +shaping public health strategies. This data serves as a cornerstone for +governments to discern trends, patterns, and avenues for enhancing healthcare +delivery. In this study, we introduce a robust hybrid deep learning model, a +combination of Multi-layer Convolutional (CNNs) deep learning, Gated Recurrent +Units (GRU), and Dense neural networks, that outperforms 11 conventional and +state-of-the-art Machine Learning (ML) and Deep Learning (DL) methodologies in +accurately forecasting inpatient hospital stay duration. Our investigation +delves into the implementation of this hybrid model, scrutinising variables +like geographic indicators tied to caregiving institutions, demographic markers +encompassing patient ethnicity, race, and age, as well as medical attributes +such as the CCS diagnosis code, APR DRG code, illness severity metrics, and +hospital stay duration. Statistical evaluations reveal the pinnacle LoS +accuracy achieved by our proposed model (CNN-GRU-DNN), which averages at 89% +across a 10-fold cross-validation test, surpassing LSTM, BiLSTM, GRU, and +Convolutional Neural Networks (CNNs) by 19%, 18.2%, 18.6%, and 7%, +respectively. Accurate LoS predictions not only empower hospitals to optimise +resource allocation and curb expenses associated with prolonged stays but also +pave the way for novel strategies in hospital stay management. This avenue +holds promise for catalysing advancements in healthcare research and +innovation, inspiring a new era of precision-driven healthcare practices. + +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ Byzantine-Robust Aggregation for Securing Decentralized Federated + Learning + + +
+ Federated Learning (FL) emerges as a distributed machine learning approach +that addresses privacy concerns by training AI models locally on devices. +Decentralized Federated Learning (DFL) extends the FL paradigm by eliminating +the central server, thereby enhancing scalability and robustness through the +avoidance of a single point of failure. However, DFL faces significant +challenges in optimizing security, as most Byzantine-robust algorithms proposed +in the literature are designed for centralized scenarios. In this paper, we +present a novel Byzantine-robust aggregation algorithm to enhance the security +of Decentralized Federated Learning environments, coined WFAgg. This proposal +handles the adverse conditions and strength robustness of dynamic decentralized +topologies at the same time by employing multiple filters to identify and +mitigate Byzantine attacks. Experimental results demonstrate the effectiveness +of the proposed algorithm in maintaining model accuracy and convergence in the +presence of various Byzantine attack scenarios, outperforming state-of-the-art +centralized Byzantine-robust aggregation schemes (such as Multi-Krum or +Clustering). These algorithms are evaluated on an IID image classification +problem in both centralized and decentralized scenarios. + +
+
+ comment: 18 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Autoregressive Generation Strategies for Top-K Sequential + Recommendations + + +
+ The goal of modern sequential recommender systems is often formulated in +terms of next-item prediction. In this paper, we explore the applicability of +generative transformer-based models for the Top-K sequential recommendation +task, where the goal is to predict items a user is likely to interact with in +the "near future". + We explore commonly used autoregressive generation strategies, including +greedy decoding, beam search, and temperature sampling, to evaluate their +performance for the Top-K sequential recommendation task. In addition, we +propose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA) +generation strategies based on multi-sequence generation with temperature +sampling and subsequent aggregation. + Experiments on diverse datasets give valuable insights regarding commonly +used strategies' applicability and show that suggested approaches improve +performance on longer time horizons compared to widely-used Top-K prediction +approach and single-sequence autoregressive generation strategies. + +
+
+
+
+
+ + ☆ Recent advances in interpretable machine learning using structure-based + protein representations + + +
+ Recent advancements in machine learning (ML) are transforming the field of +structural biology. For example, AlphaFold, a groundbreaking neural network for +protein structure prediction, has been widely adopted by researchers. The +availability of easy-to-use interfaces and interpretable outcomes from the +neural network architecture, such as the confidence scores used to color the +predicted structures, have made AlphaFold accessible even to non-ML experts. In +this paper, we present various methods for representing protein 3D structures +from low- to high-resolution, and show how interpretable ML methods can support +tasks such as predicting protein structures, protein function, and +protein-protein interactions. This survey also emphasizes the significance of +interpreting and visualizing ML-based inference for structure-based protein +representations that enhance interpretability and knowledge discovery. +Developing such interpretable approaches promises to further accelerate fields +including drug development and protein design. + +
+
+
+
+
+ + ☆ QuForge: A Library for Qudits Simulation + + +
+ Quantum computing with qudits, an extension of qubits to multiple levels, is +a research field less mature than qubit-based quantum computing. However, +qudits can offer some advantages over qubits, by representing information with +fewer separated components. In this article, we present QuForge, a Python-based +library designed to simulate quantum circuits with qudits. This library +provides the necessary quantum gates for implementing quantum algorithms, +tailored to any chosen qudit dimension. Built on top of differentiable +frameworks, QuForge supports execution on accelerating devices such as GPUs and +TPUs, significantly speeding up simulations. It also supports sparse +operations, leading to a reduction in memory consumption compared to other +libraries. Additionally, by constructing quantum circuits as differentiable +graphs, QuForge facilitates the implementation of quantum machine learning +algorithms, enhancing the capabilities and flexibility of quantum computing +research. + +
+
+ comment: 18 pages, 7 figures +
+
+
+
+
+ + ☆ Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation + + +
+ News recommendation is a challenging task that involves personalization based +on the interaction history and preferences of each user. Recent works have +leveraged the power of pretrained language models (PLMs) to directly rank news +items by using inference approaches that predominately fall into three +categories: pointwise, pairwise, and listwise learning-to-rank. While pointwise +methods offer linear inference complexity, they fail to capture crucial +comparative information between items that is more effective for ranking tasks. +Conversely, pairwise and listwise approaches excel at incorporating these +comparisons but suffer from practical limitations: pairwise approaches are +either computationally expensive or lack theoretical guarantees, and listwise +methods often perform poorly in practice. In this paper, we propose a novel +framework for PLM-based news recommendation that integrates both pointwise +relevance prediction and pairwise comparisons in a scalable manner. We present +a rigorous theoretical analysis of our framework, establishing conditions under +which our approach guarantees improved performance. Extensive experiments show +that our approach outperforms the state-of-the-art methods on the MIND and +Adressa news recommendation datasets. + +
+
+
+
+
+ + ☆ Transfer Learning in $\ell_1$ Regularized Regression: Hyperparameter + Selection Strategy based on Sharp Asymptotic Analysis + + +
+ Transfer learning techniques aim to leverage information from multiple +related datasets to enhance prediction quality against a target dataset. Such +methods have been adopted in the context of high-dimensional sparse regression, +and some Lasso-based algorithms have been invented: Trans-Lasso and Pretraining +Lasso are such examples. These algorithms require the statistician to select +hyperparameters that control the extent and type of information transfer from +related datasets. However, selection strategies for these hyperparameters, as +well as the impact of these choices on the algorithm's performance, have been +largely unexplored. To address this, we conduct a thorough, precise study of +the algorithm in a high-dimensional setting via an asymptotic analysis using +the replica method. Our approach reveals a surprisingly simple behavior of the +algorithm: Ignoring one of the two types of information transferred to the +fine-tuning stage has little effect on generalization performance, implying +that efforts for hyperparameter selection can be significantly reduced. Our +theoretical findings are also empirically supported by real-world applications +on the IMDb dataset. + +
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ PGN: The RNN's New Successor is Effective for Long-Range Time Series + Forecasting + + +
+ Due to the recurrent structure of RNN, the long information propagation path +poses limitations in capturing long-term dependencies, gradient +explosion/vanishing issues, and inefficient sequential execution. Based on +this, we propose a novel paradigm called Parallel Gated Network (PGN) as the +new successor to RNN. PGN directly captures information from previous time +steps through the designed Historical Information Extraction (HIE) layer and +leverages gated mechanisms to select and fuse it with the current time step +information. This reduces the information propagation path to $\mathcal{O}(1)$, +effectively addressing the limitations of RNN. To enhance PGN's performance in +long-range time series forecasting tasks, we propose a novel temporal modeling +framework called Temporal PGN (TPGN). TPGN incorporates two branches to +comprehensively capture the semantic information of time series. One branch +utilizes PGN to capture long-term periodic patterns while preserving their +local characteristics. The other branch employs patches to capture short-term +information and aggregate the global representation of the series. TPGN +achieves a theoretical complexity of $\mathcal{O}(\sqrt{L})$, ensuring +efficiency in its operations. Experimental results on five benchmark datasets +demonstrate the state-of-the-art (SOTA) performance and high efficiency of +TPGN, further confirming the effectiveness of PGN as the new successor to RNN +in long-range time series forecasting. The code is available in this +repository: \url{https://github.com/Water2sea/TPGN}. + +
+
+
+
+
+ + ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Efficient Bias Mitigation Without Privileged Information ECCV + 2024 + + +
+ Deep neural networks trained via empirical risk minimisation often exhibit +significant performance disparities across groups, particularly when group and +task labels are spuriously correlated (e.g., "grassy background" and "cows"). +Existing bias mitigation methods that aim to address this issue often either +rely on group labels for training or validation, or require an extensive +hyperparameter search. Such data and computational requirements hinder the +practical deployment of these methods, especially when datasets are too large +to be group-annotated, computational resources are limited, and models are +trained through already complex pipelines. In this paper, we propose Targeted +Augmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework +that leverages the entire training history of a helper model to identify +spurious samples, and generate a group-balanced training set from which a +robust model can be trained. We show that TAB improves worst-group performance +without any group information or model selection, outperforming existing +methods while maintaining overall accuracy. + +
+
+ comment: Accepted at the 18th European Conference on Computer Vision (ECCV + 2024) as an Oral presentation +
+
+
+
+
+ + ☆ Graph Edit Distance with General Costs Using Neural Set Divergence NeurIPS 2024 + + +
+ Graph Edit Distance (GED) measures the (dis-)similarity between two given +graphs, in terms of the minimum-cost edit sequence that transforms one graph to +the other. However, the exact computation of GED is NP-Hard, which has recently +motivated the design of neural methods for GED estimation. However, they do not +explicitly account for edit operations with different costs. In response, we +propose GRAPHEDX, a neural GED estimator that can work with general costs +specified for the four edit operations, viz., edge deletion, edge addition, +node deletion and node addition. We first present GED as a quadratic assignment +problem (QAP) that incorporates these four costs. Then, we represent each graph +as a set of node and edge embeddings and use them to design a family of neural +set divergence surrogates. We replace the QAP terms corresponding to each +operation with their surrogates. Computing such neural set divergence require +aligning nodes and edges of the two graphs. We learn these alignments using a +Gumbel-Sinkhorn permutation generator, additionally ensuring that the node and +edge alignments are consistent with each other. Moreover, these alignments are +cognizant of both the presence and absence of edges between node-pairs. +Experiments on several datasets, under a variety of edit cost settings, show +that GRAPHEDX consistently outperforms state-of-the-art methods and heuristics +in terms of prediction error. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ Artificial Data Point Generation in Clustered Latent Space for Small + Medical Datasets + + +
+ One of the growing trends in machine learning is the use of data generation +techniques, since the performance of machine learning models is dependent on +the quantity of the training dataset. However, in many medical applications, +collecting large datasets is challenging due to resource constraints, which +leads to overfitting and poor generalization. This paper introduces a novel +method, Artificial Data Point Generation in Clustered Latent Space (AGCL), +designed to enhance classification performance on small medical datasets +through synthetic data generation. The AGCL framework involves feature +extraction, K-means clustering, cluster evaluation based on a class separation +metric, and the generation of synthetic data points from clusters with distinct +class representations. This method was applied to Parkinson's disease +screening, utilizing facial expression data, and evaluated across multiple +machine learning classifiers. Experimental results demonstrate that AGCL +significantly improves classification accuracy compared to baseline, GN and +kNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and +cross-validation accuracy of 90.90% in majority voting over different emotions, +confirming its effectiveness in augmenting small datasets. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Preserving logical and functional dependencies in synthetic tabular data + + +
+ Dependencies among attributes are a common aspect of tabular data. However, +whether existing tabular data generation algorithms preserve these dependencies +while generating synthetic data is yet to be explored. In addition to the +existing notion of functional dependencies, we introduce the notion of logical +dependencies among the attributes in this article. Moreover, we provide a +measure to quantify logical dependencies among attributes in tabular data. +Utilizing this measure, we compare several state-of-the-art synthetic data +generation algorithms and test their capability to preserve logical and +functional dependencies on several publicly available datasets. We demonstrate +that currently available synthetic tabular data generation algorithms do not +fully preserve functional dependencies when they generate synthetic datasets. +In addition, we also showed that some tabular synthetic data generation models +can preserve inter-attribute logical dependencies. Our review and comparison of +the state-of-the-art reveal research needs and opportunities to develop +task-specific synthetic tabular data generation models. + +
+
+ comment: Submitted to Pattern Recognition Journal +
+
+
+
+
+ + ☆ Optimal Memorization Capacity of Transformers + + +
+ Recent research in the field of machine learning has increasingly focused on +the memorization capacity of Transformers, but how efficient they are is not +yet well understood. We demonstrate that Transformers can memorize labels with +$\tilde{O}(\sqrt{N})$ parameters in a next-token prediction setting for $N$ +input sequences of length $n$, which is proved to be optimal up to logarithmic +factors. This indicates that Transformers can efficiently perform memorization +with little influence from the input length $n$ owing to the benefit of +parameter sharing. We also analyze the memorization capacity in the +sequence-to-sequence setting, and find that $\tilde{O}(\sqrt{nN})$ parameters +are not only sufficient, but also necessary at least for Transformers with +hardmax. These results suggest that while self-attention mechanisms can +efficiently identify input sequences, the feed-forward network becomes a +bottleneck when associating a label to each token. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Efficient Fairness-Performance Pareto Front Computation + + +
+ There is a well known intrinsic trade-off between the fairness of a +representation and the performance of classifiers derived from the +representation. Due to the complexity of optimisation algorithms in most modern +representation learning approaches, for a given method it may be non-trivial to +decide whether the obtained fairness-performance curve of the method is +optimal, i.e., whether it is close to the true Pareto front for these +quantities for the underlying data distribution. + In this paper we propose a new method to compute the optimal Pareto front, +which does not require the training of complex representation models. We show +that optimal fair representations possess several useful structural properties, +and that these properties enable a reduction of the computation of the Pareto +Front to a compact discrete problem. We then also show that these compact +approximating problems can be efficiently solved via off-the shelf +concave-convex programming methods. + Since our approach is independent of the specific model of representations, +it may be used as the benchmark to which representation learning algorithms may +be compared. We experimentally evaluate the approach on a number of real world +benchmark datasets. + +
+
+
+
+
+ + ☆ FlowMAC: Conditional Flow Matching for Audio Coding at Low Bit Rates ICASSP 2025 + + +
+ This paper introduces FlowMAC, a novel neural audio codec for high-quality +general audio compression at low bit rates based on conditional flow matching +(CFM). FlowMAC jointly learns a mel spectrogram encoder, quantizer and decoder. +At inference time the decoder integrates a continuous normalizing flow via an +ODE solver to generate a high-quality mel spectrogram. This is the first time +that a CFM-based approach is applied to general audio coding, enabling a +scalable, simple and memory efficient training. Our subjective evaluations show +that FlowMAC at 3 kbps achieves similar quality as state-of-the-art GAN-based +and DDPM-based neural audio codecs at double the bit rate. Moreover, FlowMAC +offers a tunable inference pipeline, which permits to trade off complexity and +quality. This enables real-time coding on CPU, while maintaining high +perceptual quality. + +
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Model-Free Stochastic Process Modeling and Optimization using + Normalizing Flows + + +
+ Real-world chemical processes often exhibit stochastic dynamics with +non-trivial correlations and state-dependent fluctuations. However, most +process models simply add stationary noise terms to a deterministic prediction, +which can lead to inaccurate predictions. This work proposes using conditional +normalizing flows as discrete-time models (DTMs) to learn the stochastic +dynamics of chemical processes. Normalizing flows learn an explicit expression +of the system states' probability density function (PDF) given prior states and +control inputs. The resulting model naturally allows for formulating stochastic +and probabilistic setpoint-tracking objectives and chance constraints. In +applications to a continuous reactor and a reactor cascade, the normalizing +flow yields stable simulations over long time horizons and high-quality results +in stochastic and probabilistic MPC formulation for open-loop control. +Furthermore, a chance-constrained optimization finds reliable startup controls +for the reactor cascade with stochastic reactions. In conclusion, the +conditional normalizing flow presents an excellent choice for modeling +nonlinear stochastic dynamics. + +
+
+ comment: 13 pages, 7 Figures, 5 Tables +
+
+
+
+
+ + ☆ Convolutional Signal Propagation: A Simple Scalable Algorithm for + Hypergraphs + + +
+ Last decade has seen the emergence of numerous methods for learning on +graphs, particularly Graph Neural Networks (GNNs). These methods, however, are +often not directly applicable to more complex structures like bipartite graphs +(equivalent to hypergraphs), which represent interactions among two entity +types (e.g. a user liking a movie). This paper proposes Convolutional Signal +Propagation (CSP), a non-parametric simple and scalable method that natively +operates on bipartite graphs (hypergraphs) and can be implemented with just a +few lines of code. After defining CSP, we demonstrate its relationship with +well-established methods like label propagation, Naive Bayes, and Hypergraph +Convolutional Networks. We evaluate CSP against several reference methods on +real-world datasets from multiple domains, focusing on retrieval and +classification tasks. Our results show that CSP offers competitive performance +while maintaining low computational complexity, making it an ideal first choice +as a baseline for hypergraph node classification and retrieval. Moreover, +despite operating on hypergraphs, CSP achieves good results in tasks typically +not associated with hypergraphs, such as natural language processing. + +
+
+
+
+
+ + ☆ Benign or Not-Benign Overfitting in Token Selection of Attention + Mechanism + + +
+ Modern over-parameterized neural networks can be trained to fit the training +data perfectly while still maintaining a high generalization performance. This +"benign overfitting" phenomenon has been studied in a surge of recent +theoretical work; however, most of these studies have been limited to linear +models or two-layer neural networks. In this work, we analyze benign +overfitting in the token selection mechanism of the attention architecture, +which characterizes the success of transformer models. We first show the +existence of a benign overfitting solution and explain its mechanism in the +attention architecture. Next, we discuss whether the model converges to such a +solution, raising the difficulties specific to the attention architecture. We +then present benign overfitting cases and not-benign overfitting cases by +conditioning different scenarios based on the behavior of attention +probabilities during training. To the best of our knowledge, this is the first +study to characterize benign overfitting for the attention mechanism. + +
+
+
+
+
+ + ☆ Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric + GNNs NeurIPS 2024 + + +
+ Geometric graph neural networks (GNNs) have emerged as powerful tools for +modeling molecular geometry. However, they encounter limitations in effectively +capturing long-range interactions in large molecular systems. To address this +challenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs +to expand the scope of their capabilities by incorporating mesh points +alongside atoms and reimaging traditional mathematical operations in a +trainable manner. Neural P$^3$M exhibits flexibility across a wide range of +molecular systems and demonstrates remarkable accuracy in predicting energies +and forces, outperforming on benchmarks such as the MD22 dataset. It also +achieves an average improvement of 22% on the OE62 dataset while integrating +with various architectures. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic dataset that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. + +
+
+
+
+
+ + ☆ Good Data Is All Imitation Learning Needs + + +
+ In this paper, we address the limitations of traditional teacher-student +models, imitation learning, and behaviour cloning in the context of +Autonomous/Automated Driving Systems (ADS), where these methods often struggle +with incomplete coverage of real-world scenarios. To enhance the robustness of +such models, we introduce the use of Counterfactual Explanations (CFEs) as a +novel data augmentation technique for end-to-end ADS. CFEs, by generating +training samples near decision boundaries through minimal input modifications, +lead to a more comprehensive representation of expert driver strategies, +particularly in safety-critical scenarios. This approach can therefore help +improve the model's ability to handle rare and challenging driving events, such +as anticipating darting out pedestrians, ultimately leading to safer and more +trustworthy decision-making for ADS. Our experiments in the CARLA simulator +demonstrate that CF-Driver outperforms the current state-of-the-art method, +achieving a higher driving score and lower infraction rates. Specifically, +CF-Driver attains a driving score of 84.2, surpassing the previous best model +by 15.02 percentage points. These results highlight the effectiveness of +incorporating CFEs in training end-to-end ADS. To foster further research, the +CF-Driver code is made publicly available. + +
+
+
+
+
+ + ☆ RmGPT: Rotating Machinery Generative Pretrained Model + + +
+ In industry, the reliability of rotating machinery is critical for production +efficiency and safety. Current methods of Prognostics and Health Management +(PHM) often rely on task-specific models, which face significant challenges in +handling diverse datasets with varying signal characteristics, fault modes and +operating conditions. Inspired by advancements in generative pretrained models, +we propose RmGPT, a unified model for diagnosis and prognosis tasks. RmGPT +introduces a novel token-based framework, incorporating Signal Tokens, Prompt +Tokens, Time-Frequency Task Tokens and Fault Tokens to handle heterogeneous +data within a unified model architecture. We leverage self-supervised learning +for robust feature extraction and introduce a next signal token prediction +pretraining strategy, alongside efficient prompt learning for task-specific +adaptation. Extensive experiments demonstrate that RmGPT significantly +outperforms state-of-the-art algorithms, achieving near-perfect accuracy in +diagnosis tasks and exceptionally low errors in prognosis tasks. Notably, RmGPT +excels in few-shot learning scenarios, achieving 92% accuracy in 16-class +one-shot experiments, highlighting its adaptability and robustness. This work +establishes RmGPT as a powerful PHM foundation model for rotating machinery, +advancing the scalability and generalizability of PHM solutions. + +
+
+
+
+
+ + ☆ Deep Manifold Part 1: Anatomy of Neural Network Manifold + + +
+ Based on the numerical manifold method principle, we developed a mathematical +framework of a neural network manifold: Deep Manifold and discovered that +neural networks: 1) is numerical computation combining forward and inverse; 2) +have near infinite degrees of freedom; 3) exponential learning capacity with +depth; 4) have self-progressing boundary conditions; 5) has training hidden +bottleneck. We also define two concepts: neural network learning space and deep +manifold space and introduce two concepts: neural network intrinsic pathway and +fixed point. We raise three fundamental questions: 1). What is the training +completion definition; 2). where is the deep learning convergence point (neural +network fixed point); 3). How important is token timestamp in training data +given negative time is critical in inverse problem. + +
+
+
+
+
+ + ☆ Conjugate Bayesian Two-step Change Point Detection for Hawkes Process NeurIPS 2024 + + +
+ The Bayesian two-step change point detection method is popular for the Hawkes +process due to its simplicity and intuitiveness. However, the non-conjugacy +between the point process likelihood and the prior requires most existing +Bayesian two-step change point detection methods to rely on non-conjugate +inference methods. These methods lack analytical expressions, leading to low +computational efficiency and impeding timely change point detection. To address +this issue, this work employs data augmentation to propose a conjugate Bayesian +two-step change point detection method for the Hawkes process, which proves to +be more accurate and efficient. Extensive experiments on both synthetic and +real data demonstrate the superior effectiveness and efficiency of our method +compared to baseline methods. Additionally, we conduct ablation studies to +explore the robustness of our method concerning various hyperparameters. Our +code is publicly available at https://github.com/Aurora2050/CoBay-CPD. + +
+
+ comment: 10 pages, accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Multimodal Banking Dataset: Understanding Client Needs through Event + Sequences + + +
+ Financial organizations collect a huge amount of data about clients that +typically has a temporal (sequential) structure and is collected from various +sources (modalities). Due to privacy issues, there are no large-scale +open-source multimodal datasets of event sequences, which significantly limits +the research in this area. In this paper, we present the industrial-scale +publicly available multimodal banking dataset, MBD, that contains more than +1.5M corporate clients with several modalities: 950M bank transactions, 1B geo +position events, 5M embeddings of dialogues with technical support and monthly +aggregated purchases of four bank's products. All entries are properly +anonymized from real proprietary bank data. Using this dataset, we introduce a +novel benchmark with two business tasks: campaigning (purchase prediction in +the next month) and matching of clients. We provide numerical results that +demonstrate the superiority of our multi-modal baselines over single-modal +techniques for each task. As a result, the proposed dataset can open new +perspectives and facilitate the future development of practically important +large-scale multimodal algorithms for event sequences. + HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD + Github Link: https://github.com/Dzhambo/MBD + +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ Multiplicative Logit Adjustment Approximates Neural-Collapse-Aware + Decision Boundary Adjustment + + +
+ Real-world data distributions are often highly skewed. This has spurred a +growing body of research on long-tailed recognition to address this imbalance +in training classification models. Among the methods studied, multiplicative +logit adjustment (MLA) stands out as a simple and effective method. However, it +lacks theoretical guarantees, which raises concerns about the optimality of its +adjustment method. We provide a theoretical justification for the effectiveness +of MLA with the following two-step theory. First, we develop a theory that +adjusts optimal decision boundaries by estimating feature spread on the basis +of neural collapse. Then, we demonstrate that MLA approximates this optimal +method. Additionally, through experiments on long-tailed datasets, we +illustrate the practical usefulness of MLA under more realistic conditions. We +also offer experimental insights to guide the tuning of MLA's hyperparameters. + +
+
+
+
+
+ + ☆ Derandomizing Multi-Distribution Learning + + +
+ Multi-distribution or collaborative learning involves learning a single +predictor that works well across multiple data distributions, using samples +from each during training. Recent research on multi-distribution learning, +focusing on binary loss and finite VC dimension classes, has shown near-optimal +sample complexity that is achieved with oracle efficient algorithms. That is, +these algorithms are computationally efficient given an efficient ERM for the +class. Unlike in classical PAC learning, where the optimal sample complexity is +achieved with deterministic predictors, current multi-distribution learning +algorithms output randomized predictors. This raises the question: can these +algorithms be derandomized to produce a deterministic predictor for multiple +distributions? Through a reduction to discrepancy minimization, we show that +derandomizing multi-distribution learning is computationally hard, even when +ERM is computationally efficient. On the positive side, we identify a +structural condition enabling an efficient black-box reduction, converting +existing randomized multi-distribution predictors into deterministic ones. + +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ Joint Source-Channel Coding: Fundamentals and Recent Progress in + Practical Designs + + +
+ Semantic- and task-oriented communication has emerged as a promising approach +to reducing the latency and bandwidth requirements of next-generation mobile +networks by transmitting only the most relevant information needed to complete +a specific task at the receiver. This is particularly advantageous for +machine-oriented communication of high data rate content, such as images and +videos, where the goal is rapid and accurate inference, rather than perfect +signal reconstruction. While semantic- and task-oriented compression can be +implemented in conventional communication systems, joint source-channel coding +(JSCC) offers an alternative end-to-end approach by optimizing compression and +channel coding together, or even directly mapping the source signal to the +modulated waveform. Although all digital communication systems today rely on +separation, thanks to its modularity, JSCC is known to achieve higher +performance in finite blocklength scenarios, and to avoid cliff and the +levelling-off effects in time-varying channel scenarios. This article provides +an overview of the information theoretic foundations of JSCC, surveys practical +JSCC designs over the decades, and discusses the reasons for their limited +adoption in practical systems. We then examine the recent resurgence of JSCC, +driven by the integration of deep learning techniques, particularly through +DeepJSCC, highlighting its many surprising advantages in various scenarios. +Finally, we discuss why it may be time to reconsider today's strictly separate +architectures, and reintroduce JSCC to enable high-fidelity, low-latency +communications in critical applications such as autonomous driving, drone +surveillance, or wearable systems. + +
+
+ comment: Under review for possible publication +
+
+
+
+
+ + ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new +variations of data appearance (domains) and open-set conditions, where both +known and novel categories are present at test time. The challenges of this +task arise from the dual need to generalize across diverse domains and +accurately quantify category novelty, which is critical for applications in +dynamic environments. Recently, meta-learning techniques have demonstrated +superior results in OSDG, effectively orchestrating the meta-train and -test +tasks by employing varied random categories and predefined domain partition +strategies. These approaches prioritize a well-designed training schedule over +traditional methods that focus primarily on data augmentation and the +enhancement of discriminative feature learning. The prevailing meta-learning +models in OSDG typically utilize a predefined sequential domain scheduler to +structure data partitions. However, a crucial aspect that remains inadequately +explored is the influence brought by strategies of domain schedulers during +training. In this paper, we observe that an adaptive domain scheduler benefits +more in OSDG compared with prefixed sequential and random domain schedulers. We +propose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve +an adaptive domain scheduler. This method strategically sequences domains by +assessing their reliabilities in utilizing a follower network, trained with +confidence scores learned in an evidential manner, regularized by max rebiasing +discrepancy, and optimized in a bi-level manner. The results show that our +method substantially improves OSDG performance and achieves more discriminative +embeddings for both the seen and unseen categories. The source code will be +available at https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code will be available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ☆ A Simple but Strong Baseline for Sounding Video Generation: Effective + Adaptation of Audio and Video Diffusion Models for Joint Generation + + +
+ In this work, we build a simple but strong baseline for sounding video +generation. Given base diffusion models for audio and video, we integrate them +with additional modules into a single model and train it to make the model +jointly generate audio and video. To enhance alignment between audio-video +pairs, we introduce two novel mechanisms in our model. The first one is +timestep adjustment, which provides different timestep information to each base +model. It is designed to align how samples are generated along with timesteps +across modalities. The second one is a new design of the additional modules, +termed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE, +cross-modal information is embedded as if it represents temporal position +information, and the embeddings are fed into the model like positional +encoding. Compared with the popular cross-attention mechanism, CMC-PE provides +a better inductive bias for temporal alignment in the generated data. +Experimental results validate the effectiveness of the two newly introduced +mechanisms and also demonstrate that our method outperforms existing methods. + +
+
+ comment: The source code will be released soon +
+
+
+
+
+ + ☆ MASSFormer: Mobility-Aware Spectrum Sensing using Transformer-Driven + Tiered Structure + + +
+ In this paper, we develop a novel mobility-aware transformer-driven tiered +structure (MASSFormer) based cooperative spectrum sensing method that +effectively models the spatio-temporal dynamics of user movements. Unlike +existing methods, our method considers a dynamic scenario involving mobile +primary users (PUs) and secondary users (SUs)and addresses the complexities +introduced by user mobility. The transformer architecture utilizes an attention +mechanism, enabling the proposed method to adeptly model the temporal dynamics +of user mobility by effectively capturing long-range dependencies within the +input data. The proposed method first computes tokens from the sequence of +covariance matrices (CMs) for each SU and processes them in parallel using the +SUtransformer network to learn the spatio-temporal features at SUlevel. +Subsequently, the collaborative transformer network learns the group-level PU +state from all SU-level feature representations. The attention-based sequence +pooling method followed by the transformer encoder adjusts the contributions of +all tokens. The main goal of predicting the PU states at each SU-level and +group-level is to improve detection performance even more. We conducted a +sufficient amount of simulations and compared the detection performance of +different SS methods. The proposed method is tested under imperfect reporting +channel scenarios to show robustness. The efficacy of our method is validated +with the simulation results demonstrating its higher performance compared with +existing methods in terms of detection probability, sensing error, and +classification accuracy. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Optimizing the Induced Correlation in Omnibus Joint Graph Embeddings + + +
+ Theoretical and empirical evidence suggests that joint graph embedding +algorithms induce correlation across the networks in the embedding space. In +the Omnibus joint graph embedding framework, previous results explicitly +delineated the dual effects of the algorithm-induced and model-inherent +correlations on the correlation across the embedded networks. Accounting for +and mitigating the algorithm-induced correlation is key to subsequent +inference, as sub-optimal Omnibus matrix constructions have been demonstrated +to lead to loss in inference fidelity. This work presents the first efforts to +automate the Omnibus construction in order to address two key questions in this +joint embedding framework: the correlation-to-OMNI problem and the flat +correlation problem. In the flat correlation problem, we seek to understand the +minimum algorithm-induced flat correlation (i.e., the same across all graph +pairs) produced by a generalized Omnibus embedding. Working in a subspace of +the fully general Omnibus matrices, we prove both a lower bound for this flat +correlation and that the classical Omnibus construction induces the maximal +flat correlation. In the correlation-to-OMNI problem, we present an algorithm +-- named corr2Omni -- that, from a given matrix of estimated pairwise graph +correlations, estimates the matrix of generalized Omnibus weights that induces +optimal correlation in the embedding space. Moreover, in both simulated and +real data settings, we demonstrate the increased effectiveness of our corr2Omni +algorithm versus the classical Omnibus construction. + +
+
+ comment: 34 pages, 8 figures +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ Dataset Distillation-based Hybrid Federated Learning on Non-IID Data + + +
+ In federated learning, the heterogeneity of client data has a great impact on +the performance of model training. Many heterogeneity issues in this process +are raised by non-independently and identically distributed (Non-IID) data. +This study focuses on the issue of label distribution skew. To address it, we +propose a hybrid federated learning framework called HFLDD, which integrates +dataset distillation to generate approximately independent and equally +distributed (IID) data, thereby improving the performance of model training. +Particularly, we partition the clients into heterogeneous clusters, where the +data labels among different clients within a cluster are unbalanced while the +data labels among different clusters are balanced. The cluster headers collect +distilled data from the corresponding cluster members, and conduct model +training in collaboration with the server. This training process is like +traditional federated learning on IID data, and hence effectively alleviates +the impact of Non-IID data on model training. Furthermore, we compare our +proposed method with typical baseline methods on public datasets. Experimental +results demonstrate that when the data labels are severely imbalanced, the +proposed HFLDD outperforms the baseline methods in terms of both test accuracy +and communication cost. + +
+
+
+
+
+ + ☆ Functional Classification of Spiking Signal Data Using Artificial + Intelligence Techniques: A Review + + +
+ Human brain neuron activities are incredibly significant nowadays. Neuronal +behavior is assessed by analyzing signal data such as electroencephalography +(EEG), which can offer scientists valuable information about diseases and +human-computer interaction. One of the difficulties researchers confront while +evaluating these signals is the existence of large volumes of spike data. +Spikes are some considerable parts of signal data that can happen as a +consequence of vital biomarkers or physical issues such as electrode movements. +Hence, distinguishing types of spikes is important. From this spot, the spike +classification concept commences. Previously, researchers classified spikes +manually. The manual classification was not precise enough as it involves +extensive analysis. Consequently, Artificial Intelligence (AI) was introduced +into neuroscience to assist clinicians in classifying spikes correctly. This +review discusses the importance and use of AI in spike classification, focusing +on the recognition of neural activity noises. The task is divided into three +main components: preprocessing, classification, and evaluation. Existing +methods are introduced and their importance is determined. The review also +highlights the need for more efficient algorithms. The primary goal is to +provide a perspective on spike classification for future research and provide a +comprehensive understanding of the methodologies and issues involved. The +review organizes materials in the spike classification field for future +studies. In this work, numerous studies were extracted from different +databases. The PRISMA-related research guidelines were then used to choose +papers. Then, research studies based on spike classification using machine +learning and deep learning approaches with effective preprocessing were +selected. + +
+
+ comment: 8 figures, 32 pages +
+
+
+
+
+ + ☆ Comparing Unidirectional, Bidirectional, and Word2vec Models for + Discovering Vulnerabilities in Compiled Lifted Code + + +
+ Ransomware and other forms of malware cause significant financial and +operational damage to organizations by exploiting long-standing and often +difficult-to-detect software vulnerabilities. To detect vulnerabilities such as +buffer overflows in compiled code, this research investigates the application +of unidirectional transformer-based embeddings, specifically GPT-2. Using a +dataset of LLVM functions, we trained a GPT-2 model to generate embeddings, +which were subsequently used to build LSTM neural networks to differentiate +between vulnerable and non-vulnerable code. Our study reveals that embeddings +from the GPT-2 model significantly outperform those from bidirectional models +of BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%. +LSTM neural networks were developed with both frozen and unfrozen embedding +model layers. The model with the highest performance was achieved when the +embedding layers were unfrozen. Further, the research finds that, in exploring +the impact of different optimizers within this domain, the SGD optimizer +demonstrates superior performance over Adam. Overall, these findings reveal +important insights into the potential of unidirectional transformer-based +approaches in enhancing cybersecurity defenses. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human + Connectomes NeurIPS 2024 + + +
+ Although modern imaging technologies allow us to study connectivity between +two distinct brain regions in-vivo, an in-depth understanding of how anatomical +structure supports brain function and how spontaneous functional fluctuations +emerge remarkable cognition is still elusive. Meanwhile, tremendous efforts +have been made in the realm of machine learning to establish the nonlinear +mapping between neuroimaging data and phenotypic traits. However, the absence +of neuroscience insight in the current approaches poses significant challenges +in understanding cognitive behavior from transient neural activities. To +address this challenge, we put the spotlight on the coupling mechanism of +structural connectivity (SC) and functional connectivity (FC) by formulating +such network neuroscience question into an expressive graph representation +learning problem for high-order topology. Specifically, we introduce the +concept of topological detour to characterize how a ubiquitous instance of FC +(direct link) is supported by neural pathways (detour) physically wired by SC, +which forms a cyclic loop interacted by brain structure and function. In the +clich\'e of machine learning, the multi-hop detour pathway underlying SC-FC +coupling allows us to devise a novel multi-head self-attention mechanism within +Transformer to capture multi-modal feature representation from paired graphs of +SC and FC. Taken together, we propose a biological-inspired deep model, coined +as NeuroPath, to find putative connectomic feature representations from the +unprecedented amount of neuroimages, which can be plugged into various +downstream applications such as task recognition and disease diagnosis. We have +evaluated NeuroPath on large-scale public datasets including HCP and UK Biobank +under supervised and zero-shot learning, where the state-of-the-art performance +by our NeuroPath indicates great potential in network neuroscience. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization, recent advances primarily focus on improving the LLM +components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector. Extensive ablation experiments +validate the effectiveness of introducing CMoE under any configuration, with up +to an average 8% performance gains. We further provide interpretation analysis +of the tug-of-war problem from the perspective of gradient optimization and +parameter statistics. Compared to previous state-of-the-art medical MLLMs, +Uni-Med achieves competitive or superior evaluation metrics on diverse tasks. +Code, data and model will be soon available at GitHub. + +
+
+
+
+
+ + ☆ Sequential Kernelized Stein Discrepancy + + +
+ We present a sequential version of the kernelized Stein discrepancy, which +allows for conducting goodness-of-fit tests for unnormalized densities that are +continuously monitored and adaptively stopped. That is, the sample size need +not be fixed prior to data collection; the practitioner can choose whether to +stop the test or continue to gather evidence at any time while controlling the +false discovery rate. In stark contrast to related literature, we do not impose +uniform boundedness on the Stein kernel. Instead, we exploit the potential +boundedness of the Stein kernel at arbitrary point evaluations to define test +martingales, that give way to the subsequent novel sequential tests. We prove +the validity of the test, as well as an asymptotic lower bound for the +logarithmic growth of the wealth process under the alternative. We further +illustrate the empirical performance of the test with a variety of +distributions, including restricted Boltzmann machines. + +
+
+
+
+
+ + ☆ HaloScope: Harnessing Unlabeled LLM Generations for Hallucination + Detection NeurIPS 2024 + + +
+ The surge in applications of large language models (LLMs) has prompted +concerns about the generation of misleading or fabricated information, known as +hallucinations. Therefore, detecting hallucinations has become critical to +maintaining trust in LLM-generated content. A primary challenge in learning a +truthfulness classifier is the lack of a large amount of labeled truthful and +hallucinated data. To address the challenge, we introduce HaloScope, a novel +learning framework that leverages the unlabeled LLM generations in the wild for +hallucination detection. Such unlabeled data arises freely upon deploying LLMs +in the open world, and consists of both truthful and hallucinated information. +To harness the unlabeled data, we present an automated membership estimation +score for distinguishing between truthful and untruthful generations within +unlabeled mixture data, thereby enabling the training of a binary truthfulness +classifier on top. Importantly, our framework does not require extra data +collection and human annotations, offering strong flexibility and practicality +for real-world applications. Extensive experiments show that HaloScope can +achieve superior hallucination detection performance, outperforming the +competitive rivals by a significant margin. Code is available at +https://github.com/deeplearningwisc/haloscope. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Broadcast Product: Shape-aligned Element-wise Multiplication and Beyond + + +
+ We propose a new operator defined between two tensors, the broadcast product. +The broadcast product calculates the Hadamard product after duplicating +elements to align the shapes of the two tensors. Complex tensor operations in +libraries like \texttt{numpy} can be succinctly represented as mathematical +expressions using the broadcast product. Finally, we propose a novel tensor +decomposition using the broadcast product, highlighting its potential +applications in dimensionality reduction. + +
+
+
+
+
+ + ☆ Does Worst-Performing Agent Lead the Pack? Analyzing Agent Dynamics in + Unified Distributed SGD NeurIPS 2024 + + +
+ Distributed learning is essential to train machine learning algorithms across +heterogeneous agents while maintaining data privacy. We conduct an asymptotic +analysis of Unified Distributed SGD (UD-SGD), exploring a variety of +communication patterns, including decentralized SGD and local SGD within +Federated Learning (FL), as well as the increasing communication interval in +the FL setting. In this study, we assess how different sampling strategies, +such as i.i.d. sampling, shuffling, and Markovian sampling, affect the +convergence speed of UD-SGD by considering the impact of agent dynamics on the +limiting covariance matrix as described in the Central Limit Theorem (CLT). Our +findings not only support existing theories on linear speedup and asymptotic +network independence, but also theoretically and empirically show how efficient +sampling strategies employed by individual agents contribute to overall +convergence in UD-SGD. Simulations reveal that a few agents using highly +efficient sampling can achieve or surpass the performance of the majority +employing moderately improved strategies, providing new insights beyond +traditional analyses focusing on the worst-performing agent. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ MathDSL: A Domain-Specific Language for Concise Mathematical Solutions + Via Program Synthesis + + +
+ We present MathDSL, a Domain-Specific Language (DSL) for mathematical +equation solving, which, when deployed in program synthesis models, outperforms +state-of-the-art reinforcement-learning-based methods. We also introduce a +quantitative metric for measuring the conciseness of a mathematical solution +and demonstrate the improvement in the quality of generated solutions compared +to other methods. Our system demonstrates that a program synthesis system +(DreamCoder) using MathDSL can generate programs that solve linear equations +with greater accuracy and conciseness than using reinforcement learning +systems. Additionally, we demonstrate that if we use the action spaces of +previous reinforcement learning systems as DSLs, MathDSL outperforms the +action-space-DSLs. We use DreamCoder to store equation-solving strategies as +learned abstractions in its program library and demonstrate that by using +MathDSL, these can be converted into human-interpretable solution strategies +that could have applications in mathematical education. + +
+
+
+
+
+ + ♻ ☆ Assumption violations in causal discovery and the robustness of score + matching NeurIPS + 2023 + + +
+ When domain knowledge is limited and experimentation is restricted by +ethical, financial, or time constraints, practitioners turn to observational +causal discovery methods to recover the causal structure, exploiting the +statistical properties of their data. Because causal discovery without further +assumptions is an ill-posed problem, each algorithm comes with its own set of +usually untestable assumptions, some of which are hard to meet in real +datasets. Motivated by these considerations, this paper extensively benchmarks +the empirical performance of recent causal discovery methods on observational +i.i.d. data generated under different background conditions, allowing for +violations of the critical assumptions required by each selected approach. Our +experimental findings show that score matching-based methods demonstrate +surprising performance in the false positive and false negative rate of the +inferred graph in these challenging scenarios, and we provide theoretical +insights into their performance. This work is also the first effort to +benchmark the stability of causal discovery algorithms with respect to the +values of their hyperparameters. Finally, we hope this paper will set a new +standard for the evaluation of causal discovery methods and can serve as an +accessible entry point for practitioners interested in the field, highlighting +the empirical implications of different algorithm choices. + +
+
+ comment: 37th Conference on Neural Information Processing Systems (NeurIPS + 2023) +
+
+
+
+
+ + ♻ ☆ Quantum Kernel Methods under Scrutiny: A Benchmarking Study + + +
+ Since the entry of kernel theory in the field of quantum machine learning, +quantum kernel methods (QKMs) have gained increasing attention with regard to +both probing promising applications and delivering intriguing research +insights. Two common approaches for computing the underlying Gram matrix have +emerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs). +Benchmarking these methods is crucial to gain robust insights and to understand +their practical utility. In this work, we present a comprehensive large-scale +study examining QKMs based on FQKs and PQKs across a manifold of design +choices. Our investigation encompasses both classification and regression tasks +for five dataset families and 64 datasets, systematically comparing the use of +FQKs and PQKs quantum support vector machines and kernel ridge regression. This +resulted in over 20,000 models that were trained and optimized using a +state-of-the-art hyperparameter search to ensure robust and comprehensive +insights. We delve into the importance of hyperparameters on model performance +scores and support our findings through rigorous correlation analyses. In this, +we also closely inspect two data encoding strategies. Moreover, we provide an +in-depth analysis addressing the design freedom of PQKs and explore the +underlying principles responsible for learning. Our goal is not to identify the +best-performing model for a specific task but to uncover the mechanisms that +lead to effective QKMs and reveal universal patterns. + +
+
+ comment: 18 pages main text including 12 figures and 1 table, appendix 14 + pages with 19 figures and 1 table; restructure result section and prune + appendix +
+
+
+
+
+ + ♻ ☆ Two-Timescale Gradient Descent Ascent Algorithms for Nonconvex Minimax + Optimization ICML 2020 + + +
+ We provide a unified analysis of two-timescale gradient descent ascent +(TTGDA) for solving structured nonconvex minimax optimization problems in the +form of $\min_\textbf{x} \max_{\textbf{y} \in Y} f(\textbf{x}, \textbf{y})$, +where the objective function $f(\textbf{x}, \textbf{y})$ is nonconvex in +$\textbf{x}$ and concave in $\textbf{y}$, and the constraint set $Y \subseteq +\mathbb{R}^n$ is convex and bounded. In the convex-concave setting, the +single-timescale gradient descent ascent (GDA) algorithm is widely used in +applications and has been shown to have strong convergence guarantees. In more +general settings, however, it can fail to converge. Our contribution is to +design TTGDA algorithms that are effective beyond the convex-concave setting, +efficiently finding a stationary point of the function $\Phi(\cdot) := +\max_{\textbf{y} \in Y} f(\cdot, \textbf{y})$. We also establish theoretical +bounds on the complexity of solving both smooth and nonsmooth nonconvex-concave +minimax optimization problems. To the best of our knowledge, this is the first +systematic analysis of TTGDA for nonconvex minimax optimization, shedding light +on its superior performance in training generative adversarial networks (GANs) +and in other real-world application problems. + +
+
+ comment: A preliminary version [arXiv:1906.00331] of this paper, with a subset + of the results that are presented here, was presented at ICML 2020; 44 Pages, + 10 Figures +
+
+
+
+
+ + ♻ ☆ Ascend HiFloat8 Format for Deep Learning + + +
+ This preliminary white paper proposes a novel 8-bit floating-point data +format HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered +precision. For normal value encoding, it provides 7 exponent values with 3-bit +mantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with +1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7 +extra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades). +Meanwhile, HiF8 encodes all the special values except that positive zero and +negative zero are represented by only one bit-pattern. Thanks to the better +balance between precision and dynamic range, HiF8 can be simultaneously used in +both forward and backward passes of AI training. In this paper, we will +describe the definition and rounding methods of HiF8, as well as the tentative +training and inference solutions. To demonstrate the efficacy of HiF8, massive +simulation results on various neural networks, including traditional neural +networks and large language models (LLMs), will also be presented. + +
+
+ comment: 13 Pages, 4 Figures, 9 Tables +
+
+
+
+
+ + ♻ ☆ Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of + Peptides + + +
+ Molecular Dynamics (MD) is crucial in various fields such as materials +science, chemistry, and pharmacology to name a few. Conventional MD software +struggles with the balance between time cost and prediction accuracy, which +restricts its wider application. Recently, data-driven approaches based on deep +generative models have been devised for time-coarsened dynamics, which aim at +learning dynamics of diverse molecular systems over a long timestep, enjoying +both universality and efficiency. Nevertheless, most current methods are +designed solely to learn from the data distribution regardless of the +underlying Boltzmann distribution, and the physics priors such as energies and +forces are constantly overlooked. In this work, we propose a conditional +generative model called Force-guided Bridge Matching (FBM), which learns +full-atom time-coarsened dynamics and targets the Boltzmann-constrained +distribution. With the guidance of our delicately-designed intermediate force +field, FBM leverages favourable physics priors into the generation process, +giving rise to enhanced simulations. Experiments on two datasets consisting of +peptides verify our superiority in terms of comprehensive metrics and +demonstrate transferability to unseen systems. + +
+
+
+
+
+ + ♻ ☆ MLPs Learn In-Context on Regression and Classification Tasks + + +
+ In-context learning (ICL), the remarkable ability to solve a task from only +input exemplars, is often assumed to be a unique hallmark of Transformer +models. By examining commonly employed synthetic ICL tasks, we demonstrate that +multi-layer perceptrons (MLPs) can also learn in-context. Moreover, MLPs, and +the closely related MLP-Mixer models, learn in-context competitively with +Transformers given the same compute budget in this setting. We further show +that MLPs outperform Transformers on a series of classical tasks from +psychology designed to test relational reasoning, which are closely related to +in-context classification. These results underscore a need for studying +in-context learning beyond attention-based architectures, while also +challenging strong prior arguments about MLPs' limited ability to solve +relational tasks. Altogether, our results highlight the unexpected competence +of MLPs, and support the growing interest in all-MLP alternatives to +task-specific architectures. + +
+
+ comment: 30 pages, 10 figures, code available at + https://github.com/wtong98/mlp-icl +
+
+
+
+
+ + ♻ ☆ A Stochastic Quasi-Newton Method for Non-convex Optimization with + Non-uniform Smoothness + + +
+ Classical convergence analyses for optimization algorithms rely on the +widely-adopted uniform smoothness assumption. However, recent experimental +studies have demonstrated that many machine learning problems exhibit +non-uniform smoothness, meaning the smoothness factor is a function of the +model parameter instead of a universal constant. In particular, it has been +observed that the smoothness grows with respect to the gradient norm along the +training trajectory. Motivated by this phenomenon, the recently introduced +$(L_0, L_1)$-smoothness is a more general notion, compared to traditional +$L$-smoothness, that captures such positive relationship between smoothness and +gradient norm. Under this type of non-uniform smoothness, existing literature +has designed stochastic first-order algorithms by utilizing gradient clipping +techniques to obtain the optimal $\mathcal{O}(\epsilon^{-3})$ sample complexity +for finding an $\epsilon$-approximate first-order stationary solution. +Nevertheless, the studies of quasi-Newton methods are still lacking. +Considering higher accuracy and more robustness for quasi-Newton methods, in +this paper we propose a fast stochastic quasi-Newton method when there exists +non-uniformity in smoothness. Leveraging gradient clipping and variance +reduction, our algorithm can achieve the best-known +$\mathcal{O}(\epsilon^{-3})$ sample complexity and enjoys convergence speedup +with simple hyperparameter tuning. Our numerical experiments show that our +proposed algorithm outperforms the state-of-the-art approaches. + +
+
+ comment: Paper accepted by CDC 2024 +
+
+
+
+
+ + ♻ ☆ Message-Passing Monte Carlo: Generating low-discrepancy point sets via + Graph Neural Networks + + +
+ Discrepancy is a well-known measure for the irregularity of the distribution +of a point set. Point sets with small discrepancy are called low-discrepancy +and are known to efficiently fill the space in a uniform manner. +Low-discrepancy points play a central role in many problems in science and +engineering, including numerical integration, computer vision, machine +perception, computer graphics, machine learning, and simulation. In this work, +we present the first machine learning approach to generate a new class of +low-discrepancy point sets named Message-Passing Monte Carlo (MPMC) points. +Motivated by the geometric nature of generating low-discrepancy point sets, we +leverage tools from Geometric Deep Learning and base our model on Graph Neural +Networks. We further provide an extension of our framework to higher +dimensions, which flexibly allows the generation of custom-made points that +emphasize the uniformity in specific dimensions that are primarily important +for the particular problem at hand. Finally, we demonstrate that our proposed +model achieves state-of-the-art performance superior to previous methods by a +significant margin. In fact, MPMC points are empirically shown to be either +optimal or near-optimal with respect to the discrepancy for low dimension and +small number of points, i.e., for which the optimal discrepancy can be +determined. Code for generating MPMC points can be found at +https://github.com/tk-rusch/MPMC. + +
+
+ comment: Published in Proceedings of the National Academy of Sciences (PNAS): + https://www.pnas.org/doi/10.1073/pnas.2409913121 +
+
+
+
+
+ + ♻ ☆ TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with + Tabular Node Features + + +
+ Tabular machine learning is an important field for industry and science. In +this field, table rows are usually treated as independent data samples, but +additional information about relations between them is sometimes available and +can be used to improve predictive performance. Such information can be +naturally modeled with a graph, thus tabular machine learning may benefit from +graph machine learning methods. However, graph machine learning models are +typically evaluated on datasets with homogeneous node features, which have +little in common with heterogeneous mixtures of numerical and categorical +features present in tabular datasets. Thus, there is a critical difference +between the data used in tabular and graph machine learning studies, which does +not allow one to understand how successfully graph models can be transferred to +tabular data. To bridge this gap, we propose a new benchmark of diverse graphs +with heterogeneous tabular node features and realistic prediction tasks. We use +this benchmark to evaluate a vast set of models, including simple methods +previously overlooked in the literature. Our experiments show that graph neural +networks (GNNs) can indeed often bring gains in predictive performance for +tabular data, but standard tabular models also can be adapted to work with +graph data by using simple feature preprocessing, which sometimes enables them +to compete with and even outperform GNNs. Based on our empirical study, we +provide insights for researchers and practitioners in both tabular and graph +machine learning fields. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Framework for Evaluating API-oriented Code Generation in + Large Language Models + + +
+ Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as +powerful tools for code generation, significantly enhancing productivity and +accelerating software development. However, existing benchmarks primarily focus +on general code generation without considering API-oriented code generation, +i.e., generating code that invokes APIs from specific libraries. Given the +growing demand for API-oriented code generation, there is a pressing need for a +systematic and automated approach to evaluate LLM on API-oriented code +generation. To address this gap, we propose AutoAPIEval, a lightweight and +automated framework designed to evaluate the capabilities of LLMs in +API-oriented code generation. Our framework works with any library that +provides API documentation and focuses on two unit tasks: API recommendation +and code example generation, along with four metrics to evaluate the generated +APIs and code examples, such as the proportion of incorrect API recommendations +for Task 1, and the proportion of code examples where no specific API is +invoked and uncompilable/unexecutable code examples for Task 2. In addition, we +conducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder) +and Java Runtime Environment 8 to demonstrate the framework's effectiveness. +Our findings reveal substantial variability in LLM performance across tasks, +with ChatGPT adhering better to instructions, while sharing similar +effectiveness in code example generation with its counterparts (i.e., MagiCoder +and DeekSeek Coder). We also identify key factors associated with code quality, +such as API popularity and model confidence, and build classifiers that achieve +high accuracy in detecting incorrect API recommendations and erroneous code +examples. Retrieval-augmented generation enhances the quality of code generated +by LLMs, though its effectiveness varies across different LLMs. + +
+
+
+
+
+ + ♻ ☆ Machine Learning for Two-Sample Testing under Right-Censored Data: A + Simulation Study + + +
+ The focus of this study is to evaluate the effectiveness of Machine Learning +(ML) methods for two-sample testing with right-censored observations. To +achieve this, we develop several ML-based methods with varying architectures +and implement them as two-sample tests. Each method is an ensemble (stacking) +that combines predictions from classical two-sample tests. This paper presents +the results of training the proposed ML methods, examines their statistical +power compared to classical two-sample tests, analyzes the null distribution of +the proposed methods when the null hypothesis is true, and evaluates the +significance of the features incorporated into the proposed methods. In total, +this work covers 18 methods for two-sample testing under right-censored +observations, including the proposed methods and classical well-studied +two-sample tests. All results from numerical experiments were obtained from a +synthetic dataset generated using the inverse transform sampling method and +replicated multiple times through Monte Carlo simulation. To test the +two-sample problem with right-censored observations, one can use the proposed +two-sample methods (scripts, dataset, and models are available on GitHub and +Hugging Face). + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Faster Randomized Methods for Orthogonality Constrained Problems + + +
+ Recent literature has advocated the use of randomized methods for +accelerating the solution of various matrix problems arising throughout data +science and computational science. One popular strategy for leveraging +randomization is to use it as a way to reduce problem size. However, methods +based on this strategy lack sufficient accuracy for some applications. +Randomized preconditioning is another approach for leveraging randomization, +which provides higher accuracy. The main challenge in using randomized +preconditioning is the need for an underlying iterative method, thus randomized +preconditioning so far have been applied almost exclusively to solving +regression problems and linear systems. In this article, we show how to expand +the application of randomized preconditioning to another important set of +problems prevalent across data science: optimization problems with +(generalized) orthogonality constraints. We demonstrate our approach, which is +based on the framework of Riemannian optimization and Riemannian +preconditioning, on the problem of computing the dominant canonical +correlations and on the Fisher linear discriminant analysis problem. For both +problems, we evaluate the effect of preconditioning on the computational costs +and asymptotic convergence, and demonstrate empirically the utility of our +approach. + +
+
+
+
+
+ + ♻ Discrete, compositional, and symbolic representations through attractor + dynamics + + +
+ Symbolic systems are powerful frameworks for modeling cognitive processes as +they encapsulate the rules and relationships fundamental to many aspects of +human reasoning and behavior. Central to these models are systematicity, +compositionality, and productivity, making them invaluable in both cognitive +science and artificial intelligence. However, certain limitations remain. For +instance, the integration of structured symbolic processes and latent +sub-symbolic processes has been implemented at the computational level through +fiat methods such as quantization or softmax sampling, which assume, rather +than derive, the operations underpinning discretization and symbolicization. In +this work, we introduce a novel neural stochastic dynamical systems model that +integrates attractor dynamics with symbolic representations to model cognitive +processes akin to the probabilistic language of thought (PLoT). Our model +segments the continuous representational space into discrete basins, with +attractor states corresponding to symbolic sequences, that reflect the +semanticity and compositionality characteristic of symbolic systems through +unsupervised learning, rather than relying on pre-defined primitives. Moreover, +like PLoT, our model learns to sample a diverse distribution of attractor +states that reflect the mutual information between the input data and the +symbolic encodings. This approach establishes a unified framework that +integrates both symbolic and sub-symbolic processing through neural dynamics, a +neuro-plausible substrate with proven expressivity in AI, offering a more +comprehensive model that mirrors the complex duality of cognitive operations. + +
+
+
+
+
+ + ♻ ☆ ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot + Coordination NeurIPS 2024 + + +
+ Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement +learning (MARL) challenge that aims to train an ego agent to work with diverse, +unseen partners during deployment. The significant difference between the +deployment-time partners' distribution and the training partners' distribution +determined by the training algorithm makes ZSC a unique out-of-distribution +(OOD) generalization challenge. The potential distribution gap between +evaluation and deployment-time partners leads to inadequate evaluation, which +is exacerbated by the lack of appropriate evaluation metrics. In this paper, we +present ZSC-Eval, the first evaluation toolkit and benchmark for ZSC +algorithms. ZSC-Eval consists of: 1) Generation of evaluation partner +candidates through behavior-preferring rewards to approximate deployment-time +partners' distribution; 2) Selection of evaluation partners by Best-Response +Diversity (BR-Div); 3) Measurement of generalization performance with various +evaluation partners via the Best-Response Proximity (BR-Prox) metric. We use +ZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football +environments and get novel empirical findings. We also conduct a human +experiment of current ZSC algorithms to verify the ZSC-Eval's consistency with +human evaluation. ZSC-Eval is now available at +https://github.com/sjtu-marl/ZSC-Eval. + +
+
+ comment: Accepted in NeurIPS 2024 Dataset and Benchmark Track +
+
+
+
+
+ + ♻ ☆ Strategic Linear Contextual Bandits NeurIPS 2024 + + +
+ Motivated by the phenomenon of strategic agents gaming a recommender system +to maximize the number of times they are recommended to users, we study a +strategic variant of the linear contextual bandit problem, where the arms can +strategically misreport privately observed contexts to the learner. We treat +the algorithm design problem as one of mechanism design under uncertainty and +propose the Optimistic Grim Trigger Mechanism (OptGTM) that incentivizes the +agents (i.e., arms) to report their contexts truthfully while simultaneously +minimizing regret. We also show that failing to account for the strategic +nature of the agents results in linear regret. However, a trade-off between +mechanism design and regret minimization appears to be unavoidable. More +broadly, this work aims to provide insight into the intersection of online +learning and mechanism design. + +
+
+ comment: To appear at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion-based Generative Image Outpainting for Recovery of + FOV-Truncated CT Images + + +
+ Field-of-view (FOV) recovery of truncated chest CT scans is crucial for +accurate body composition analysis, which involves quantifying skeletal muscle +and subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables +disease prognostication. Here, we present a method for recovering truncated CT +slices using generative image outpainting. We train a diffusion model and apply +it to truncated CT slices generated by simulating a small FOV. Our model +reliably recovers the truncated anatomy and outperforms the previous +state-of-the-art despite being trained on 87% less data. + +
+
+ comment: Shared last authorship: Florian J. Fintelmann and Philip M\"uller +
+
+
+
+
+ + ♻ ☆ Characterizing stable regions in the residual stream of LLMs + + +
+ We identify "stable regions" in the residual stream of Transformers, where +the model's output remains insensitive to small activation changes, but +exhibits high sensitivity at region boundaries. These regions emerge during +training and become more defined as training progresses or model size +increases. The regions appear to be much larger than previously studied +polytopes. Our analysis suggests that these stable regions align with semantic +distinctions, where similar prompts cluster within regions, and activations +from the same region lead to similar next token predictions. This work provides +a promising research direction for understanding the complexity of neural +networks, shedding light on training dynamics, and advancing interpretability. + +
+
+
+
+
+ + ♻ ☆ Learning Constrained Markov Decision Processes With Non-stationary + Rewards and Constraints + + +
+ In constrained Markov decision processes (CMDPs) with adversarial rewards and +constraints, a well-known impossibility result prevents any algorithm from +attaining both sublinear regret and sublinear constraint violation, when +competing against a best-in-hindsight policy that satisfies constraints on +average. In this paper, we show that this negative result can be eased in CMDPs +with non-stationary rewards and constraints, by providing algorithms whose +performances smoothly degrade as non-stationarity increases. Specifically, we +propose algorithms attaining $\tilde{\mathcal{O}} (\sqrt{T} + C)$ regret and +positive constraint violation under bandit feedback, where $C$ is a corruption +value measuring the environment non-stationarity. This can be $\Theta(T)$ in +the worst case, coherently with the impossibility result for adversarial CMDPs. +First, we design an algorithm with the desired guarantees when $C$ is known. +Then, in the case $C$ is unknown, we show how to obtain the same results by +embedding such an algorithm in a general meta-procedure. This is of independent +interest, as it can be applied to any non-stationary constrained online +learning setting. + +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ What happens to diffusion model likelihood when your model is + conditional? + + +
+ Diffusion Models (DMs) iteratively denoise random samples to produce +high-quality data. The iterative sampling process is derived from Stochastic +Differential Equations (SDEs), allowing a speed-quality trade-off chosen at +inference. Another advantage of sampling with differential equations is exact +likelihood computation. These likelihoods have been used to rank unconditional +DMs and for out-of-domain classification. Despite the many existing and +possible uses of DM likelihoods, the distinct properties captured are unknown, +especially in conditional contexts such as Text-To-Image (TTI) or +Text-To-Speech synthesis (TTS). Surprisingly, we find that TTS DM likelihoods +are agnostic to the text input. TTI likelihood is more expressive but cannot +discern confounding prompts. Our results show that applying DMs to conditional +tasks reveals inconsistencies and strengthens claims that the properties of DM +likelihood are unknown. This impact sheds light on the previously unknown +nature of DM likelihoods. Although conditional DMs maximise likelihood, the +likelihood in question is not as sensitive to the conditioning input as one +expects. This investigation provides a new point-of-view on diffusion +likelihoods. + +
+
+
+
+
+ + ♻ ☆ Explainable AI needs formal notions of explanation correctness + + +
+ The use of machine learning (ML) in critical domains such as medicine poses +risks and requires regulation. One requirement is that decisions of ML systems +in high-risk applications should be human-understandable. The field of +"explainable artificial intelligence" (XAI) seemingly addresses this need. +However, in its current form, XAI is unfit to provide quality control for ML; +it itself needs scrutiny. Popular XAI methods cannot reliably answer important +questions about ML models, their training data, or a given test input. We +recapitulate results demonstrating that popular XAI methods systematically +attribute importance to input features that are independent of the prediction +target. This limits their utility for purposes such as model and data +(in)validation, model improvement, and scientific discovery. We argue that the +fundamental reason for this limitation is that current XAI methods do not +address well-defined problems and are not evaluated against objective criteria +of explanation correctness. Researchers should formally define the problems +they intend to solve first and then design methods accordingly. This will lead +to notions of explanation correctness that can be theoretically verified and +objective metrics of explanation performance that can be assessed using +ground-truth data. + +
+
+
+
+
+ + ♻ ☆ Efficient Combinatorial Optimization via Heat Diffusion NeurIPS 2024 + + +
+ Combinatorial optimization problems are widespread but inherently challenging +due to their discrete nature. The primary limitation of existing methods is +that they can only access a small fraction of the solution space at each +iteration, resulting in limited efficiency for searching the global optimal. To +overcome this challenge, diverging from conventional efforts of expanding the +solver's search scope, we focus on enabling information to actively propagate +to the solver through heat diffusion. By transforming the target function while +preserving its optima, heat diffusion facilitates information flow from distant +regions to the solver, providing more efficient navigation. Utilizing heat +diffusion, we propose a framework for solving general combinatorial +optimization problems. The proposed methodology demonstrates superior +performance across a range of the most challenging and widely encountered +combinatorial optimizations. Echoing recent advancements in harnessing +thermodynamics for generative artificial intelligence, our study further +reveals its significant potential in advancing combinatorial optimization. + +
+
+ comment: After the rebuttal version for NeurIPS 2024 (poster). Code is + available in https://github.com/AwakerMhy/HeO +
+
+
+
+
+ + ♻ ☆ Learning to Receive Help: Intervention-Aware Concept Embedding Models NeurIPS 2023 + + +
+ Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures +by constructing and explaining their predictions using a set of high-level +concepts. A special property of these models is that they permit concept +interventions, wherein users can correct mispredicted concepts and thus improve +the model's performance. Recent work, however, has shown that intervention +efficacy can be highly dependent on the order in which concepts are intervened +on and on the model's architecture and training hyperparameters. We argue that +this is rooted in a CBM's lack of train-time incentives for the model to be +appropriately receptive to concept interventions. To address this, we propose +Intervention-aware Concept Embedding models (IntCEMs), a novel CBM-based +architecture and training paradigm that improves a model's receptiveness to +test-time interventions. Our model learns a concept intervention policy in an +end-to-end fashion from where it can sample meaningful intervention +trajectories at train-time. This conditions IntCEMs to effectively select and +receive concept interventions when deployed at test-time. Our experiments show +that IntCEMs significantly outperform state-of-the-art concept-interpretable +models when provided with test-time concept interventions, demonstrating the +effectiveness of our approach. + +
+
+ comment: Accepted as a spotlight at the Thirty-seventh Conference on Neural + Information Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ♻ ☆ Neural Exploratory Landscape Analysis + + +
+ Recent research in Meta-Black-Box Optimization (MetaBBO) have shown that +meta-trained neural networks can effectively guide the design of black-box +optimizers, significantly reducing the need for expert tuning and delivering +robust performance across complex problem distributions. Despite their success, +a paradox remains: MetaBBO still rely on human-crafted Exploratory Landscape +Analysis features to inform the meta-level agent about the low-level +optimization progress. To address the gap, this paper proposes Neural +Exploratory Landscape Analysis (NeurELA), a novel framework that dynamically +profiles landscape features through a two-stage, attention-based neural +network, executed in an entirely end-to-end fashion. NeurELA is pre-trained +over a variety of MetaBBO algorithms using a multi-task neuroevolution +strategy. Extensive experiments show that NeurELA achieves consistently +superior performance when integrated into different and even unseen MetaBBO +tasks and can be efficiently fine-tuned for further performance boost. This +advancement marks a pivotal step in making MetaBBO algorithms more autonomous +and broadly applicable.The source code of NeurELA can be accessed at +https://anonymous.4open.science/r/Neur-ELA-303C. + +
+
+
+
+
+ + ♻ ☆ Unsupervisedly Learned Representations: Should the Quest be Over? + + +
+ After four decades of research there still exists a Classification accuracy +gap of about 20% between our best Unsupervisedly Learned Representations +methods and the accuracy rates achieved by intelligent animals. It thus may +well be that we are looking in the wrong direction. A possible solution to this +puzzle is presented. We demonstrate that Reinforcement Learning can learn +representations which achieve the same accuracy as that of animals. Our main +modest contribution lies in the observations that: a. when applied to a real +world environment Reinforcement Learning does not require labels, and thus may +be legitimately considered as Unsupervised Learning, and b. in contrast, when +Reinforcement Learning is applied in a simulated environment it does inherently +require labels and should thus be generally be considered as Supervised +Learning. The corollary of these observations is that further search for +Unsupervised Learning competitive paradigms which may be trained in simulated +environments may be futile. + +
+
+ comment: To be published at The 6th International Conference on Machine + Learning, Optimization and Data Science - LOD 2020 +
+
+
+
+
+ + ♻ ☆ Exploring Selective Layer Fine-Tuning in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for fine-tuning +foundation models using distributed data in a privacy-preserving manner. Under +limited computational resources, clients often find it more practical to +fine-tune a selected subset of layers, rather than the entire model, based on +their task-specific data. In this study, we provide a thorough theoretical +exploration of selective layer fine-tuning in FL, emphasizing a flexible +approach that allows the clients to adjust their selected layers according to +their local data and resources. We theoretically demonstrate that the layer +selection strategy has a significant impact on model convergence in two +critical aspects: the importance of selected layers and the heterogeneous +choices across clients. Drawing from these insights, we further propose a +strategic layer selection method that utilizes local gradients and regulates +layer selections across clients. The extensive experiments on both image and +text datasets demonstrate the effectiveness of the proposed strategy compared +with several baselines, highlighting its advances in identifying critical +layers that adapt to the client heterogeneity and training dynamics in FL. + +
+
+
+
+
+ + ♻ ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing sequences of interactions between users and items, sequential +recommendation models can learn user intent and make predictions about the next +item. Next to item interactions, most systems also have interactions with what +we call non-item pages: these pages are not related to specific items but still +can provide insights of the user's interests, as, for example, navigation +pages. + We therefore propose a general way to include these non-item pages in +sequential recommendation models to enhance next-item prediction. First, we +demonstrate the influence of non-item pages on following interactions with the +hypotheses testing framework HypTrails and propose methods for representing +non-item pages in sequential recommendation models. Subsequently, we adapt +popular sequential recommender models to integrate non-item pages and +investigate their performance with different item representation strategies as +well as their ability to handle noisy data. To show the general capabilities of +the models to integrate non-item pages, we create a synthetic dataset for a +controlled setting and then evaluate the improvements from including non-item +pages on two real-world datasets. + Our results show that non-item pages are a valuable source of information, +and incorporating them in sequential recommendation models increases the +performance of next-item prediction across all analyzed model architectures. + +
+
+ comment: 37 pages, 19 figures; Submitted to ACM TORS +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ dlordinal: a Python package for deep ordinal classification + + +
+ dlordinal is a new Python library that unifies many recent deep ordinal +classification methodologies available in the literature. Developed using +PyTorch as underlying framework, it implements the top performing +state-of-the-art deep learning techniques for ordinal classification problems. +Ordinal approaches are designed to leverage the ordering information present in +the target variable. Specifically, it includes loss functions, various output +layers, dropout techniques, soft labelling methodologies, and other +classification strategies, all of which are appropriately designed to +incorporate the ordinal information. Furthermore, as the performance metrics to +assess novel proposals in ordinal classification depend on the distance between +target and predicted classes in the ordinal scale, suitable ordinal evaluation +metrics are also included. dlordinal is distributed under the BSD-3-Clause +license and is available at https://github.com/ayrna/dlordinal. + +
+
+
+
+
+ + ♻ ☆ Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable + Tensor Collections + + +
+ Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining +data, model, and pipeline parallelism, to use large GPU clusters efficiently. +Long-running jobs may experience changes to their GPU allocation: (i) resource +elasticity during training adds or removes GPUs; (ii) hardware maintenance may +require redeployment on different GPUs; and (iii) GPU failures force jobs to +run with fewer devices. Current DL frameworks tie jobs to a set of GPUs and +thus lack support for these scenarios. In particular, they cannot change the +multi-dimensional parallelism of an already-running job in an efficient and +model-independent way. + We describe Scalai, a state management library for DL systems that enables +jobs to change their parallelism dynamically after the GPU allocation is +updated at runtime. Scalai achieves this through a new abstraction, a +parallelizable tensor collection (PTC), that externalizes the job state during +training. After a GPU change, Scalai uses the PTC to transform the job state: +the PTC repartitions the dataset state under data parallelism and exposes it to +DL workers through a virtual file system; and the PTC obtains the model state +as partitioned checkpoints and transforms them to reflect the new +parallelization configuration. For efficiency, Scalai executes PTC +transformations in parallel with minimum data movement between workers. Our +experiments show that Scalai enables DL jobs to support dynamic parallelization +with low overhead. + +
+
+ comment: The 30th Symposium on Operating Systems Principles (SOSP24) +
+
+
+
+
+ + ♻ ☆ IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided + Feature Extraction + + +
+ Disruption prediction has made rapid progress in recent years, especially in +machine learning (ML)-based methods. Understanding why a predictor makes a +certain prediction can be as crucial as the prediction's accuracy for future +tokamak disruption predictors. The purpose of most disruption predictors is +accuracy or cross-machine capability. However, if a disruption prediction model +can be interpreted, it can tell why certain samples are classified as +disruption precursors. This allows us to tell the types of incoming disruption +and gives us insight into the mechanism of disruption. This paper designs a +disruption predictor called Interpretable Disruption Predictor based On +Physics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction +performance of the model is effectively improved by extracting physics-guided +features. A high-performance model is required to ensure the validity of the +interpretation results. The interpretability study of IDP-PGFE provides an +understanding of J-TEXT disruption and is generally consistent with existing +comprehension of disruption. IDP-PGFE has been applied to the disruption due to +continuously increasing density towards density limit experiments on J-TEXT. +The time evolution of the PGFE features contribution demonstrates that the +application of ECRH triggers radiation-caused disruption, which lowers the +density at disruption. While the application of RMP indeed raises the density +limit in J-TEXT. The interpretability study guides intuition on the physical +mechanisms of density limit disruption that RMPs affect not only the MHD +instabilities but also the radiation profile, which delays density limit +disruption. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous + Federated Learning Framework + + +
+ Traditional federated learning (FL) frameworks rely heavily on terrestrial +networks, where coverage limitations and increasing bandwidth congestion +significantly hinder model convergence. Fortunately, the advancement of +low-Earth orbit (LEO) satellite networks offers promising new communication +avenues to augment traditional terrestrial FL. Despite this potential, the +limited satellite-ground communication bandwidth and the heterogeneous +operating environments of ground devices-including variations in data, +bandwidth, and computing power-pose substantial challenges for effective and +robust satellite-assisted FL. To address these challenges, we propose SatFed, a +resource-efficient satellite-assisted heterogeneous FL framework. SatFed +implements freshness-based model prioritization queues to optimize the use of +highly constrained satellite-ground bandwidth, ensuring the transmission of the +most critical models. Additionally, a multigraph is constructed to capture +real-time heterogeneous relationships between devices, including data +distribution, terrestrial bandwidth, and computing capability. This multigraph +enables SatFed to aggregate satellite-transmitted models into peer guidance, +enhancing local training in heterogeneous environments. Extensive experiments +with real-world LEO satellite networks demonstrate that SatFed achieves +superior performance and robustness compared to state-of-the-art benchmarks. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ VARADE: a Variational-based AutoRegressive model for Anomaly Detection + on the Edge + + +
+ Detecting complex anomalies on massive amounts of data is a crucial task in +Industry 4.0, best addressed by deep learning. However, available solutions are +computationally demanding, requiring cloud architectures prone to latency and +bandwidth issues. This work presents VARADE, a novel solution implementing a +light autoregressive framework based on variational inference, which is best +suited for real-time execution on the edge. The proposed approach was validated +on a robotic arm, part of a pilot production line, and compared with several +state-of-the-art algorithms, obtaining the best trade-off between anomaly +detection accuracy, power consumption and inference frequency on two different +edge platforms. + +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems play a crucial role in achieving Sustainable +Development Goals by collecting and analyzing vital global data through +satellite networks. These systems are essential for tasks like mapping, +disaster monitoring, and resource management, but they face challenges in +processing and transmitting large volumes of EO data, especially in specialized +fields such as agriculture and real-time disaster response. Domain-adapted +Large Language Models (LLMs) provide a promising solution by facilitating data +fusion between extensive EO data and semantic EO data. By improving integration +and interpretation of diverse datasets, LLMs address the challenges of +processing specialized information in agriculture and disaster response +applications. This fusion enhances the accuracy and relevance of transmitted +data. This paper presents a framework for semantic communication in EO +satellite networks, aimed at improving data transmission efficiency and overall +system performance through cognitive processing techniques. The proposed system +employs Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic +Data Augmentation (SA) to focus on relevant information while minimizing +communication overhead. By integrating cognitive semantic processing and +inter-satellite links, the framework enhances the analysis and transmission of +multispectral satellite imagery, improving object detection, pattern +recognition, and real-time decision-making. The introduction of Cognitive +Semantic Augmentation (CSA) allows satellites to process and transmit semantic +information, boosting adaptability to changing environments and application +needs. This end-to-end architecture is tailored for next-generation satellite +networks, such as those supporting 6G, and demonstrates significant +improvements in efficiency and accuracy. + +
+
+ comment: 18 pages, 10 figures, magazine +
+
+
+
+
+ + ♻ ☆ Realising Synthetic Active Inference Agents, Part II: Variational + Message Updates + + +
+ The Free Energy Principle (FEP) describes (biological) agents as minimising a +variational Free Energy (FE) with respect to a generative model of their +environment. Active Inference (AIF) is a corollary of the FEP that describes +how agents explore and exploit their environment by minimising an expected FE +objective. In two related papers, we describe a scalable, epistemic approach to +synthetic AIF, by message passing on free-form Forney-style Factor Graphs +(FFGs). A companion paper (part I) introduces a Constrained FFG (CFFG) notation +that visually represents (generalised) FE objectives for AIF. The current paper +(part II) derives message passing algorithms that minimise (generalised) FE +objectives on a CFFG by variational calculus. A comparison between simulated +Bethe and generalised FE agents illustrates how the message passing approach to +synthetic AIF induces epistemic behaviour on a T-maze navigation task. +Extension of the T-maze simulation to 1) learning goal statistics, and 2) a +multi-agent bargaining setting, illustrate how this approach encourages reuse +of nodes and updates in alternative settings. With a full message passing +account of synthetic AIF agents, it becomes possible to derive and reuse +message updates across models and move closer to industrial applications of +synthetic AIF. + +
+
+
+
+
+ + ♻ ☆ EPTQ: Enhanced Post-Training Quantization via Hessian-guided + Network-wise Optimization + + +
+ Quantization is a key method for deploying deep neural networks on edge +devices with limited memory and computation resources. Recent improvements in +Post-Training Quantization (PTQ) methods were achieved by an additional local +optimization process for learning the weight quantization rounding policy. +However, a gap exists when employing network-wise optimization with small +representative datasets. In this paper, we propose a new method for enhanced +PTQ (EPTQ) that employs a network-wise quantization optimization process, which +benefits from considering cross-layer dependencies during optimization. EPTQ +enables network-wise optimization with a small representative dataset using a +novel sample-layer attention score based on a label-free Hessian matrix upper +bound. The label-free approach makes our method suitable for the PTQ scheme. We +give a theoretical analysis for the said bound and use it to construct a +knowledge distillation loss that guides the optimization to focus on the more +sensitive layers and samples. In addition, we leverage the Hessian upper bound +to improve the weight quantization parameters selection by focusing on the more +sensitive elements in the weight tensors. Empirically, by employing EPTQ we +achieve state-of-the-art results on various models, tasks, and datasets, +including ImageNet classification, COCO object detection, and Pascal-VOC for +semantic segmentation. + +
+
+
+
+
+ + ♻ ☆ Recurrent Stochastic Configuration Networks for Temporal Data Analytics + + +
+ Temporal data modelling techniques with neural networks are useful in many +domain applications, including time-series forecasting and control engineering. +This paper aims at developing a recurrent version of stochastic configuration +networks (RSCNs) for problem solving, where we have no underlying assumption on +the dynamic orders of the input variables. Given a collection of historical +data, we first build an initial RSCN model in the light of a supervisory +mechanism, followed by an online update of the output weights by using a +projection algorithm. Some theoretical results are established, including the +echo state property, the universal approximation property of RSCNs for both the +offline and online learnings, and the convergence of the output weights. The +proposed RSCN model is remarkably distinguished from the well-known echo state +networks (ESNs) in terms of the way of assigning the input random weight matrix +and a special structure of the random feedback matrix. A comprehensive +comparison study among the long short-term memory (LSTM) network, the original +ESN, and several state-of-the-art ESN methods such as the simple cycle +reservoir (SCR), the polynomial ESN (PESN), the leaky-integrator ESN (LIESN) +and RSCN is carried out. Numerical results clearly indicate that the proposed +RSCN performs favourably over all of the datasets. + +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs + + +
+ Training large language models (LLMs) for external tool usage is a rapidly +expanding field, with recent research focusing on generating synthetic data to +address the shortage of available data. However, the absence of systematic data +quality checks poses complications for properly training and testing models. To +that end, we propose two approaches for assessing the reliability of data for +training LLMs to use external tools. The first approach uses intuitive, +human-defined correctness criteria. The second approach uses a model-driven +assessment with in-context evaluation. We conduct a thorough evaluation of data +quality on two popular benchmarks, followed by an extrinsic evaluation that +showcases the impact of data quality on model performance. Our results +demonstrate that models trained on high-quality data outperform those trained +on unvalidated data, even when trained with a smaller quantity of data. These +findings empirically support the significance of assessing and ensuring the +reliability of training data for tool-using LLMs. + +
+
+
+
+
+ + ♻ ☆ Hybrid Spiking Neural Networks for Low-Power Intra-Cortical + Brain-Machine Interfaces + + +
+ Intra-cortical brain-machine interfaces (iBMIs) have the potential to +dramatically improve the lives of people with paraplegia by restoring their +ability to perform daily activities. However, current iBMIs suffer from +scalability and mobility limitations due to bulky hardware and wiring. Wireless +iBMIs offer a solution but are constrained by a limited data rate. To overcome +this challenge, we are investigating hybrid spiking neural networks for +embedded neural decoding in wireless iBMIs. The networks consist of a temporal +convolution-based compression followed by recurrent processing and a final +interpolation back to the original sequence length. As recurrent units, we +explore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons, +and a combination of both - spiking GRUs (sGRUs) and analyze their differences +in terms of accuracy, footprint, and activation sparsity. To that end, we train +decoders on the "Nonhuman Primate Reaching with Multichannel Sensorimotor +Cortex Electrophysiology" dataset and evaluate it using the NeuroBench +framework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural +Decoding. Our approach achieves high accuracy in predicting velocities of +primate reaching movements from multichannel primary motor cortex recordings +while maintaining a low number of synaptic operations, surpassing the current +baseline models in the NeuroBench framework. This work highlights the potential +of hybrid neural networks to facilitate wireless iBMIs with high decoding +precision and a substantial increase in the number of monitored neurons, paving +the way toward more advanced neuroprosthetic technologies. + +
+
+ comment: This work has been accepted at the 2024 IEEE Biomedical Circuits and + Systems Conference +
+
+
+
+
+ + ♻ ☆ Improving Fast Adversarial Training Paradigm: An Example Taxonomy + Perspective + + +
+ While adversarial training is an effective defense method against adversarial +attacks, it notably increases the training cost. To this end, fast adversarial +training (FAT) is presented for efficient training and has become a hot +research topic. However, FAT suffers from catastrophic overfitting, which leads +to a performance drop compared with multi-step adversarial training. However, +the cause of catastrophic overfitting remains unclear and lacks exploration. In +this paper, we present an example taxonomy in FAT, which identifies that +catastrophic overfitting is caused by the imbalance between the inner and outer +optimization in FAT. Furthermore, we investigated the impact of varying degrees +of training loss, revealing a correlation between training loss and +catastrophic overfitting. Based on these observations, we redesign the loss +function in FAT with the proposed dynamic label relaxation to concentrate the +loss range and reduce the impact of misclassified examples. Meanwhile, we +introduce batch momentum initialization to enhance the diversity to prevent +catastrophic overfitting in an efficient manner. Furthermore, we also propose +Catastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate +training strategy for examples based on their loss degree. Our proposed method, +named example taxonomy aware FAT (ETA), establishes an improved paradigm for +FAT. Experiment results demonstrate our ETA achieves state-of-the-art +performance. Comprehensive experiments on four standard datasets demonstrate +the competitiveness of our proposed method. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ GlycanML: A Multi-Task and Multi-Structure Benchmark for Glycan Machine + Learning + + +
+ Glycans are basic biomolecules and perform essential functions within living +organisms. The rapid increase of functional glycan data provides a good +opportunity for machine learning solutions to glycan understanding. However, +there still lacks a standard machine learning benchmark for glycan function +prediction. In this work, we fill this blank by building a comprehensive +benchmark for Glycan Machine Learning (GlycanML). The GlycanML benchmark +consists of diverse types of tasks including glycan taxonomy prediction, glycan +immunogenicity prediction, glycosylation type prediction, and protein-glycan +interaction prediction. Glycans can be represented by both sequences and graphs +in GlycanML, which enables us to extensively evaluate sequence-based models and +graph neural networks (GNNs) on benchmark tasks. Furthermore, by concurrently +performing eight glycan taxonomy prediction tasks, we introduce the +GlycanML-MTL testbed for multi-task learning (MTL) algorithms. Experimental +results show the superiority of modeling glycans with multi-relational GNNs, +and suitable MTL methods can further boost model performance. We provide all +datasets and source codes at https://github.com/GlycanML/GlycanML and maintain +a leaderboard at https://GlycanML.github.io/project + +
+
+ comment: Research project paper. All code and data are released +
+
+
+
+
+ + ♻ ☆ Understanding the Expressivity and Trainability of Fourier Neural + Operator: A Mean-Field Perspective + + +
+ In this paper, we explores the expressivity and trainability of the Fourier +Neural Operator (FNO). We establish a mean-field theory for the FNO, analyzing +the behavior of the random FNO from an edge of chaos perspective. Our +investigation into the expressivity of a random FNO involves examining the +ordered-chaos phase transition of the network based on the weight distribution. +This phase transition demonstrates characteristics unique to the FNO, induced +by mode truncation, while also showcasing similarities to those of densely +connected networks. Furthermore, we identify a connection between expressivity +and trainability: the ordered and chaotic phases correspond to regions of +vanishing and exploding gradients, respectively. This finding provides a +practical prerequisite for the stable training of the FNO. Our experimental +results corroborate our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ INT-FlashAttention: Enabling Flash Attention for INT8 Quantization + + +
+ As the foundation of large language models (LLMs), self-attention module +faces the challenge of quadratic time and memory complexity with respect to +sequence length. FlashAttention accelerates attention computation and reduces +its memory usage by leveraging the GPU memory hierarchy. A promising research +direction is to integrate FlashAttention with quantization methods. This paper +introduces INT-FlashAttention, the first INT8 quantization architecture +compatible with the forward workflow of FlashAttention, which significantly +improves the inference speed of FlashAttention on Ampere GPUs. We implement our +INT-FlashAttention prototype with fully INT8 activations and general +matrix-multiplication (GEMM) kernels, making it the first attention operator +with fully INT8 input. As a general token-level post-training quantization +framework, INT-FlashAttention is also compatible with other data formats like +INT4, etc. Experimental results show INT-FlashAttention achieves 72% faster +inference speed and 82% smaller quantization error compared to standard +FlashAttention with FP16 and FP8 data format. + +
+
+
+
+
+ + ♻ ☆ Bivariate DeepKriging for Large-scale Spatial Interpolation of Wind + Fields + + +
+ High spatial resolution wind data are essential for a wide range of +applications in climate, oceanographic and meteorological studies. Large-scale +spatial interpolation or downscaling of bivariate wind fields having velocity +in two dimensions is a challenging task because wind data tend to be +non-Gaussian with high spatial variability and heterogeneity. In spatial +statistics, cokriging is commonly used for predicting bivariate spatial fields. +However, the cokriging predictor is not optimal except for Gaussian processes. +Additionally, cokriging is computationally prohibitive for large datasets. In +this paper, we propose a method, called bivariate DeepKriging, which is a +spatially dependent deep neural network (DNN) with an embedding layer +constructed by spatial radial basis functions for bivariate spatial data +prediction. We then develop a distribution-free uncertainty quantification +method based on bootstrap and ensemble DNN. Our proposed approach outperforms +the traditional cokriging predictor with commonly used covariance functions, +such as the linear model of co-regionalization and flexible bivariate Mat\'ern +covariance. We demonstrate the computational efficiency and scalability of the +proposed DNN model, with computations that are, on average, 20 times faster +than those of conventional techniques. We apply the bivariate DeepKriging +method to the wind data over the Middle East region at 506,771 locations. The +prediction performance of the proposed method is superior over the cokriging +predictors and dramatically reduces computation time. + +
+
+
+
+
+ + ♻ ☆ Learning Variable Compliance Control From a Few Demonstrations for + Bimanual Robot with Haptic Feedback Teleoperation System IROS 2024 + + +
+ Automating dexterous, contact-rich manipulation tasks using rigid robots is a +significant challenge in robotics. Rigid robots, defined by their actuation +through position commands, face issues of excessive contact forces due to their +inability to adapt to contact with the environment, potentially causing damage. +While compliance control schemes have been introduced to mitigate these issues +by controlling forces via external sensors, they are hampered by the need for +fine-tuning task-specific controller parameters. Learning from Demonstrations +(LfD) offers an intuitive alternative, allowing robots to learn manipulations +through observed actions. In this work, we introduce a novel system to enhance +the teaching of dexterous, contact-rich manipulations to rigid robots. Our +system is twofold: firstly, it incorporates a teleoperation interface utilizing +Virtual Reality (VR) controllers, designed to provide an intuitive and +cost-effective method for task demonstration with haptic feedback. Secondly, we +present Comp-ACT (Compliance Control via Action Chunking with Transformers), a +method that leverages the demonstrations to learn variable compliance control +from a few demonstrations. Our methods have been validated across various +complex contact-rich manipulation tasks using single-arm and bimanual robot +setups in simulated and real-world environments, demonstrating the +effectiveness of our system in teaching robots dexterous manipulations with +enhanced adaptability and safety. Code available at: +https://github.com/omron-sinicx/CompACT + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ Decentralised Variational Inference Frameworks for Multi-object Tracking + on Sensor Network + + +
+ This paper tackles the challenge of multi-sensor multi-object tracking by +proposing various decentralised Variational Inference (VI) schemes that match +the tracking performance of centralised sensor fusion with only local message +exchanges among neighboring sensors. We first establish a centralised VI sensor +fusion scheme as a benchmark and analyse the limitations of its decentralised +counterpart, which requires sensors to await consensus at each VI iteration. +Therefore, we propose a decentralised gradient-based VI framework that +optimises the Locally Maximised Evidence Lower Bound (LM-ELBO) instead of the +standard ELBO, which reduces the parameter search space and enables faster +convergence, making it particularly beneficial for decentralised tracking.This +proposed framework is inherently self-evolving, improving with advancements in +decentralised optimisation techniques for convergence guarantees and +efficiency. Further, we enhance the convergence speed of proposed decentralised +schemes using natural gradients and gradient tracking strategies. Results +verify that our decentralised VI schemes are empirically equivalent to +centralised fusion in tracking performance. Notably, the decentralised natural +gradient VI method is the most communication-efficient, with communication +costs comparable to suboptimal decentralised strategies while delivering +notably higher tracking accuracy. + +
+
+
+
+
+ + ♻ ☆ Trust-Region Sequential Quadratic Programming for Stochastic + Optimization with Random Models + + +
+ In this work, we consider solving optimization problems with a stochastic +objective and deterministic equality constraints. We propose a Trust-Region +Sequential Quadratic Programming method to find both first- and second-order +stationary points. Our method utilizes a random model to represent the +objective function, which is constructed from stochastic observations of the +objective and is designed to satisfy proper adaptive accuracy conditions with a +high but fixed probability. To converge to first-order stationary points, our +method computes a gradient step in each iteration defined by minimizing a +quadratic approximation of the objective subject to a (relaxed) linear +approximation of the problem constraints and a trust-region constraint. To +converge to second-order stationary points, our method additionally computes an +eigen step to explore the negative curvature of the reduced Hessian matrix, as +well as a second-order correction step to address the potential Maratos effect, +which arises due to the nonlinearity of the problem constraints. Such an effect +may impede the method from moving away from saddle points. Both gradient and +eigen step computations leverage a novel parameter-free decomposition of the +step and the trust-region radius, accounting for the proportions among the +feasibility residual, optimality residual, and negative curvature. We establish +global almost sure first- and second-order convergence guarantees for our +method, and present computational results on CUTEst problems, regression +problems, and saddle-point problems to demonstrate its superiority over +existing line-search-based stochastic methods. + +
+
+ comment: 41 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ CMamba: Channel Correlation Enhanced State Space Models for Multivariate + Time Series Forecasting + + +
+ Recent advancements in multivariate time series forecasting have been +propelled by Linear-based, Transformer-based, and Convolution-based models, +with Transformer-based architectures gaining prominence for their efficacy in +temporal and cross-channel mixing. More recently, Mamba, a state space model, +has emerged with robust sequence and feature mixing capabilities. However, the +suitability of the vanilla Mamba design for time series forecasting remains an +open question, particularly due to its inadequate handling of cross-channel +dependencies. Capturing cross-channel dependencies is critical in enhancing the +performance of multivariate time series prediction. Recent findings show that +self-attention excels in capturing cross-channel dependencies, whereas other +simpler mechanisms, such as MLP, may degrade model performance. This is +counterintuitive, as MLP, being a learnable architecture, should theoretically +capture both correlations and irrelevances, potentially leading to neutral or +improved performance. Diving into the self-attention mechanism, we attribute +the observed degradation in MLP performance to its lack of data dependence and +global receptive field, which result in MLP's lack of generalization ability. +Based on the above insights, we introduce a refined Mamba variant tailored for +time series forecasting. Our proposed model, \textbf{CMamba}, incorporates a +modified Mamba (M-Mamba) module for temporal dependencies modeling, a global +data-dependent MLP (GDD-MLP) to effectively capture cross-channel dependencies, +and a Channel Mixup mechanism to mitigate overfitting. Comprehensive +experiments conducted on seven real-world datasets demonstrate the efficacy of +our model in improving forecasting performance. + +
+
+
+
+
+ + ♻ ☆ Bayesian Matrix Decomposition and Applications + + +
+ The sole aim of this book is to give a self-contained introduction to +concepts and mathematical tools in Bayesian matrix decomposition in order to +seamlessly introduce matrix decomposition techniques and their applications in +subsequent sections. However, we clearly realize our inability to cover all the +useful and interesting results concerning Bayesian matrix decomposition and +given the paucity of scope to present this discussion, e.g., the separated +analysis of variational inference for conducting the optimization. We refer the +reader to literature in the field of Bayesian analysis for a more detailed +introduction to the related fields. + This book is primarily a summary of purpose, significance of important +Bayesian matrix decomposition methods, e.g., real-valued decomposition, +nonnegative matrix factorization, Bayesian interpolative decomposition, and the +origin and complexity of the methods which shed light on their applications. +The mathematical prerequisite is a first course in statistics and linear +algebra. Other than this modest background, the development is self-contained, +with rigorous proof provided throughout. + +
+
+
+
+
+ + ♻ ☆ Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling + + +
+ In current deep learning tasks, Adam style optimizers such as Adam, Adagrad, +RMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style +optimizers. These optimizers typically update model parameters using the sign +of gradients, resulting in more stable convergence curves. The learning rate +and the batch size are the most critical hyperparameters for optimizers, which +require careful tuning to enable effective convergence. Previous research has +shown that the optimal learning rate increases linearly or follows similar +rules with batch size for SGD style optimizers. However, this conclusion is not +applicable to Adam style optimizers. In this paper, we elucidate the connection +between optimal learning rates and batch sizes for Adam style optimizers +through both theoretical analysis and extensive experiments. First, we raise +the scaling law between batch sizes and optimal learning rates in the sign of +gradient case, in which we prove that the optimal learning rate first rises and +then falls as the batch size increases. Moreover, the peak value of the surge +will gradually move toward the larger batch size as training progresses. +Second, we conducted experiments on various CV and NLP tasks and verified the +correctness of the scaling law. + +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization + of Diffusion Models + + +
+ Diffusion models have achieved great success in image generation tasks +through iterative noise estimation. However, the heavy denoising process and +complex neural networks hinder their low-latency applications in real-world +scenarios. Quantization can effectively reduce model complexity, and +post-training quantization (PTQ), which does not require fine-tuning, is highly +promising for compressing and accelerating diffusion models. Unfortunately, we +find that due to the highly dynamic distribution of activations in different +denoising steps, existing PTQ methods for diffusion models suffer from +distribution mismatch issues at both calibration sample level and +reconstruction output level, which makes the performance far from satisfactory, +especially in low-bit cases. In this paper, we propose Enhanced Distribution +Alignment for Post-Training Quantization of Diffusion Models (EDA-DM) to +address the above issues. Specifically, at the calibration sample level, we +select calibration samples based on the density and variety in the latent +space, thus facilitating the alignment of their distribution with the overall +samples; and at the reconstruction output level, we modify the loss of block +reconstruction with the losses of layers, aligning the outputs of quantized +model and full-precision model at different network granularity. Extensive +experiments demonstrate that EDA-DM significantly outperforms the existing PTQ +methods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and +different datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO). + +
+
+ comment: Code: http://github.com/BienLuky/EDA-DM +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Modeling the Popularity of Events on Web by Sparsity and + Mutual-Excitation Guided Graph Neural Network + + +
+ The content of a webpage described or posted an event in the cyberspace +inevitably reflects viewpoints, values and trends of the physical society. +Mapping an event on web to the popularity score plays a pivot role to sense the +social trends from the cyberspace. However, the complex semantic correspondence +between texts and images, as well as the implicit text-image-popularity mapping +mechanics pose a significant challenge to this non-trivial task. In this paper, +we address this problem from a viewpoint of understanding the interpretable +mapping mechanics. Concretely, we organize the keywords from different events +into an unified graph. The unified graph facilitates to model the popularity of +events via two-level mappings, i.e., the self excitation and the mutual +excitation. The self-excitation assumes that each keyword forms the popularity +while the mutual-excitation models that two keywords would excite each other to +determine the popularity of an event. Specifically, we use Graph Neural Network +(GNN) as the backbone to model the self-excitation, the mutual excitation and +the context of images into a sparse and deep factor model. Besides, to our best +knowledge, we release a challenge web event dataset for the popularity +prediction task. The experimental results on three public datasets demonstrate +that our method achieves significant improvements and outperforms the +state-of-the-art methods. Dataset is publicly available at: +https://github.com/pangjunbiao/Hot-events-dataset. + +
+
+
+
+
+ + ☆ Subjective and Objective Quality-of-Experience Evaluation Study for Live + Video Streaming + + +
+ In recent years, live video streaming has gained widespread popularity across +various social media platforms. Quality of experience (QoE), which reflects +end-users' satisfaction and overall experience, plays a critical role for media +service providers to optimize large-scale live compression and transmission +strategies to achieve perceptually optimal rate-distortion trade-off. Although +many QoE metrics for video-on-demand (VoD) have been proposed, there remain +significant challenges in developing QoE metrics for live video streaming. To +bridge this gap, we conduct a comprehensive study of subjective and objective +QoE evaluations for live video streaming. For the subjective QoE study, we +introduce the first live video streaming QoE dataset, TaoLive QoE, which +consists of $42$ source videos collected from real live broadcasts and $1,155$ +corresponding distorted ones degraded due to a variety of streaming +distortions, including conventional streaming distortions such as compression, +stalling, as well as live streaming-specific distortions like frame skipping, +variable frame rate, etc. Subsequently, a human study was conducted to derive +subjective QoE scores of videos in the TaoLive QoE dataset. For the objective +QoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well +as publicly available QoE datasets for VoD scenarios, highlighting that current +models struggle to accurately assess video QoE, particularly for live content. +Hence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates +multi-scale semantic features and optical flow-based motion features to +predicting a retrospective QoE score, eliminating reliance on statistical +quality of service (QoS) features. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ A Simple but Strong Baseline for Sounding Video Generation: Effective + Adaptation of Audio and Video Diffusion Models for Joint Generation + + +
+ In this work, we build a simple but strong baseline for sounding video +generation. Given base diffusion models for audio and video, we integrate them +with additional modules into a single model and train it to make the model +jointly generate audio and video. To enhance alignment between audio-video +pairs, we introduce two novel mechanisms in our model. The first one is +timestep adjustment, which provides different timestep information to each base +model. It is designed to align how samples are generated along with timesteps +across modalities. The second one is a new design of the additional modules, +termed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE, +cross-modal information is embedded as if it represents temporal position +information, and the embeddings are fed into the model like positional +encoding. Compared with the popular cross-attention mechanism, CMC-PE provides +a better inductive bias for temporal alignment in the generated data. +Experimental results validate the effectiveness of the two newly introduced +mechanisms and also demonstrate that our method outperforms existing methods. + +
+
+ comment: The source code will be released soon +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Multimodal Fusion via Hypergraph Autoencoder and Contrastive Learning + for Emotion Recognition in Conversation + + +
+ Multimodal emotion recognition in conversation (MERC) seeks to identify the +speakers' emotions expressed in each utterance, offering significant potential +across diverse fields. The challenge of MERC lies in balancing speaker modeling +and context modeling, encompassing both long-distance and short-distance +contexts, as well as addressing the complexity of multimodal information +fusion. Recent research adopts graph-based methods to model intricate +conversational relationships effectively. Nevertheless, the majority of these +methods utilize a fixed fully connected structure to link all utterances, +relying on convolution to interpret complex context. This approach can +inherently heighten the redundancy in contextual messages and excessive graph +network smoothing, particularly in the context of long-distance conversations. +To address this issue, we propose a framework that dynamically adjusts +hypergraph connections by variational hypergraph autoencoder (VHGAE), and +employs contrastive learning to mitigate uncertainty factors during the +reconstruction process. Experimental results demonstrate the effectiveness of +our proposal against the state-of-the-art methods on IEMOCAP and MELD datasets. +We release the code to support the reproducibility of this work at +https://github.com/yzjred/-HAUCL. + +
+
+ comment: Accepted by ACM MULTIMEDIA 2024 +
+
+
+
+
+ + ♻ ☆ Arena: A Patch-of-Interest ViT Inference Acceleration System for + Edge-Assisted Video Analytics + + +
+ The advent of edge computing has made real-time intelligent video analytics +feasible. Previous works, based on traditional model architecture (e.g., CNN, +RNN, etc.), employ various strategies to filter out non-region-of-interest +content to minimize bandwidth and computation consumption but show inferior +performance in adverse environments. Recently, visual foundation models based +on transformers have shown great performance in adverse environments due to +their amazing generalization capability. However, they require a large amount +of computation power, which limits their applications in real-time intelligent +video analytics. In this paper, we find visual foundation models like Vision +Transformer (ViT) also have a dedicated acceleration mechanism for video +analytics. To this end, we introduce Arena, an end-to-end edge-assisted video +inference acceleration system based on ViT. We leverage the capability of ViT +that can be accelerated through token pruning by only offloading and feeding +Patches-of-Interest to the downstream models. Additionally, we design an +adaptive keyframe inference switching algorithm tailored to different videos, +capable of adapting to the current video content to jointly optimize accuracy +and bandwidth. Through extensive experiments, our findings reveal that Arena +can boost inference speeds by up to 1.58\(\times\) and 1.82\(\times\) on +average while consuming only 47\% and 31\% of the bandwidth, respectively, all +with high inference accuracy. + +
+
+
+
+
+
+
+
+ + Robotics 80 + +
+
+
+ + RT-GuIDE: Real-Time Gaussian splatting for Information-Driven + Exploration ICRA2025 + + +
+ We propose a framework for active mapping and exploration that leverages +Gaussian splatting for constructing information-rich maps. Further, we develop +a parallelized motion planning algorithm that can exploit the Gaussian map for +real-time navigation. The Gaussian map constructed onboard the robot is +optimized for both photometric and geometric quality while enabling real-time +situational awareness for autonomy. We show through simulation experiments that +our method is competitive with approaches that use alternate information gain +metrics, while being orders of magnitude faster to compute. In real-world +experiments, our algorithm achieves better map quality (10% higher Peak +Signal-to-Noise Ratio (PSNR) and 30% higher geometric reconstruction accuracy) +than Gaussian maps constructed by traditional exploration baselines. Experiment +videos and more details can be found on our project page: +https://tyuezhan.github.io/RT_GuIDE/ + +
+
+ comment: Submitted to ICRA2025 +
+
+
+
+
+ + ☆ Robot See Robot Do: Imitating Articulated Object Manipulation with + Monocular 4D Reconstruction CoRL 2024 + + +
+ Humans can learn to manipulate new objects by simply watching others; +providing robots with the ability to learn from such demonstrations would +enable a natural interface specifying new behaviors. This work develops Robot +See Robot Do (RSRD), a method for imitating articulated object manipulation +from a single monocular RGB human demonstration given a single static +multi-view object scan. We first propose 4D Differentiable Part Models +(4D-DPM), a method for recovering 3D part motion from a monocular video with +differentiable rendering. This analysis-by-synthesis approach uses part-centric +feature fields in an iterative optimization which enables the use of geometric +regularizers to recover 3D motions from only a single video. Given this 4D +reconstruction, the robot replicates object trajectories by planning bimanual +arm motions that induce the demonstrated object part motion. By representing +demonstrations as part-centric trajectories, RSRD focuses on replicating the +demonstration's intended behavior while considering the robot's own +morphological limits, rather than attempting to reproduce the hand's motion. We +evaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part +trajectories and RSRD's physical execution performance on 9 objects across 10 +trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of +87% success rate, for a total end-to-end success rate of 60% across 90 trials. +Notably, this is accomplished using only feature fields distilled from large +pretrained vision models -- without any task-specific training, fine-tuning, +dataset collection, or annotation. Project page: +https://robot-see-robot-do.github.io + +
+
+ comment: CoRL 2024, Project page: https://robot-see-robot-do.github.io +
+
+
+
+
+ + EvMAPPER: High Altitude Orthomapping with Event Cameras + + +
+ Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to +collect images about the world below. One of the most successful applications +of UAVs is to generate orthomosaics or orthomaps, in which a series of images +are integrated together to develop a larger map. However, the use of CMOS-based +cameras with global or rolling shutters mean that orthomaps are vulnerable to +challenging light conditions, motion blur, and high-speed motion of +independently moving objects under the camera. Event cameras are less sensitive +to these issues, as their pixels are able to trigger asynchronously on +brightness changes. This work introduces the first orthomosaic approach using +event cameras. In contrast to existing methods relying only on CMOS cameras, +our approach enables map generation even in challenging light conditions, +including direct sunlight and after sunset. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Language-Embedded Gaussian Splats (LEGS): Incrementally Building + Room-Scale Representations with a Mobile Robot + + +
+ Building semantic 3D maps is valuable for searching for objects of interest +in offices, warehouses, stores, and homes. We present a mapping system that +incrementally builds a Language-Embedded Gaussian Splat (LEGS): a detailed 3D +scene representation that encodes both appearance and semantics in a unified +representation. LEGS is trained online as a robot traverses its environment to +enable localization of open-vocabulary object queries. We evaluate LEGS on 4 +room-scale scenes where we query for objects in the scene to assess how LEGS +can capture semantic meaning. We compare LEGS to LERF and find that while both +systems have comparable object query success rates, LEGS trains over 3.5x +faster than LERF. Results suggest that a multi-camera setup and incremental +bundle adjustment can boost visual reconstruction quality in constrained robot +trajectories, and suggest LEGS can localize open-vocabulary and long-tail +object queries with up to 66% accuracy. + +
+
+
+
+
+ + ☆ StackGen: Generating Stable Structures from Silhouettes via Diffusion + + +
+ Humans naturally obtain intuition about the interactions between and the +stability of rigid objects by observing and interacting with the world. It is +this intuition that governs the way in which we regularly configure objects in +our environment, allowing us to build complex structures from simple, everyday +objects. Robotic agents, on the other hand, traditionally require an explicit +model of the world that includes the detailed geometry of each object and an +analytical model of the environment dynamics, which are difficult to scale and +preclude generalization. Instead, robots would benefit from an awareness of +intuitive physics that enables them to similarly reason over the stable +interaction of objects in their environment. Towards that goal, we propose +StackGen, a diffusion model that generates diverse stable configurations of +building blocks matching a target silhouette. To demonstrate the capability of +the method, we evaluate it in a simulated environment and deploy it in the real +setting using a robotic arm to assemble structures generated by the model. + +
+
+
+
+
+ + ☆ A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale + Autonomous Vehicle + + +
+ In recent years, several competitions have highlighted the need to +investigate vision-based solutions to address scenarios with functional +insufficiencies in perception, world modeling and localization. This article +presents the Vision-based Lane Keeping System (VbLKS) developed by the +DEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022. +The main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied +VbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a +tailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading +Error (LHE), is estimated at a constant lookahead distance employing a +Convolutional Neural Network (CNN). A training strategy for a compact CNN is +proposed, emphasizing data generation and augmentation on simulated camera +images from a 3D Gazebo simulator, and enabling real-time operation on +low-level hardware. A tailored PP-based lateral controller equipped with a +derivative action and a PP-based velocity reference generation are implemented. +Tuning ranges are established through a systematic time-delay stability +analysis. Validation in a representative controlled laboratory setting is +provided. + +
+
+ comment: 16 pages, 23 figures +
+
+
+
+
+ + ☆ DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion + Probabilistic Models + + +
+ Perception systems play a crucial role in autonomous driving, incorporating +multiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors +are widely used to capture sparse point clouds of the vehicle's surroundings. +However, such systems struggle to perceive occluded areas and gaps in the scene +due to the sparsity of these point clouds and their lack of semantics. To +address these challenges, Semantic Scene Completion (SSC) jointly predicts +unobserved geometry and semantics in the scene given raw LiDAR measurements, +aiming for a more complete scene representation. Building on promising results +of diffusion models in image generation and super-resolution tasks, we propose +their extension to SSC by implementing the noising and denoising diffusion +processes in the point and semantic spaces individually. To control the +generation, we employ semantic LiDAR point clouds as conditional input and +design local and global regularization losses to stabilize the denoising +process. We evaluate our approach on autonomous driving datasets and our +approach outperforms the state-of-the-art for SSC. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ GSON: A Group-based Social Navigation Framework with Large Multimodal + Model + + +
+ As the number of service robots and autonomous vehicles in human-centered +environments grows, their requirements go beyond simply navigating to a +destination. They must also take into account dynamic social contexts and +ensure respect and comfort for others in shared spaces, which poses significant +challenges for perception and planning. In this paper, we present a group-based +social navigation framework GSON to enable mobile robots to perceive and +exploit the social group of their surroundings by leveling the visual reasoning +capability of the Large Multimodal Model (LMM). For perception, we apply visual +prompting techniques to zero-shot extract the social relationship among +pedestrians and combine the result with a robust pedestrian detection and +tracking pipeline to alleviate the problem of low inference speed of the LMM. +Given the perception result, the planning system is designed to avoid +disrupting the current social structure. We adopt a social structure-based +mid-level planner as a bridge between global path planning and local motion +planning to preserve the global context and reactive response. The proposed +method is validated on real-world mobile robot navigation tasks involving +complex social structure understanding and reasoning. Experimental results +demonstrate the effectiveness of the system in these scenarios compared with +several baselines. + +
+
+
+
+
+ + ☆ SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language + Models for Robotic Garment Manipulation + + +
+ Automating garment manipulation poses a significant challenge for assistive +robotics due to the diverse and deformable nature of garments. Traditional +approaches typically require separate models for each garment type, which +limits scalability and adaptability. In contrast, this paper presents a unified +approach using vision-language models (VLMs) to improve keypoint prediction +across various garment categories. By interpreting both visual and semantic +information, our model enables robots to manage different garment states with a +single model. We created a large-scale synthetic dataset using advanced +simulation techniques, allowing scalable training without extensive real-world +data. Experimental results indicate that the VLM-based method significantly +enhances keypoint detection accuracy and task success rates, providing a more +flexible and general solution for robotic garment manipulation. In addition, +this research also underscores the potential of VLMs to unify various garment +manipulation tasks within a single framework, paving the way for broader +applications in home automation and assistive robotics for future. + +
+
+
+
+
+ + ☆ DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving + + +
+ We present a novel autonomous driving framework, DualAD, designed to imitate +human reasoning during driving. DualAD comprises two layers: a rule-based +motion planner at the bottom layer that handles routine driving tasks requiring +minimal reasoning, and an upper layer featuring a rule-based text encoder that +converts driving scenarios from absolute states into text description. This +text is then processed by a large language model (LLM) to make driving +decisions. The upper layer intervenes in the bottom layer's decisions when +potential danger is detected, mimicking human reasoning in critical situations. +Closed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained +model, significantly outperforms rule-based motion planners that lack reasoning +abilities. Our experiments also highlight the effectiveness of the text +encoder, which considerably enhances the model's scenario understanding. +Additionally, the integrated DualAD model improves with stronger LLMs, +indicating the framework's potential for further enhancement. We make code and +benchmarks publicly available. + +
+
+ comment: Autonomous Driving, Large Language Models (LLMs), Human Reasoning, + Critical Scenario +
+
+
+
+
+ + ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems - which account for almost all current +AI - can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborates on a search task assigned by a human. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams ICRA 2025 + + +
+ This paper presents a novel approach to multi-robot planning and +collaboration. We demonstrate a cognitive strategy for robots in human-robot +teams that incorporates metacognition, natural language communication, and +explainability. The system is embodied using the HARMONIC architecture that +flexibly integrates cognitive and control capabilities across the team. We +evaluate our approach through simulation experiments involving a joint search +task by a team of heterogeneous robots (a UGV and a drone) and a human. We +detail the system's handling of complex, real-world scenarios, effective action +coordination between robots with different capabilities, and natural +human-robot communication. This work demonstrates that the robots' ability to +reason about plans, goals, and attitudes, and to provide explanations for +actions and decisions are essential prerequisites for realistic human-robot +teaming. + +
+
+ comment: Submitted to ICRA 2025 Conference, Atlanta, GA, USA +
+
+
+
+
+ + ☆ MMDVS-LF: A Multi-Modal Dynamic-Vision-Sensor Line Following Dataset + + +
+ Dynamic Vision Sensors (DVS), offer a unique advantage in control +applications, due to their high temporal resolution, and asynchronous +event-based data. Still, their adoption in machine learning algorithms remains +limited. To address this gap, and promote the development of models that +leverage the specific characteristics of DVS data, we introduce the Multi-Modal +Dynamic-Vision-Sensor Line Following dataset (MMDVS-LF). This comprehensive +dataset, is the first to integrate multiple sensor modalities, including DVS +recordings, RGB video, odometry, and Inertial Measurement Unit (IMU) data, from +a small-scale standardized vehicle. Additionally, the dataset includes +eye-tracking and demographic data of drivers performing a Line Following task +on a track. With its diverse range of data, MMDVS-LF opens new opportunities +for developing deep learning algorithms, and conducting data science projects +across various domains, supporting innovation in autonomous systems and control +applications. + +
+
+
+
+
+ + ☆ HARMONIC: A Framework for Explanatory Cognitive Robots ICRA + + +
+ We present HARMONIC, a framework for implementing cognitive robots that +transforms general-purpose robots into trusted teammates capable of complex +decision-making, natural communication and human-level explanation. The +framework supports interoperability between a strategic (cognitive) layer for +high-level decision-making and a tactical (robot) layer for low-level control +and execution. We describe the core features of the framework and our initial +implementation, in which HARMONIC was deployed on a simulated UGV and drone +involved in a multi-robot search and retrieval task. + +
+
+ comment: Accepted for presentation at ICRA@40. 23-26 September 2024, + Rotterdam, Netherlands +
+
+
+
+
+ + ☆ Reasoning Multi-Agent Behavioral Topology for Interactive Autonomous + Driving + + +
+ Autonomous driving system aims for safe and social-consistent driving through +the behavioral integration among interactive agents. However, challenges remain +due to multi-agent scene uncertainty and heterogeneous interaction. Current +dense and sparse behavioral representations struggle with inefficiency and +inconsistency in multi-agent modeling, leading to instability of collective +behavioral patterns when integrating prediction and planning (IPP). To address +this, we initiate a topological formation that serves as a compliant behavioral +foreground to guide downstream trajectory generations. Specifically, we +introduce Behavioral Topology (BeTop), a pivotal topological formulation that +explicitly represents the consensual behavioral pattern among multi-agent +future. BeTop is derived from braid theory to distill compliant interactive +topology from multi-agent future trajectories. A synergistic learning framework +(BeTopNet) supervised by BeTop facilitates the consistency of behavior +prediction and planning within the predicted topology priors. Through imitative +contingency learning, BeTop also effectively manages behavioral uncertainty for +prediction and planning. Extensive verification on large-scale real-world +datasets, including nuPlan and WOMD, demonstrates that BeTop achieves +state-of-the-art performance in both prediction and planning tasks. Further +validations on the proposed interactive scenario benchmark showcase planning +compliance in interactive cases. + +
+
+
+
+
+ + ☆ ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty + Learning + + +
+ Vision-centric semantic occupancy prediction plays a crucial role in +autonomous driving, which requires accurate and reliable predictions from +low-cost sensors. Although having notably narrowed the accuracy gap with LiDAR, +there is still few research effort to explore the reliability in predicting +semantic occupancy from camera. In this paper, we conduct a comprehensive +evaluation of existing semantic occupancy prediction models from a reliability +perspective for the first time. Despite the gradual alignment of camera-based +models with LiDAR in term of accuracy, a significant reliability gap persists. +To addresses this concern, we propose ReliOcc, a method designed to enhance the +reliability of camera-based occupancy networks. ReliOcc provides a +plug-and-play scheme for existing models, which integrates hybrid uncertainty +from individual voxels with sampling-based noise and relative voxels through +mix-up learning. Besides, an uncertainty-aware calibration strategy is devised +to further enhance model reliability in offline mode. Extensive experiments +under various settings demonstrate that ReliOcc significantly enhances model +reliability while maintaining the accuracy of both geometric and semantic +predictions. Importantly, our proposed approach exhibits robustness to sensor +failures and out of domain noises during inference. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ Control Industrial Automation System with Large Language Models + + +
+ Traditional industrial automation systems require specialized expertise to +operate and complex reprogramming to adapt to new processes. Large language +models offer the intelligence to make them more flexible and easier to use. +However, LLMs' application in industrial settings is underexplored. This paper +introduces a framework for integrating LLMs to achieve end-to-end control of +industrial automation systems. At the core of the framework are an agent system +designed for industrial tasks, a structured prompting method, and an +event-driven information modeling mechanism that provides real-time data for +LLM inference. The framework supplies LLMs with real-time events on different +context semantic levels, allowing them to interpret the information, generate +production plans, and control operations on the automation system. It also +supports structured dataset creation for fine-tuning on this downstream +application of LLMs. Our contribution includes a formal system design, +proof-of-concept implementation, and a method for generating task-specific +datasets for LLM fine-tuning and testing. This approach enables a more adaptive +automation system that can respond to spontaneous events, while allowing easier +operation and configuration through natural language for more intuitive +human-machine interaction. We provide demo videos and detailed data on GitHub: +https://github.com/YuchenXia/LLM4IAS + +
+
+
+
+
+ + ☆ Joint Localization and Planning using Diffusion ICRA 2025 + + +
+ Diffusion models have been successfully applied to robotics problems such as +manipulation and vehicle path planning. In this work, we explore their +application to end-to-end navigation -- including both perception and planning +-- by considering the problem of jointly performing global localization and +path planning in known but arbitrary 2D environments. In particular, we +introduce a diffusion model which produces collision-free paths in a global +reference frame given an egocentric LIDAR scan, an arbitrary map, and a desired +goal position. To this end, we implement diffusion in the space of paths in +SE(2), and describe how to condition the denoising process on both obstacles +and sensor observations. In our evaluation, we show that the proposed +conditioning techniques enable generalization to realistic maps of considerably +different appearance than the training environment, demonstrate our model's +ability to accurately describe ambiguous solutions, and run extensive +simulation experiments showcasing our model's use as a real-time, end-to-end +localization and planning stack. + +
+
+ comment: 7 pages, 9 figures. Submitted to ICRA 2025, under review +
+
+
+
+
+ + ☆ LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged + Robots + + +
+ Reinforcement Learning (RL) has shown its remarkable and generalizable +capability in legged locomotion through sim-to-real transfer. However, while +adaptive methods like domain randomization are expected to make policy more +robust to diverse environments, such comprehensiveness potentially detracts +from the policy's performance in any specific environment according to the No +Free Lunch theorem, leading to a suboptimal solution once deployed in the real +world. To address this issue, we propose a lifelong policy adaptation framework +named LoopSR, which utilizes a transformer-based encoder to project real-world +trajectories into a latent space, and accordingly reconstruct the real-world +environments back in simulation for further improvement. Autoencoder +architecture and contrastive learning methods are adopted to better extract the +characteristics of real-world dynamics. The simulation parameters for continual +training are derived by combining predicted parameters from the decoder with +retrieved parameters from the simulation trajectory dataset. By leveraging the +continual training, LoopSR achieves superior data efficiency compared with +strong baselines, with only a limited amount of data to yield eminent +performance in both sim-to-sim and sim-to-real experiments. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or + Low-light Conditions ECCV 2024 + + +
+ The stark contrast in the design philosophy of an event camera makes it +particularly ideal for operating under high-speed, high dynamic range and +low-light conditions, where standard cameras underperform. Nonetheless, event +cameras still suffer from some amount of motion blur, especially under these +challenging conditions, in contrary to what most think. This is attributed to +the limited bandwidth of the event sensor pixel, which is mostly proportional +to the light intensity. Thus, to ensure that event cameras can truly excel in +such conditions where it has an edge over standard cameras, it is crucial to +account for event motion blur in downstream applications, especially +reconstruction. However, none of the recent works on reconstructing Neural +Radiance Fields (NeRFs) from events, nor event simulators, have considered the +full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a +novel method to directly and effectively reconstruct blur-minimal NeRFs from +motion-blurred events generated under high-speed motion or low-light +conditions. The core component of this work is a physically-accurate pixel +bandwidth model proposed to account for event motion blur under arbitrary speed +and lighting conditions. We also introduce a novel threshold-normalized total +variation loss to improve the regularization of large textureless patches. +Experiments on real and novel realistically simulated sequences verify our +effectiveness. Our code, event simulator and synthetic event dataset will be +open-sourced. + +
+
+ comment: Accepted to ECCV 2024. Project website is accessible at + https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with + arXiv:2006.07722 by other authors +
+
+
+
+
+ + ☆ Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV + Attitude Control Under Varying Wind Conditions + + +
+ This paper evaluates and compares the performance of model-free and +model-based reinforcement learning for the attitude control of fixed-wing +unmanned aerial vehicles using PID as a reference point. The comparison focuses +on their ability to handle varying flight dynamics and wind disturbances in a +simulated environment. Our results show that the Temporal Difference Model +Predictive Control agent outperforms both the PID controller and other +model-free reinforcement learning methods in terms of tracking accuracy and +robustness over different reference difficulties, particularly in nonlinear +flight regimes. Furthermore, we introduce actuation fluctuation as a key metric +to assess energy efficiency and actuator wear, and we test two different +approaches from the literature: action variation penalty and conditioning for +action policy smoothness. We also evaluate all control methods when subject to +stochastic turbulence and gusts separately, so as to measure their effects on +tracking performance, observe their limitations and outline their implications +on the Markov decision process formalism. + +
+
+ comment: Published at ICINCO 2024 +
+
+
+
+
+ + ☆ Swarm-LIO2: Decentralized, Efficient LiDAR-inertial Odometry for UAV + Swarms + + +
+ Aerial swarm systems possess immense potential in various aspects, such as +cooperative exploration, target tracking, search and rescue. Efficient, +accurate self and mutual state estimation are the critical preconditions for +completing these swarm tasks, which remain challenging research topics. This +paper proposes Swarm-LIO2: a fully decentralized, plug-and-play, +computationally efficient, and bandwidth-efficient LiDAR-inertial odometry for +aerial swarm systems. Swarm-LIO2 uses a decentralized, plug-and-play network as +the communication infrastructure. Only bandwidth-efficient and low-dimensional +information is exchanged, including identity, ego-state, mutual observation +measurements, and global extrinsic transformations. To support the +plug-and-play of new teammate participants, Swarm-LIO2 detects potential +teammate UAVs and initializes the temporal offset and global extrinsic +transformation all automatically. To enhance the initialization efficiency, +novel reflectivity-based UAV detection, trajectory matching, and factor graph +optimization methods are proposed. For state estimation, Swarm-LIO2 fuses +LiDAR, IMU, and mutual observation measurements within an efficient ESIKF +framework, with careful compensation of temporal delay and modeling of +measurements to enhance the accuracy and consistency. + +
+
+ comment: 23 Pages +
+
+
+
+
+ + ☆ SECURE: Semantics-aware Embodied Conversation under Unawareness for + Lifelong Robot Learning + + +
+ This paper addresses a challenging interactive task learning scenario we call +rearrangement under unawareness: to manipulate a rigid-body environment in a +context where the robot is unaware of a concept that's key to solving the +instructed task. We propose SECURE, an interactive task learning framework +designed to solve such problems by fixing a deficient domain model using +embodied conversation. Through dialogue, the robot discovers and then learns to +exploit unforeseen possibilities. Using SECURE, the robot not only learns from +the user's corrective feedback when it makes a mistake, but it also learns to +make strategic dialogue decisions for revealing useful evidence about novel +concepts for solving the instructed task. Together, these abilities allow the +robot to generalise to subsequent tasks using newly acquired knowledge. We +demonstrate that a robot that is semantics-aware -- that is, it exploits the +logical consequences of both sentence and discourse semantics in the learning +and inference process -- learns to solve rearrangement under unawareness more +effectively than a robot that lacks such capabilities. + +
+
+ comment: 10 pages,4 figures, 2 tables +
+
+
+
+
+ + ☆ Robust Ladder Climbing with a Quadrupedal Robot + + +
+ Quadruped robots are proliferating in industrial environments where they +carry sensor suites and serve as autonomous inspection platforms. Despite the +advantages of legged robots over their wheeled counterparts on rough and uneven +terrain, they are still yet to be able to reliably negotiate ubiquitous +features of industrial infrastructure: ladders. Inability to traverse ladders +prevents quadrupeds from inspecting dangerous locations, puts humans in harm's +way, and reduces industrial site productivity. In this paper, we learn +quadrupedal ladder climbing via a reinforcement learning-based control policy +and a complementary hooked end-effector. We evaluate the robustness in +simulation across different ladder inclinations, rung geometries, and +inter-rung spacings. On hardware, we demonstrate zero-shot transfer with an +overall 90% success rate at ladder angles ranging from 70{\deg} to 90{\deg}, +consistent climbing performance during unmodeled perturbations, and climbing +speeds 232x faster than the state of the art. This work expands the scope of +industrial quadruped robot applications beyond inspection on nominal terrains +to challenging infrastructural features in the environment, highlighting +synergies between robot morphology and control policy when performing complex +skills. More information can be found at the project website: +https://sites.google.com/leggedrobotics.com/climbingladders. + +
+
+ comment: Project website: + https://sites.google.com/leggedrobotics.com/climbingladders +
+
+
+
+
+ + ☆ Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications + + +
+ Vision language models have played a key role in extracting meaningful +features for various robotic applications. Among these, Contrastive +Language-Image Pretraining (CLIP) is widely used in robotic tasks that require +both vision and natural language understanding. However, CLIP was trained +solely on static images paired with text prompts and has not yet been fully +adapted for robotic tasks involving dynamic actions. In this paper, we +introduce Robotic-CLIP to enhance robotic perception capabilities. We first +gather and label large-scale action data, and then build our Robotic-CLIP by +fine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using +contrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's +strong image performance while gaining the ability to understand actions in +robotic contexts. Intensive experiments show that our Robotic-CLIP outperforms +other CLIP-based models across various language-driven robotic tasks. +Additionally, we demonstrate the practical effectiveness of Robotic-CLIP in +real-world grasping applications. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Stable Object Placement Under Geometric Uncertainty via Differentiable + Contact Dynamics + + +
+ From serving a cup of coffee to carefully rearranging delicate items, stable +object placement is a crucial skill for future robots. This skill is +challenging due to the required accuracy, which is difficult to achieve under +geometric uncertainty. We leverage differentiable contact dynamics to develop a +principled method for stable object placement under geometric uncertainty. We +estimate the geometric uncertainty by minimizing the discrepancy between the +force-torque sensor readings and the model predictions through gradient +descent. We further keep track of a belief over multiple possible geometric +parameters to mitigate the gradient-based method's sensitivity to the +initialization. We verify our approach in the real world on various geometric +uncertainties, including the in-hand pose uncertainty of the grasped object, +the object's shape uncertainty, and the environment's shape uncertainty. + +
+
+
+
+
+ + ☆ Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations + Between Initial and Final Scenes + + +
+ With robots increasingly collaborating with humans in everyday tasks, it is +important to take steps toward robotic systems capable of understanding the +environment. This work focuses on scene understanding to detect pick and place +tasks given initial and final images from the scene. To this end, a dataset is +collected for object detection and pick and place task detection. A YOLOv5 +network is subsequently trained to detect the objects in the initial and final +scenes. Given the detected objects and their bounding boxes, two methods are +proposed to detect the pick and place tasks which transform the initial scene +into the final scene. A geometric method is proposed which tracks objects' +movements in the two scenes and works based on the intersection of the bounding +boxes which moved within scenes. Contrarily, the CNN-based method utilizes a +Convolutional Neural Network to classify objects with intersected bounding +boxes into 5 classes, showing the spatial relationship between the involved +objects. The performed pick and place tasks are then derived from analyzing the +experiments with both scenes. Results show that the CNN-based method, using a +VGG16 backbone, outscores the geometric method by roughly 12 percentage points +in certain scenarios, with an overall success rate of 84.3%. + +
+
+ comment: Conference Paper, ICEE 2024, 7 pages, 5 figures +
+
+
+
+
+ + ☆ Episodic Memory Verbalization using Hierarchical Representations of + Life-Long Robot Experience + + +
+ Verbalization of robot experience, i.e., summarization of and question +answering about a robot's past, is a crucial ability for improving human-robot +interaction. Previous works applied rule-based systems or fine-tuned deep +models to verbalize short (several-minute-long) streams of episodic data, +limiting generalization and transferability. In our work, we apply large +pretrained models to tackle this task with zero or few examples, and +specifically focus on verbalizing life-long experiences. For this, we derive a +tree-like data structure from episodic memory (EM), with lower levels +representing raw perception and proprioception data, and higher levels +abstracting events to natural language concepts. Given such a hierarchical +representation built from the experience stream, we apply a large language +model as an agent to interactively search the EM given a user's query, +dynamically expanding (initially collapsed) tree nodes to find the relevant +information. The approach keeps computational costs low even when scaling to +months of robot experience data. We evaluate our method on simulated household +robot data, human egocentric videos, and real-world robot recordings, +demonstrating its flexibility and scalability. + +
+
+ comment: Code, data and demo videos at https://hierarchical-emv.github.io +
+
+
+
+
+ + ☆ Event-based Stereo Depth Estimation: A Survey + + +
+ Stereopsis has widespread appeal in robotics as it is the predominant way by +which living beings perceive depth to navigate our 3D world. Event cameras are +novel bio-inspired sensors that detect per-pixel brightness changes +asynchronously, with very high temporal resolution and high dynamic range, +enabling machine perception in high-speed motion and broad illumination +conditions. The high temporal precision also benefits stereo matching, making +disparity (depth) estimation a popular research area for event cameras ever +since its inception. Over the last 30 years, the field has evolved rapidly, +from low-latency, low-power circuit design to current deep learning (DL) +approaches driven by the computer vision community. The bibliography is vast +and difficult to navigate for non-experts due its highly interdisciplinary +nature. Past surveys have addressed distinct aspects of this topic, in the +context of applications, or focusing only on a specific class of techniques, +but have overlooked stereo datasets. This survey provides a comprehensive +overview, covering both instantaneous stereo and long-term methods suitable for +simultaneous localization and mapping (SLAM), along with theoretical and +empirical comparisons. It is the first to extensively review DL methods as well +as stereo datasets, even providing practical suggestions for creating new +benchmarks to advance the field. The main advantages and challenges faced by +event-based stereo depth estimation are also discussed. Despite significant +progress, challenges remain in achieving optimal performance in not only +accuracy but also efficiency, a cornerstone of event-based computing. We +identify several gaps and propose future research directions. We hope this +survey inspires future research in this area, by serving as an accessible entry +point for newcomers, as well as a practical guide for seasoned researchers in +the community. + +
+
+ comment: 28 pages, 20 figures, 7 tables +
+
+
+
+
+ + ☆ AssistantX: An LLM-Powered Proactive Assistant in Collaborative + Human-Populated Environment + + +
+ The increasing demand for intelligent assistants in human-populated +environments has motivated significant research in autonomous robotic systems. +Traditional service robots and virtual assistants, however, struggle with +real-world task execution due to their limited capacity for dynamic reasoning +and interaction, particularly when human collaboration is required. Recent +developments in Large Language Models have opened new avenues for improving +these systems, enabling more sophisticated reasoning and natural interaction +capabilities. In this paper, we introduce AssistantX, an LLM-powered proactive +assistant designed to operate autonomously in a physical office environment. +Unlike conventional service robots, AssistantX leverages a novel multi-agent +architecture, PPDR4X, which provides advanced inference capabilities and +comprehensive collaboration awareness. By effectively bridging the gap between +virtual operations and physical interactions, AssistantX demonstrates robust +performance in managing complex real-world scenarios. Our evaluation highlights +the architecture's effectiveness, showing that AssistantX can respond to clear +instructions, actively retrieve supplementary information from memory, and +proactively seek collaboration from team members to ensure successful task +completion. More details and videos can be found at +https://assistantx-agent.github.io/AssistantX/. + +
+
+ comment: 6 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ FactorSim: Generative Simulation via Factorized Representation + + +
+ Generating simulations to train intelligent agents in game-playing and +robotics from natural language input, from user input or task documentation, +remains an open-ended challenge. Existing approaches focus on parts of this +challenge, such as generating reward functions or task hyperparameters. Unlike +previous work, we introduce FACTORSIM that generates full simulations in code +from language input that can be used to train agents. Exploiting the structural +modularity specific to coded simulations, we propose to use a factored +partially observable Markov decision process representation that allows us to +reduce context dependence during each step of the generation. For evaluation, +we introduce a generative simulation benchmark that assesses the generated +simulation code's accuracy and effectiveness in facilitating zero-shot +transfers in reinforcement learning settings. We show that FACTORSIM +outperforms existing methods in generating simulations regarding prompt +alignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation. +We also demonstrate its effectiveness in generating robotic tasks. + +
+
+ comment: neurips 2024, project website: + https://cs.stanford.edu/~sunfanyun/factorsim/ +
+
+
+
+
+ + ☆ AP-VLM: Active Perception Enabled by Vision-Language Models + + +
+ Active perception enables robots to dynamically gather information by +adjusting their viewpoints, a crucial capability for interacting with complex, +partially observable environments. In this paper, we present AP-VLM, a novel +framework that combines active perception with a Vision-Language Model (VLM) to +guide robotic exploration and answer semantic queries. Using a 3D virtual grid +overlaid on the scene and orientation adjustments, AP-VLM allows a robotic +manipulator to intelligently select optimal viewpoints and orientations to +resolve challenging tasks, such as identifying objects in occluded or inclined +positions. We evaluate our system on two robotic platforms: a 7-DOF Franka +Panda and a 6-DOF UR5, across various scenes with differing object +configurations. Our results demonstrate that AP-VLM significantly outperforms +passive perception methods and baseline models, including Toward Grounded +Common Sense Reasoning (TGCSR), particularly in scenarios where fixed camera +views are inadequate. The adaptability of AP-VLM in real-world settings shows +promise for enhancing robotic systems' understanding of complex environments, +bridging the gap between high-level semantic reasoning and low-level control. + +
+
+
+
+
+ + ☆ System-Level Safety Monitoring and Recovery for Perception Failures in + Autonomous Vehicles + + +
+ The safety-critical nature of autonomous vehicle (AV) operation necessitates +development of task-relevant algorithms that can reason about safety at the +system level and not just at the component level. To reason about the impact of +a perception failure on the entire system performance, such task-relevant +algorithms must contend with various challenges: complexity of AV stacks, high +uncertainty in the operating environments, and the need for real-time +performance. To overcome these challenges, in this work, we introduce a +Q-network called SPARQ (abbreviation for Safety evaluation for Perception And +Recovery Q-network) that evaluates the safety of a plan generated by a planning +algorithm, accounting for perception failures that the planning process may +have overlooked. This Q-network can be queried during system runtime to assess +whether a proposed plan is safe for execution or poses potential safety risks. +If a violation is detected, the network can then recommend a corrective plan +while accounting for the perceptual failure. We validate our algorithm using +the NuPlan-Vegas dataset, demonstrating its ability to handle cases where a +perception failure compromises a proposed plan while the corrective plan +remains safe. We observe an overall accuracy and recall of 90% while sustaining +a frequency of 42Hz on the unseen testing dataset. We compare our performance +to a popular reachability-based baseline and analyze some interesting +properties of our approach in improving the safety properties of an AV +pipeline. + +
+
+
+
+
+ + HGS-Planner: Hierarchical Planning Framework for Active Scene + Reconstruction Using 3D Gaussian Splatting + + +
+ In complex missions such as search and rescue,robots must make intelligent +decisions in unknown environments, relying on their ability to perceive and +understand their surroundings. High-quality and real-time reconstruction +enhances situational awareness and is crucial for intelligent robotics. +Traditional methods often struggle with poor scene representation or are too +slow for real-time use. Inspired by the efficacy of 3D Gaussian Splatting +(3DGS), we propose a hierarchical planning framework for fast and high-fidelity +active reconstruction. Our method evaluates completion and quality gain to +adaptively guide reconstruction, integrating global and local planning for +efficiency. Experiments in simulated and real-world environments show our +approach outperforms existing real-time methods. + +
+
+
+
+
+ + ☆ Leveraging Semantic and Geometric Information for Zero-Shot + Robot-to-Human Handover + + +
+ Human-robot interaction (HRI) encompasses a wide range of collaborative +tasks, with handover being one of the most fundamental. As robots become more +integrated into human environments, the potential for service robots to assist +in handing objects to humans is increasingly promising. In robot-to-human (R2H) +handover, selecting the optimal grasp is crucial for success, as it requires +avoiding interference with the humans preferred grasp region and minimizing +intrusion into their workspace. Existing methods either inadequately consider +geometric information or rely on data-driven approaches, which often struggle +to generalize across diverse objects. To address these limitations, we propose +a novel zero-shot system that combines semantic and geometric information to +generate optimal handover grasps. Our method first identifies grasp regions +using semantic knowledge from vision-language models (VLMs) and, by +incorporating customized visual prompts, achieves finer granularity in region +grounding. A grasp is then selected based on grasp distance and approach angle +to maximize human ease and avoid interference. We validate our approach through +ablation studies and real-world comparison experiments. Results demonstrate +that our system improves handover success rates and provides a more +user-preferred interaction experience. Videos, appendixes and more are +available at https://sites.google.com/view/vlm-handover/. + +
+
+ comment: 6 pages, 5 figures, conference +
+
+
+
+
+ + Learning Occlusion-aware Decision-making from Agent Interaction via + Active Perception + + +
+ Occlusion-aware decision-making is essential in autonomous driving due to the +high uncertainty of various occlusions. Recent occlusion-aware decision-making +methods encounter issues such as high computational complexity, scenario +scalability challenges, or reliance on limited expert data. Benefiting from +automatically generating data by exploration randomization, we uncover that +reinforcement learning (RL) may show promise in occlusion-aware +decision-making. However, previous occlusion-aware RL faces challenges in +expanding to various dynamic and static occlusion scenarios, low learning +efficiency, and lack of predictive ability. To address these issues, we +introduce Pad-AI, a self-reinforcing framework to learn occlusion-aware +decision-making through active perception. Pad-AI utilizes vectorized +representation to represent occluded environments efficiently and learns over +the semantic motion primitives to focus on high-level active perception +exploration. Furthermore, Pad-AI integrates prediction and RL within a unified +framework to provide risk-aware learning and security guarantees. Our framework +was tested in challenging scenarios under both dynamic and static occlusions +and demonstrated efficient and general perception-aware exploration performance +to other strong baselines in closed-loop evaluations. + +
+
+
+
+
+ + ☆ Software for the SpaceDREAM Robotic Arm + + +
+ Impedance-controlled robots are widely used on Earth to perform +interaction-rich tasks and will be a key enabler for In-Space Servicing, +Assembly and Manufacturing (ISAM) activities. This paper introduces the +software architecture used on the On-Board Computer (OBC) for the planned +SpaceDREAM mission aiming to validate such robotic arm in Lower Earth Orbit +(LEO) conducted by the German Aerospace Center (DLR) in cooperation with +KINETIK Space GmbH and the Technical University of Munich (TUM). During the +mission several free motion as well as contact tasks are to be performed in +order to verify proper functionality of the robot in position and impedance +control on joint level as well as in cartesian control. The tasks are selected +to be representative for subsequent servicing missions e.g. requiring interface +docking or precise manipulation. + The software on the OBC commands the robot's joints via SpaceWire to perform +those mission tasks, reads camera images and data from additional sensors and +sends telemetry data through an Ethernet link via the spacecraft down to Earth. +It is set up to execute a predefined mission after receiving a start signal +from the spacecraft while it should be extendable to receive commands from +Earth for later missions. Core design principle was to reuse as much existing +software and to stay as close as possible to existing robot software stacks at +DLR. This allowed for a quick full operational start of the robot arm compared +to a custom development of all robot software, a lower entry barrier for +software developers as well as a reuse of existing libraries. While not every +line of code can be tested with this design, most of the software has already +proven its functionality through daily execution on multiple robot systems. + +
+
+
+
+
+ + ☆ Canonical Representation and Force-Based Pretraining of 3D Tactile for + Dexterous Visuo-Tactile Policy Learning + + +
+ Tactile sensing plays a vital role in enabling robots to perform +fine-grained, contact-rich tasks. However, the high dimensionality of tactile +data, due to the large coverage on dexterous hands, poses significant +challenges for effective tactile feature learning, especially for 3D tactile +data, as there are no large standardized datasets and no strong pretrained +backbones. To address these challenges, we propose a novel canonical +representation that reduces the difficulty of 3D tactile feature learning and +further introduces a force-based self-supervised pretraining task to capture +both local and net force features, which are crucial for dexterous +manipulation. Our method achieves an average success rate of 78% across four +fine-grained, contact-rich dexterous manipulation tasks in real-world +experiments, demonstrating effectiveness and robustness compared to other +methods. Further analysis shows that our method fully utilizes both spatial and +force information from 3D tactile data to accomplish the tasks. The videos can +be viewed at https://3dtacdex.github.io. + +
+
+
+
+
+ + ☆ Robotic Environmental State Recognition with Pre-Trained Vision-Language + Models and Black-Box Optimization + + +
+ In order for robots to autonomously navigate and operate in diverse +environments, it is essential for them to recognize the state of their +environment. On the other hand, the environmental state recognition has +traditionally involved distinct methods tailored to each state to be +recognized. In this study, we perform a unified environmental state recognition +for robots through the spoken language with pre-trained large-scale +vision-language models. We apply Visual Question Answering and Image-to-Text +Retrieval, which are tasks of Vision-Language Models. We show that with our +method, it is possible to recognize not only whether a room door is +open/closed, but also whether a transparent door is open/closed and whether +water is running in a sink, without training neural networks or manual +programming. In addition, the recognition accuracy can be improved by selecting +appropriate texts from the set of prepared texts based on black-box +optimization. For each state recognition, only the text set and its weighting +need to be changed, eliminating the need to prepare multiple different models +and programs, and facilitating the management of source code and computer +resource. We experimentally demonstrate the effectiveness of our method and +apply it to the recognition behavior on a mobile robot, Fetch. + +
+
+ comment: Accepted at Advanced Robotics, website - + https://haraduka.github.io/vlm-bbo/ +
+
+
+
+
+ + ☆ Precise Interception Flight Targets by Image-based Visual Servoing of + Multicopter + + +
+ Interception of low-altitude intruding targets with low-cost drones equipped +strapdown camera presents a competitive option. However, the malicious +maneuvers by the non-cooperative target and the coupling of the camera make the +task challenging. To solve this problem, an Image-Based Visual Servoing (IBVS) +control algorithm based on proportional navigation guidance with field-of-view +holding capability is designed. The proposed controller reduces the miss +distance while improving the stability of the visual servo system during +interception. Software-in-the-loop (SITL) simulation experiments show a 72.8% +reduction in the circular error probability (CEP) compared to the most recent +study. This improvement enhances interception accuracy from the decimeter to +the centimeter level. Real-world experiments further validate the effectiveness +of the proposed algorithm. + +
+
+ comment: 9 pages, 15 figures, In the process of being submitted to the Journal + of IEEE Transactions on Industrial Electronics +
+
+
+
+
+ + ☆ Traverse the Non-Traversable: Estimating Traversability for Wheeled + Mobility on Vertically Challenging Terrain + + +
+ Most traversability estimation techniques divide off-road terrain into +traversable (e.g., pavement, gravel, and grass) and non-traversable (e.g., +boulders, vegetation, and ditches) regions and then inform subsequent planners +to produce trajectories on the traversable part. However, recent research +demonstrated that wheeled robots can traverse vertically challenging terrain +(e.g., extremely rugged boulders comparable in size to the vehicles +themselves), which unfortunately would be deemed as non-traversable by existing +techniques. Motivated by such limitations, this work aims at identifying the +traversable from the seemingly non-traversable, vertically challenging terrain +based on past kinodynamic vehicle-terrain interactions in a data-driven manner. +Our new Traverse the Non-Traversable(TNT) traversability estimator can +efficiently guide a down-stream sampling-based planner containing a +high-precision 6-DoF kinodynamic model, which becomes deployable onboard a +small-scale vehicle. Additionally, the estimated traversability can also be +used as a costmap to plan global and local paths without sampling. Our +experiment results show that TNT can improve planning performance, efficiency, +and stability by 50%, 26.7%, and 9.2% respectively on a physical robot +platform. + +
+
+ comment: for associated video file, see + https://www.youtube.com/watch?v=Shcalb8sGcA +
+
+
+
+
+ + ☆ Tactile Probabilistic Contact Dynamics Estimation of Unknown Objects + + +
+ We study the problem of rapidly identifying contact dynamics of unknown +objects in partially known environments. The key innovation of our method is a +novel formulation of the contact dynamics estimation problem as the joint +estimation of contact geometries and physical parameters. We leverage DeepSDF, +a compact and expressive neural-network-based geometry representation over a +distribution of geometries, and adopt a particle filter to estimate both the +geometries in contact and the physical parameters. In addition, we couple the +estimator with an active exploration strategy that plans information-gathering +moves to further expedite online estimation. Through simulation and physical +experiments, we show that our method estimates accurate contact dynamics with +fewer than 30 exploration moves for unknown objects touching partially known +environments. + +
+
+
+
+
+ + ☆ Verti-Selector: Automatic Curriculum Learning for Wheeled Mobility on + Vertically Challenging Terrain + + +
+ Reinforcement Learning (RL) has the potential to enable extreme off-road +mobility by circumventing complex kinodynamic modeling, planning, and control +by simulated end-to-end trial-and-error learning experiences. However, most RL +methods are sample-inefficient when training in a large amount of manually +designed simulation environments and struggle at generalizing to the real +world. To address these issues, we introduce Verti-Selector (VS), an automatic +curriculum learning framework designed to enhance learning efficiency and +generalization by selectively sampling training terrain. VS prioritizes +vertically challenging terrain with higher Temporal Difference (TD) errors when +revisited, thereby allowing robots to learn at the edge of their evolving +capabilities. By dynamically adjusting the sampling focus, VS significantly +boosts sample efficiency and generalization within the VW-Chrono simulator +built on the Chrono multi-physics engine. Furthermore, we provide simulation +and physical results using VS on a Verti-4-Wheeler platform. These results +demonstrate that VS can achieve 23.08% improvement in terms of success rate by +efficiently sampling during training and robustly generalizing to the real +world. + +
+
+
+
+
+ + ☆ Cat-and-Mouse Satellite Dynamics: Divergent Adversarial Reinforcement + Learning for Contested Multi-Agent Space Operations + + +
+ As space becomes increasingly crowded and contested, robust autonomous +capabilities for multi-agent environments are gaining critical importance. +Current autonomous systems in space primarily rely on optimization-based path +planning or long-range orbital maneuvers, which have not yet proven effective +in adversarial scenarios where one satellite is actively pursuing another. We +introduce Divergent Adversarial Reinforcement Learning (DARL), a two-stage +Multi-Agent Reinforcement Learning (MARL) approach designed to train autonomous +evasion strategies for satellites engaged with multiple adversarial spacecraft. +Our method enhances exploration during training by promoting diverse +adversarial strategies, leading to more robust and adaptable evader models. We +validate DARL through a cat-and-mouse satellite scenario, modeled as a +partially observable multi-agent capture the flag game where two adversarial +`cat' spacecraft pursue a single `mouse' evader. DARL's performance is compared +against several benchmarks, including an optimization-based satellite path +planner, demonstrating its ability to produce highly robust models for +adversarial multi-agent space environments. + +
+
+
+
+
+ + ☆ Active Vision Might Be All You Need: Exploring Active Vision in Bimanual + Robotic Manipulation + + +
+ Imitation learning has demonstrated significant potential in performing +high-precision manipulation tasks using visual feedback from cameras. However, +it is common practice in imitation learning for cameras to be fixed in place, +resulting in issues like occlusion and limited field of view. Furthermore, +cameras are often placed in broad, general locations, without an effective +viewpoint specific to the robot's task. In this work, we investigate the +utility of active vision (AV) for imitation learning and manipulation, in +which, in addition to the manipulation policy, the robot learns an AV policy +from human demonstrations to dynamically change the robot's camera viewpoint to +obtain better information about its environment and the given task. We +introduce AV-ALOHA, a new bimanual teleoperation robot system with AV, an +extension of the ALOHA 2 robot system, incorporating an additional 7-DoF robot +arm that only carries a stereo camera and is solely tasked with finding the +best viewpoint. This camera streams stereo video to an operator wearing a +virtual reality (VR) headset, allowing the operator to control the camera pose +using head and body movements. The system provides an immersive teleoperation +experience, with bimanual first-person control, enabling the operator to +dynamically explore and search the scene and simultaneously interact with the +environment. We conduct imitation learning experiments of our system both in +real-world and in simulation, across a variety of tasks that emphasize +viewpoint planning. Our results demonstrate the effectiveness of human-guided +AV for imitation learning, showing significant improvements over fixed cameras +in tasks with limited visibility. Project website: +https://soltanilara.github.io/av-aloha/ + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Progress Towards Submersible Microrobots: A Novel 13-mg Low-Power + SMA-Based Actuator for Underwater Propulsion ICRA + + +
+ We introduce a new low-power 13-mg microactuator driven by shape-memory alloy +(SMA) wires for underwater operation. The development of this device was +motivated by the recent creation of microswimmers such as the FRISHBot, +WaterStrider, VLEIBot, VLEIBot+, and VLEIBot++. The first four of these robots, +ranging from 30 to 90 mg, function tethered to an electrical power supply while +the last platform is an 810-mg fully autonomous system. These five robots are +driven by dry SMA-based microactuators first developed for microrobotic +crawlers such as the SMALLBug and SMARTI. As shown in this abstract, dry +SMA-based actuators do not operate efficiently under water due to high +heat-transfer rates in this medium; for example, the actuators that drive the +VLEIBot++ require about 40 mW of average power at 1 Hz in dry air while +requiring about 900 mW of average power at 1 Hz in water. In contrast, the +microactuator presented in this abstract consumes about 150 mW of average power +at 1 Hz in both dry air and water; additionally, it can be excited directly +using an onboard battery through simple power electronics implemented on a +custom-built printed circuit board (PCB). This technological breakthrough was +enabled by the integration of a soft structure that encapsulates the SMA wires +that drive the actuator in order to passively control the rates of heat +transfer. The results presented here represent preliminary, yet compelling, +experimental evidence that the proposed actuation approach will enable the +development of fully autonomous and controllable submersible microswimmers. To +accomplish this objective, we will evolve the current version of the VLEIBot++ +and introduce new bioinspired underwater propulsion mechanisms. + +
+
+ comment: Presented at 40th Anniversary of the IEEE International Conference on + Robotics and Automation (ICRA@40) +
+
+
+
+
+ + ☆ Accelerated gradient descent for high frequency Model Predictive Control + + +
+ The recent promises of Model Predictive Control in robotics have motivated +the development of tailored second-order methods to solve optimal control +problems efficiently. While those methods benefit from strong convergence +properties, tailored efficient implementations are challenging to derive. In +this work, we study the potential effectiveness of first-order methods and show +on a torque controlled manipulator that they can equal the performances of +second-order methods. + +
+
+
+
+
+ + ☆ Embodied-RAG: General non-parametric Embodied Memory for Retrieval and + Generation + + +
+ There is no limit to how much a robot might explore and learn, but all of +that knowledge needs to be searchable and actionable. Within language research, +retrieval augmented generation (RAG) has become the workhouse of large-scale +non-parametric knowledge, however existing techniques do not directly transfer +to the embodied domain, which is multimodal, data is highly correlated, and +perception requires abstraction. + To address these challenges, we introduce Embodied-RAG, a framework that +enhances the foundational model of an embodied agent with a non-parametric +memory system capable of autonomously constructing hierarchical knowledge for +both navigation and language generation. Embodied-RAG handles a full range of +spatial and semantic resolutions across diverse environments and query types, +whether for a specific object or a holistic description of ambiance. At its +core, Embodied-RAG's memory is structured as a semantic forest, storing +language descriptions at varying levels of detail. This hierarchical +organization allows the system to efficiently generate context-sensitive +outputs across different robotic platforms. We demonstrate that Embodied-RAG +effectively bridges RAG to the robotics domain, successfully handling over 200 +explanation and navigation queries across 19 environments, highlighting its +promise for general-purpose non-parametric system for embodied agents. + +
+
+ comment: Web: https://quanting-xie.github.io/Embodied-RAG-web/ +
+
+
+
+
+ + ☆ SOAR: Self-supervision Optimized UAV Action Recognition with Efficient + Object-Aware Pretraining + + +
+ We introduce SOAR, a novel Self-supervised pretraining algorithm for aerial +footage captured by Unmanned Aerial Vehicles (UAVs). We incorporate human +object knowledge throughout the pretraining process to enhance UAV video +pretraining efficiency and downstream action recognition performance. This is +in contrast to prior works that primarily incorporate object information during +the fine-tuning stage. Specifically, we first propose a novel object-aware +masking strategy designed to retain the visibility of certain patches related +to objects throughout the pretraining phase. Second, we introduce an +object-aware loss function that utilizes object information to adjust the +reconstruction loss, preventing bias towards less informative background +patches. In practice, SOAR with a vanilla ViT backbone, outperforms best UAV +action recognition models, recording a 9.7% and 21.4% boost in top-1 accuracy +on the NEC-Drone and UAV-Human datasets, while delivering an inference speed of +18.7ms per video, making it 2x to 5x faster. Additionally, SOAR obtains +comparable accuracy to prior self-supervised learning (SSL) methods while +requiring 87.5% less pretraining time and 25% less memory usage + +
+
+
+
+
+ + ☆ Flat'n'Fold: A Diverse Multi-Modal Dataset for Garment Perception and + Manipulation + + +
+ We present Flat'n'Fold, a novel large-scale dataset for garment manipulation +that addresses critical gaps in existing datasets. Comprising 1,212 human and +887 robot demonstrations of flattening and folding 44 unique garments across 8 +categories, Flat'n'Fold surpasses prior datasets in size, scope, and diversity. +Our dataset uniquely captures the entire manipulation process from crumpled to +folded states, providing synchronized multi-view RGB-D images, point clouds, +and action data, including hand or gripper positions and rotations. We quantify +the dataset's diversity and complexity compared to existing benchmarks and show +that our dataset features natural and diverse manipulations of real-world +demonstrations of human and robot demonstrations in terms of visual and action +information. To showcase Flat'n'Fold's utility, we establish new benchmarks for +grasping point prediction and subtask decomposition. Our evaluation of +state-of-the-art models on these tasks reveals significant room for +improvement. This underscores Flat'n'Fold's potential to drive advances in +robotic perception and manipulation of deformable objects. Our dataset can be +downloaded at https://cvas-ug.github.io/flat-n-fold + +
+
+
+
+
+ + Towards Safe and Efficient Through-the-Canopy Autonomous Fruit Counting + with UAVs + + +
+ We present an autonomous aerial system for safe and efficient +through-the-canopy fruit counting. Aerial robot applications in large-scale +orchards face significant challenges due to the complexity of fine-tuning +flight paths based on orchard layouts, canopy density, and plant variability. +Through-the-canopy navigation is crucial for minimizing occlusion by leaves and +branches but is more challenging due to the complex and dense environment +compared to traditional over-the-canopy flights. Our system addresses these +challenges by integrating: i) a high-fidelity simulation framework for +optimizing flight trajectories, ii) a low-cost autonomy stack for canopy-level +navigation and data collection, and iii) a robust workflow for fruit detection +and counting using RGB images. We validate our approach through fruit counting +with canopy-level aerial images and by demonstrating the autonomous navigation +capabilities of our experimental vehicle. + +
+
+
+
+
+ + ☆ Autonomous Excavation of Challenging Terrain using Oscillatory + Primitives and Adaptive Impedance Control + + +
+ This paper addresses the challenge of autonomous excavation of challenging +terrains, in particular those that are prone to jamming and inter-particle +adhesion when tackled by a standard penetrate-drag-scoop motion pattern. +Inspired by human excavation strategies, our approach incorporates oscillatory +rotation elements -- including swivel, twist, and dive motions -- to break up +compacted, tangled grains and reduce jamming. We also present an adaptive +impedance control method, the Reactive Attractor Impedance Controller (RAIC), +that adapts a motion trajectory to unexpected forces during loading in a manner +that tracks a trajectory closely when loads are low, but avoids excessive loads +when significant resistance is met. Our method is evaluated on four terrains +using a robotic arm, demonstrating improved excavation performance across +multiple metrics, including volume scooped, protective stop rate, and +trajectory completion percentage. + +
+
+
+
+
+ + ☆ UAV-Assisted Self-Supervised Terrain Awareness for Off-Road Navigation ICRA 2025 + + +
+ Terrain awareness is an essential milestone to enable truly autonomous +off-road navigation. Accurately predicting terrain characteristics allows +optimizing a vehicle's path against potential hazards. Recent methods use deep +neural networks to predict traversability-related terrain properties in a +self-supervised manner, relying on proprioception as a training signal. +However, onboard cameras are inherently limited by their point-of-view relative +to the ground, suffering from occlusions and vanishing pixel density with +distance. This paper introduces a novel approach for self-supervised terrain +characterization using an aerial perspective from a hovering drone. We capture +terrain-aligned images while sampling the environment with a ground vehicle, +effectively training a simple predictor for vibrations, bumpiness, and energy +consumption. Our dataset includes 2.8 km of off-road data collected in forest +environment, comprising 13 484 ground-based images and 12 935 aerial images. +Our findings show that drone imagery improves terrain property prediction by +21.37 % on the whole dataset and 37.35 % in high vegetation, compared to ground +robot images. We conduct ablation studies to identify the main causes of these +performance improvements. We also demonstrate the real-world applicability of +our approach by scouting an unseen area with a drone, planning and executing an +optimized path on the ground. + +
+
+ comment: 7 pages, 5 figures, submitted to ICRA 2025 +
+
+
+
+
+ + ☆ ReloPush: Multi-object Rearrangement in Confined Spaces with a + Nonholonomic Mobile Robot Pusher ICRA 2025 + + +
+ We focus on the problem of rearranging a set of objects within a confined +space with a nonholonomically constrained mobile robot pusher. This problem is +relevant to many real-world domains, including warehouse automation and +construction. These domains give rise to instances involving a combination of +geometric, kinematic, and physics constraints, which make planning particularly +challenging. Prior work often makes simplifying assumptions like the use of +holonomic mobile robots or dexterous manipulators capable of unconstrained +overhand reaching. Our key insight is we can empower even a constrained mobile +pusher to tackle complex rearrangement tasks by enabling it to modify the +environment to its favor in a constraint-aware fashion. To this end, we +describe a Push-Traversability graph, whose vertices represent poses that the +pusher can push objects from and edges represent optimal, kinematically +feasible, and stable push-rearrangements of objects. Based on this graph, we +develop ReloPush, a planning framework that leverages Dubins curves and +standard graph search techniques to generate an efficient sequence of object +rearrangements to be executed by the pusher. We evaluate ReloPush across a +series of challenging scenarios, involving the rearrangement of densely +cluttered workspaces with up to eight objects by a 1tenth mobile robot pusher. +ReloPush exhibits orders of magnitude faster runtimes and significantly more +robust execution in the real world, evidenced in lower execution times and +fewer losses of object contact, compared to two baselines lacking our proposed +graph structure. + +
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ Learning to Drive via Asymmetric Self-Play ECCV 2024 + + +
+ Large-scale data is crucial for learning realistic and capable driving +policies. However, it can be impractical to rely on scaling datasets with real +data alone. The majority of driving data is uninteresting, and deliberately +collecting new long-tail scenarios is expensive and unsafe. We propose +asymmetric self-play to scale beyond real data with additional challenging, +solvable, and realistic synthetic scenarios. Our approach pairs a teacher that +learns to generate scenarios it can solve but the student cannot, with a +student that learns to solve them. When applied to traffic simulation, we learn +realistic policies with significantly fewer collisions in both nominal and +long-tail scenarios. Our policies further zero-shot transfer to generate +training data for end-to-end autonomy, significantly outperforming +state-of-the-art adversarial approaches, or using real data alone. For more +information, visit https://waabi.ai/selfplay . + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ MemFusionMap: Working Memory Fusion for Online Vectorized HD Map + Construction + + +
+ High-definition (HD) maps provide environmental information for autonomous +driving systems and are essential for safe planning. While existing methods +with single-frame input achieve impressive performance for online vectorized HD +map construction, they still struggle with complex scenarios and occlusions. We +propose MemFusionMap, a novel temporal fusion model with enhanced temporal +reasoning capabilities for online HD map construction. Specifically, we +contribute a working memory fusion module that improves the model's memory +capacity to reason across history frames. We also design a novel temporal +overlap heatmap to explicitly inform the model about the temporal overlap +information and vehicle trajectory in the Bird's Eye View space. By integrating +these two designs, MemFusionMap significantly outperforms existing methods +while also maintaining a versatile design for scalability. We conduct extensive +evaluation on open-source benchmarks and demonstrate a maximum improvement of +5.4% in mAP over state-of-the-art methods. The code for MemFusionMap will be +made open-source upon publication of this paper. + +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Valeo4Cast: A Modular Approach to End-to-End Forecasting ECCV + + +
+ Motion forecasting is crucial in autonomous driving systems to anticipate the +future trajectories of surrounding agents such as pedestrians, vehicles, and +traffic signals. In end-to-end forecasting, the model must jointly detect and +track from sensor data (cameras or LiDARs) the past trajectories of the +different elements of the scene and predict their future locations. We depart +from the current trend of tackling this task via end-to-end training from +perception to forecasting, and instead use a modular approach. We individually +build and train detection, tracking and forecasting modules. We then only use +consecutive finetuning steps to integrate the modules better and alleviate +compounding errors. We conduct an in-depth study on the finetuning strategies +and it reveals that our simple yet effective approach significantly improves +performance on the end-to-end forecasting benchmark. Consequently, our solution +ranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82 +mAPf. We surpass forecasting results by +17.1 points over last year's winner +and by +13.3 points over this year's runner-up. This remarkable performance in +forecasting can be explained by our modular paradigm, which integrates +finetuning strategies and significantly outperforms the end-to-end-trained +counterparts. The code, model weights and results are made available +https://github.com/valeoai/valeo4cast. + +
+
+ comment: Winning solution of the Argoverse 2 "Unified Detection, Tracking, and + Forecasting" challenge; work accepted at Road++ ECCVW 2024 +
+
+
+
+
+ + ♻ ☆ TypeFly: Flying Drones with Large Language Model + + +
+ Recent advancements in robot control using large language models (LLMs) have +demonstrated significant potential, primarily due to LLMs' capabilities to +understand natural language commands and generate executable plans in various +languages. However, in real-time and interactive applications involving mobile +robots, particularly drones, the sequential token generation process inherent +to LLMs introduces substantial latency, i.e. response time, in control plan +generation. + In this paper, we present a system called ChatFly that tackles this problem +using a combination of a novel programming language called MiniSpec and its +runtime to reduce the plan generation time and drone response time. That is, +instead of asking an LLM to write a program (robotic plan) in the popular but +verbose Python, ChatFly gets it to do it in MiniSpec specially designed for +token efficiency and stream interpretation. Using a set of challenging drone +tasks, we show that design choices made by ChatFly can reduce up to 62% +response time and provide a more consistent user experience, enabling +responsive and intelligent LLM-based drone control with efficient completion. + +
+
+
+
+
+ + ♻ ☆ LingoQA: Visual Question Answering for Autonomous Driving ECCV 2024 + + +
+ We introduce LingoQA, a novel dataset and benchmark for visual question +answering in autonomous driving. The dataset contains 28K unique short video +scenarios, and 419K annotations. Evaluating state-of-the-art vision-language +models on our benchmark shows that their performance is below human +capabilities, with GPT-4V responding truthfully to 59.6% of the questions +compared to 96.6% for humans. For evaluation, we propose a truthfulness +classifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation +coefficient to human evaluations, surpassing existing techniques like METEOR, +BLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run +extensive ablation studies to understand its performance. We release our +dataset and benchmark as an evaluation platform for vision-language models in +autonomous driving. + +
+
+ comment: Accepted to ECCV 2024. Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ An Active Perception Game for Robust Information Gathering + + +
+ Active perception approaches select future viewpoints by using some estimate +of the information gain. An inaccurate estimate can be detrimental in critical +situations, e.g., locating a person in distress. However the true information +gained can only be calculated post hoc, i.e., after the observation is +realized. We present an approach for estimating the discrepancy between the +information gain (which is the average over putative future observations) and +the true information gain. The key idea is to analyze the mathematical +relationship between active perception and the estimation error of the +information gain in a game-theoretic setting. Using this, we develop an online +estimation approach that achieves sub-linear regret (in the number of +time-steps) for the estimation of the true information gain and reduces the +sub-optimality of active perception systems. + We demonstrate our approach for active perception using a comprehensive set +of experiments on: (a) different types of environments, including a quadrotor +in a photorealistic simulation, real-world robotic data, and real-world +experiments with ground robots exploring indoor and outdoor scenes; (b) +different types of robotic perception data; and (c) different map +representations. On average, our approach reduces information gain estimation +errors by 42%, increases the information gain by 7%, PSNR by 5%, and semantic +accuracy (measured as the number of objects that are localized correctly) by +6%. In real-world experiments with a Jackal ground robot, our approach +demonstrated complex trajectories to explore occluded regions. + +
+
+
+
+
+ + ♻ ☆ OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds ICRA + + +
+ A Colored point cloud, as a simple and efficient 3D representation, has many +advantages in various fields, including robotic navigation and scene +reconstruction. This representation is now commonly used in 3D reconstruction +tasks relying on cameras and LiDARs. However, fusing data from these two types +of sensors is poorly performed in many existing frameworks, leading to +unsatisfactory mapping results, mainly due to inaccurate camera poses. This +paper presents OmniColor, a novel and efficient algorithm to colorize point +clouds using an independent 360-degree camera. Given a LiDAR-based point cloud +and a sequence of panorama images with initial coarse camera poses, our +objective is to jointly optimize the poses of all frames for mapping images +onto geometric reconstructions. Our pipeline works in an off-the-shelf manner +that does not require any feature extraction or matching process. Instead, we +find optimal poses by directly maximizing the photometric consistency of LiDAR +maps. In experiments, we show that our method can overcome the severe visual +distortion of omnidirectional images and greatly benefit from the wide field of +view (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy +and stability. The code will be released at +https://github.com/liubonan123/OmniColor/. + +
+
+ comment: 2024 IEEE International Conference on Robotics and Automation (ICRA) +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ Gaussian-LIC: Real-Time Photo-Realistic SLAM with Gaussian Splatting and + LiDAR-Inertial-Camera Fusion + + +
+ In this paper, we present a real-time photo-realistic SLAM method based on +marrying Gaussian Splatting with LiDAR-Inertial-Camera SLAM. Most existing +radiance-field-based SLAM systems mainly focus on bounded indoor environments, +equipped with RGB-D or RGB sensors. However, they are prone to decline when +expanding to unbounded scenes or encountering adverse conditions, such as +violent motions and changing illumination. In contrast, oriented to general +scenarios, our approach additionally tightly fuses LiDAR, IMU, and camera for +robust pose estimation and photo-realistic online mapping. To compensate for +regions unobserved by the LiDAR, we propose to integrate both the triangulated +visual points from images and LiDAR points for initializing 3D Gaussians. In +addition, the modeling of the sky and varying camera exposure have been +realized for high-quality rendering. Notably, we implement our system purely +with C++ and CUDA, and meticulously design a series of strategies to accelerate +the online optimization of the Gaussian-based scene representation. Extensive +experiments demonstrate that our method outperforms its counterparts while +maintaining real-time capability. Impressively, regarding photo-realistic +mapping, our method with our estimated poses even surpasses all the compared +approaches that utilize privileged ground-truth poses for mapping. Our code +will be released on project page https://xingxingzuo.github.io/gaussian_lic. + +
+
+
+
+
+ + ♻ ☆ AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous + Driving ECCV 2024 + + +
+ The scale-up of autonomous vehicles depends heavily on their ability to deal +with anomalies, such as rare objects on the road. In order to handle such +situations, it is necessary to detect anomalies in the first place. Anomaly +detection for autonomous driving has made great progress in the past years but +suffers from poorly designed benchmarks with a strong focus on camera data. In +this work, we propose AnoVox, the largest benchmark for ANOmaly detection in +autonomous driving to date. AnoVox incorporates large-scale multimodal sensor +data and spatial VOXel ground truth, allowing for the comparison of methods +independent of their used sensor. We propose a formal definition of normality +and provide a compliant training dataset. AnoVox is the first benchmark to +contain both content and temporal anomalies. + +
+
+ comment: Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\"o{\ss}ler + contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop +
+
+
+
+
+ + ♻ Humanoid Parkour Learning CoRL 2024 + + +
+ Parkour is a grand challenge for legged locomotion, even for quadruped +robots, requiring active perception and various maneuvers to overcome multiple +challenging obstacles. Existing methods for humanoid locomotion either optimize +a trajectory for a single parkour track or train a reinforcement learning +policy only to walk with a significant amount of motion references. In this +work, we propose a framework for learning an end-to-end vision-based +whole-body-control parkour policy for humanoid robots that overcomes multiple +parkour skills without any motion prior. Using the parkour policy, the humanoid +robot can jump on a 0.42m platform, leap over hurdles, 0.8m gaps, and much +more. It can also run at 1.8m/s in the wild and walk robustly on different +terrains. We test our policy in indoor and outdoor environments to demonstrate +that it can autonomously select parkour skills while following the rotation +command of the joystick. We override the arm actions and show that this +framework can easily transfer to humanoid mobile manipulation tasks. Videos can +be found at https://humanoid4parkour.github.io + +
+
+ comment: Published on CoRL 2024 +
+
+
+
+
+ + ♻ ☆ General-purpose Clothes Manipulation with Semantic Keypoints + + +
+ Clothes manipulation is a critical skill for household robots. Recent +advancements have been made in task-specific clothes manipulation, such as +folding, flattening, and hanging. However, due to clothes' complex geometries +and deformability, creating a general-purpose robot system that can manipulate +a diverse range of clothes in many ways remains challenging. Since clothes are +typically designed with specific structures, we propose identifying these +specific features like ``left sleeve'' as semantic keypoints. Semantic +keypoints can provide semantic cues for task planning and geometric cues for +low-level action generation. With this insight, we develop a hierarchical +learning framework using the large language model (LLM) for general-purpose +CLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation +experiments show that CLASP outperforms baseline methods on both seen and +unseen tasks across various clothes manipulation tasks. Real-world experiments +show that CLASP can be directly deployed in the real world and applied to a +wide variety of clothes. + +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing ICRA 2024 + + +
+ Cooking robots can enhance the home experience by reducing the burden of +daily chores. However, these robots must perform their tasks dexterously and +safely in shared human environments, especially when handling dangerous tools +such as kitchen knives. This study focuses on enabling a robot to autonomously +and safely learn food-cutting tasks. More specifically, our goal is to enable a +collaborative robot or industrial robot arm to perform food-slicing tasks by +adapting to varying material properties using compliance control. Our approach +involves using Reinforcement Learning (RL) to train a robot to compliantly +manipulate a knife, by reducing the contact forces exerted by the food items +and by the cutting board. However, training the robot in the real world can be +inefficient, and dangerous, and result in a lot of food waste. Therefore, we +proposed SliceIt!, a framework for safely and efficiently learning robot +food-slicing tasks in simulation. Following a real2sim2real approach, our +framework consists of collecting a few real food slicing data, calibrating our +dual simulation environment (a high-fidelity cutting simulator and a robotic +simulator), learning compliant control policies on the calibrated simulation +environment, and finally, deploying the policies on the real robot. + +
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Learning Variable Compliance Control From a Few Demonstrations for + Bimanual Robot with Haptic Feedback Teleoperation System IROS 2024 + + +
+ Automating dexterous, contact-rich manipulation tasks using rigid robots is a +significant challenge in robotics. Rigid robots, defined by their actuation +through position commands, face issues of excessive contact forces due to their +inability to adapt to contact with the environment, potentially causing damage. +While compliance control schemes have been introduced to mitigate these issues +by controlling forces via external sensors, they are hampered by the need for +fine-tuning task-specific controller parameters. Learning from Demonstrations +(LfD) offers an intuitive alternative, allowing robots to learn manipulations +through observed actions. In this work, we introduce a novel system to enhance +the teaching of dexterous, contact-rich manipulations to rigid robots. Our +system is twofold: firstly, it incorporates a teleoperation interface utilizing +Virtual Reality (VR) controllers, designed to provide an intuitive and +cost-effective method for task demonstration with haptic feedback. Secondly, we +present Comp-ACT (Compliance Control via Action Chunking with Transformers), a +method that leverages the demonstrations to learn variable compliance control +from a few demonstrations. Our methods have been validated across various +complex contact-rich manipulation tasks using single-arm and bimanual robot +setups in simulated and real-world environments, demonstrating the +effectiveness of our system in teaching robots dexterous manipulations with +enhanced adaptability and safety. Code available at: +https://github.com/omron-sinicx/CompACT + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ A Learning Framework for Diverse Legged Robot Locomotion Using + Barrier-Based Style Rewards + + +
+ This work introduces a model-free reinforcement learning framework that +enables various modes of motion (quadruped, tripod, or biped) and diverse tasks +for legged robot locomotion. We employ a motion-style reward based on a relaxed +logarithmic barrier function as a soft constraint, to bias the learning process +toward the desired motion style, such as gait, foot clearance, joint position, +or body height. The predefined gait cycle is encoded in a flexible manner, +facilitating gait adjustments throughout the learning process. Extensive +experiments demonstrate that KAIST HOUND, a 45 kg robotic system, can achieve +biped, tripod, and quadruped locomotion using the proposed framework; +quadrupedal capabilities include traversing uneven terrain, galloping at 4.67 +m/s, and overcoming obstacles up to 58 cm (67 cm for HOUND2); bipedal +capabilities include running at 3.6 m/s, carrying a 7.5 kg object, and +ascending stairs-all performed without exteroceptive input. + +
+
+ comment: 7 pages, 5 figures, Videos at https://youtu.be/JV2_HfTlOKI +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ Plant Robots: Harnessing Growth Actuation of Plants for Locomotion and + Object Manipulation + + +
+ Plants display physical displacements during their growth due to +photosynthesis, which converts light into chemical energy. This can be +interpreted as plants acting as actuators with a built-in power source. This +paper presents a method to create plant robots that move and perform tasks by +harnessing the actuation output of plants: displacement and force generated +from the growing process. As the target plant, radish sprouts are employed, and +their displacement and force are characterized, followed by the calculation of +power and energy densities. Based on the characterization, two different plant +robots are designed and fabricated: a rotational robot and a gripper. The +former demonstrates ground locomotion, achieving a travel distance of 14.6 mm +with an average speed of 0.8 mm/h. The latter demonstrates the picking and +placing of an object with a 0.1-g mass by the light-controlled open-close +motion of plant fingers. A good agreement between the experimental and model +values is observed in the specific data of the mobile robot, suggesting that +obtaining the actuation characteristics of plants can enable the design and +prediction of behavior in plant robots. These results pave the way for the +realization of novel types of environmentally friendly and sustainable robots. + +
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Model Predictive Control for Magnetically-Actuated Cellbots + + +
+ This paper presents a control framework for magnetically actuated cellbots, +which combines Model Predictive Control (MPC) with Gaussian Processes (GPs) as +a disturbance estimator for precise trajectory tracking. To address the +challenges posed by unmodeled dynamics, we integrate data-driven modeling with +model-based control to accurately track desired trajectories using relatively +small data. To the best of our knowledge, this is the first work to integrate +data-driven modeling with model-based control for the magnetic actuation of +cellbots. The GP effectively learns and predicts unmodeled disturbances, +providing uncertainty bounds as well. We validate our method through +experiments with cellbots, demonstrating improved trajectory tracking accuracy. + +
+
+
+
+
+ + ♻ ☆ Clio: Real-time Task-Driven Open-Set 3D Scene Graphs + + +
+ Modern tools for class-agnostic image segmentation (e.g., SegmentAnything) +and open-set semantic understanding (e.g., CLIP) provide unprecedented +opportunities for robot perception and mapping. While traditional closed-set +metric-semantic maps were restricted to tens or hundreds of semantic classes, +we can now build maps with a plethora of objects and countless semantic +variations. This leaves us with a fundamental question: what is the right +granularity for the objects (and, more generally, for the semantic concepts) +the robot has to include in its map representation? While related work +implicitly chooses a level of granularity by tuning thresholds for object +detection, we argue that such a choice is intrinsically task-dependent. The +first contribution of this paper is to propose a task-driven 3D scene +understanding problem, where the robot is given a list of tasks in natural +language and has to select the granularity and the subset of objects and scene +structure to retain in its map that is sufficient to complete the tasks. We +show that this problem can be naturally formulated using the Information +Bottleneck (IB), an established information-theoretic framework. The second +contribution is an algorithm for task-driven 3D scene understanding based on an +Agglomerative IB approach, that is able to cluster 3D primitives in the +environment into task-relevant objects and regions and executes incrementally. +The third contribution is to integrate our task-driven clustering algorithm +into a real-time pipeline, named Clio, that constructs a hierarchical 3D scene +graph of the environment online using only onboard compute, as the robot +explores it. Our final contribution is an extensive experimental campaign +showing that Clio not only allows real-time construction of compact open-set 3D +scene graphs, but also improves the accuracy of task execution by limiting the +map to relevant semantic concepts. + +
+
+
+
+
+ + ♻ ☆ Splat-MOVER: Multi-Stage, Open-Vocabulary Robotic Manipulation via + Editable Gaussian Splatting + + +
+ We present Splat-MOVER, a modular robotics stack for open-vocabulary robotic +manipulation, which leverages the editability of Gaussian Splatting (GSplat) +scene representations to enable multi-stage manipulation tasks. Splat-MOVER +consists of: (i) ASK-Splat, a GSplat representation that distills semantic and +grasp affordance features into the 3D scene. ASK-Splat enables geometric, +semantic, and affordance understanding of 3D scenes, which is critical in many +robotics tasks; (ii) SEE-Splat, a real-time scene-editing module using 3D +semantic masking and infilling to visualize the motions of objects that result +from robot interactions in the real-world. SEE-Splat creates a "digital twin" +of the evolving environment throughout the manipulation task; and (iii) +Grasp-Splat, a grasp generation module that uses ASK-Splat and SEE-Splat to +propose affordance-aligned candidate grasps for open-world objects. ASK-Splat +is trained in real-time from RGB images in a brief scanning phase prior to +operation, while SEE-Splat and Grasp-Splat run in real-time during operation. +We demonstrate the superior performance of Splat-MOVER in hardware experiments +on a Kinova robot compared to two recent baselines in four single-stage, +open-vocabulary manipulation tasks and in four multi-stage manipulation tasks, +using the edited scene to reflect changes due to prior manipulation stages, +which is not possible with existing baselines. Video demonstrations and the +code for the project are available at https://splatmover.github.io. + +
+
+ comment: https://splatmover.github.io +
+
+
+
+
+ + ♻ ☆ GND: Global Navigation Dataset with Multi-Modal Perception and + Multi-Category Traversability in Outdoor Campus Environments + + +
+ Navigating large-scale outdoor environments requires complex reasoning in +terms of geometric structures, environmental semantics, and terrain +characteristics, which are typically captured by onboard sensors such as LiDAR +and cameras. While current mobile robots can navigate such environments using +pre-defined, high-precision maps based on hand-crafted rules catered for the +specific environment, they lack commonsense reasoning capabilities that most +humans possess when navigating unknown outdoor spaces. To address this gap, we +introduce the Global Navigation Dataset (GND), a large-scale dataset that +integrates multi-modal sensory data, including 3D LiDAR point clouds and RGB +and 360-degree images, as well as multi-category traversability maps +(pedestrian walkways, vehicle roadways, stairs, off-road terrain, and +obstacles) from ten university campuses. These environments encompass a variety +of parks, urban settings, elevation changes, and campus layouts of different +scales. The dataset covers approximately 2.7km2 and includes at least 350 +buildings in total. We also present a set of novel applications of GND to +showcase its utility to enable global robot navigation, such as map-based +global navigation, mapless navigation, and global place recognition. + +
+
+
+
+
+ + ♻ ☆ VITaL Pretraining: Visuo-Tactile Pretraining for Tactile and Non-Tactile + Manipulation Policies ICRA 2025 + + +
+ Tactile information is a critical tool for dexterous manipulation. As humans, +we rely heavily on tactile information to understand objects in our +environments and how to interact with them. We use touch not only to perform +manipulation tasks but also to learn how to perform these tasks. Therefore, to +create robotic agents that can learn to complete manipulation tasks at a human +or super-human level of performance, we need to properly incorporate tactile +information into both skill execution and skill learning. In this paper, we +investigate how we can incorporate tactile information into imitation learning +platforms to improve performance on manipulation tasks. We show that +incorporating visuo-tactile pretraining improves imitation learning +performance, not only for tactile agents (policies that use tactile information +at inference), but also for non-tactile agents (policies that do not use +tactile information at inference). For these non-tactile agents, pretraining +with tactile information significantly improved performance (for example, +improving the accuracy on USB plugging from 20% to 85%), reaching a level on +par with visuo-tactile agents, and even surpassing them in some cases. For +demonstration videos and access to our codebase, see the project website: +https://sites.google.com/andrew.cmu.edu/visuo-tactile-pretraining + +
+
+ comment: 7 pages, 6 figures, submitted to ICRA 2025. Prior version named + "Visuo-Tactile Pretraining for Cable Plugging" +
+
+
+
+
+ + ♻ ☆ GPT-4V(ision) for Robotics: Multimodal Task Planning from Human + Demonstration + + +
+ We introduce a pipeline that enhances a general-purpose Vision Language +Model, GPT-4V(ision), to facilitate one-shot visual teaching for robotic +manipulation. This system analyzes videos of humans performing tasks and +outputs executable robot programs that incorporate insights into affordances. +The process begins with GPT-4V analyzing the videos to obtain textual +explanations of environmental and action details. A GPT-4-based task planner +then encodes these details into a symbolic task plan. Subsequently, vision +systems spatially and temporally ground the task plan in the videos. Objects +are identified using an open-vocabulary object detector, and hand-object +interactions are analyzed to pinpoint moments of grasping and releasing. This +spatiotemporal grounding allows for the gathering of affordance information +(e.g., grasp types, waypoints, and body postures) critical for robot execution. +Experiments across various scenarios demonstrate the method's efficacy in +enabling real robots to operate from one-shot human demonstrations. Meanwhile, +quantitative tests have revealed instances of hallucination in GPT-4V, +highlighting the importance of incorporating human supervision within the +pipeline. The prompts of GPT-4V/GPT-4 are available at this project page: +https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/ + +
+
+ comment: 8 pages, 10 figures, 3 tables. Published in IEEE Robotics and + Automation Letters (RA-L) (in press). Last updated on September 26th, 2024 +
+
+
+
+
+
+
+
+ + Systems and Control 30 + +
+
+
+ + ☆ A Sim-to-Real Vision-based Lane Keeping System for a 1:10-scale + Autonomous Vehicle + + +
+ In recent years, several competitions have highlighted the need to +investigate vision-based solutions to address scenarios with functional +insufficiencies in perception, world modeling and localization. This article +presents the Vision-based Lane Keeping System (VbLKS) developed by the +DEI-Unipd Team within the context of the Bosch Future Mobility Challenge 2022. +The main contribution lies in a Simulation-to-Reality (Sim2Real) GPS-denied +VbLKS for a 1:10-scale autonomous vehicle. In this VbLKS, the input to a +tailored Pure Pursuit (PP) based control strategy, namely the Lookahead Heading +Error (LHE), is estimated at a constant lookahead distance employing a +Convolutional Neural Network (CNN). A training strategy for a compact CNN is +proposed, emphasizing data generation and augmentation on simulated camera +images from a 3D Gazebo simulator, and enabling real-time operation on +low-level hardware. A tailored PP-based lateral controller equipped with a +derivative action and a PP-based velocity reference generation are implemented. +Tuning ranges are established through a systematic time-delay stability +analysis. Validation in a representative controlled laboratory setting is +provided. + +
+
+ comment: 16 pages, 23 figures +
+
+
+
+
+ + ☆ End-to-end guarantees for indirect data-driven control of bilinear + systems with finite stochastic data + + +
+ In this paper we propose an end-to-end algorithm for indirect data-driven +control for bilinear systems with stability guarantees. We consider the case +where the collected i.i.d. data is affected by probabilistic noise with +possibly unbounded support and leverage tools from statistical learning theory +to derive finite sample identification error bounds. To this end, we solve the +bilinear identification problem by solving a set of linear and affine +identification problems, by a particular choice of a control input during the +data collection phase. We provide a priori as well as data-dependent finite +sample identification error bounds on the individual matrices as well as +ellipsoidal bounds, both of which are structurally suitable for control. +Further, we integrate the structure of the derived identification error bounds +in a robust controller design to obtain an exponentially stable closed-loop. By +means of an extensive numerical study we showcase the interplay between the +controller design and the derived identification error bounds. Moreover, we +note appealing connections of our results to indirect data-driven control of +general nonlinear systems through Koopman operator theory and discuss how our +results may be applied in this setup. + +
+
+
+
+
+ + ☆ Control Industrial Automation System with Large Language Models + + +
+ Traditional industrial automation systems require specialized expertise to +operate and complex reprogramming to adapt to new processes. Large language +models offer the intelligence to make them more flexible and easier to use. +However, LLMs' application in industrial settings is underexplored. This paper +introduces a framework for integrating LLMs to achieve end-to-end control of +industrial automation systems. At the core of the framework are an agent system +designed for industrial tasks, a structured prompting method, and an +event-driven information modeling mechanism that provides real-time data for +LLM inference. The framework supplies LLMs with real-time events on different +context semantic levels, allowing them to interpret the information, generate +production plans, and control operations on the automation system. It also +supports structured dataset creation for fine-tuning on this downstream +application of LLMs. Our contribution includes a formal system design, +proof-of-concept implementation, and a method for generating task-specific +datasets for LLM fine-tuning and testing. This approach enables a more adaptive +automation system that can respond to spontaneous events, while allowing easier +operation and configuration through natural language for more intuitive +human-machine interaction. We provide demo videos and detailed data on GitHub: +https://github.com/YuchenXia/LLM4IAS + +
+
+
+
+
+ + ☆ Distributed Invariant Unscented Kalman Filter based on Inverse + Covariance Intersection with Intermittent Measurements + + +
+ This paper studies the problem of distributed state estimation (DSE) over +sensor networks on matrix Lie groups, which is crucial for applications where +system states evolve on Lie groups rather than vector spaces. We propose a +diffusion-based distributed invariant Unscented Kalman Filter using the inverse +covariance intersection (DIUKF-ICI) method to address target tracking in 3D +environments. Unlike existing distributed UKFs confined to vector spaces, our +approach extends the distributed UKF framework to Lie groups, enabling local +estimates to be fused with intermediate information from neighboring agents on +Lie groups. To handle the unknown correlations across local estimates, we +extend the ICI fusion strategy to matrix Lie groups for the first time and +integrate it into the diffusion algorithm. We demonstrate that the estimation +error of the proposed method is bounded. Additionally, the algorithm is fully +distributed, robust against intermittent measurements, and adaptable to +time-varying communication topologies. The effectiveness of the proposed method +is validated through extensive Monte-Carlo simulations. + +
+
+
+
+
+ + ☆ Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or + Low-light Conditions ECCV 2024 + + +
+ The stark contrast in the design philosophy of an event camera makes it +particularly ideal for operating under high-speed, high dynamic range and +low-light conditions, where standard cameras underperform. Nonetheless, event +cameras still suffer from some amount of motion blur, especially under these +challenging conditions, in contrary to what most think. This is attributed to +the limited bandwidth of the event sensor pixel, which is mostly proportional +to the light intensity. Thus, to ensure that event cameras can truly excel in +such conditions where it has an edge over standard cameras, it is crucial to +account for event motion blur in downstream applications, especially +reconstruction. However, none of the recent works on reconstructing Neural +Radiance Fields (NeRFs) from events, nor event simulators, have considered the +full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a +novel method to directly and effectively reconstruct blur-minimal NeRFs from +motion-blurred events generated under high-speed motion or low-light +conditions. The core component of this work is a physically-accurate pixel +bandwidth model proposed to account for event motion blur under arbitrary speed +and lighting conditions. We also introduce a novel threshold-normalized total +variation loss to improve the regularization of large textureless patches. +Experiments on real and novel realistically simulated sequences verify our +effectiveness. Our code, event simulator and synthetic event dataset will be +open-sourced. + +
+
+ comment: Accepted to ECCV 2024. Project website is accessible at + https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with + arXiv:2006.07722 by other authors +
+
+
+
+
+ + ☆ Intelligent Energy Management: Remaining Useful Life Prediction and + Charging Automation System Comprised of Deep Learning and the Internet of + Things + + +
+ Remaining Useful Life (RUL) of battery is an important parameter to know the +battery's remaining life and need for recharge. The goal of this research +project is to develop machine learning-based models for the battery RUL +dataset. Different ML models are developed to classify the RUL of the vehicle, +and the IoT (Internet of Things) concept is simulated for automating the +charging system and managing any faults aligning. The graphs plotted depict the +relationship between various vehicle parameters using the Blynk IoT platform. +Results show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent +Unit (GRU), and hybrid model developed could classify RUL into three classes +with 99% more accuracy. The data is fed using the tkinter GUI for simulating +artificial intelligence (AI)-based charging, and with a pyserial backend, data +can be entered into the Esp-32 microcontroller for making charge discharge +possible with the model's predictions. Also, with an IoT system, the charging +can be disconnected, monitored, and analyzed for automation. The results show +that an accuracy of 99% can be obtained on models MLP, catboost model and +similar accuracy on GRU model can be obtained, and finally relay-based +triggering can be made by prediction through the model used for automating the +charging and energy-saving mechanism. By showcasing an exemplary Blynk +platform-based monitoring and automation phenomenon, we further present +innovative ways of monitoring parameters and automating the system. + +
+
+
+
+
+ + ☆ Observer-Based Discontinuous Communication in the Secondary Control of + AC Microgrids + + +
+ This paper proposes an observer-based event-driven approach to decrease the +overuse of communication networks. The suggested approach aims to estimate the +required data for sharing between units in line with as much communication +reduction as possible. In other words, the proposed approach effectively +determines which state variables should be shared (observer concept) among the +units during specific time intervals (event-triggered concept). This strategy +significantly reduces the overall communication load. It is shown that the +estimation error remains bounded and Zeno behavior, characterized by an endless +number of transmissions occurring within a limited time frame, does not occur. +The proposed methodology can be systematically applied to any +communication-based secondary controller in alternating current (AC) +microgrids. Simulation results demonstrate a high degree of precision in +estimating the states under the proposed approach. Also, the secondary +controller performance under the proposed method is evaluated in +MATLAB/Simulink environment. + +
+
+ comment: 2024 IEEE PES Innovative Smart Grid Technologies Europe (ISGT Europe) +
+
+
+
+
+ + ☆ PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR + + +
+ LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous +driving, offering precise 3D spatial information. Previous signal attacks +against LiDAR systems mainly exploit laser signals. In this paper, we +investigate the possibility of cross-modality signal injection attacks, i.e., +injecting intentional electromagnetic interference (IEMI) to manipulate LiDAR +output. Our insight is that the internal modules of a LiDAR, i.e., the laser +receiving circuit, the monitoring sensors, and the beam-steering modules, even +with strict electromagnetic compatibility (EMC) testing, can still couple with +the IEMI attack signals and result in the malfunction of LiDAR systems. Based +on the above attack surfaces, we propose the PhantomLiDAR attack, which +manipulates LiDAR output in terms of Points Interference, Points Injection, +Points Removal, and even LiDAR Power-Off. We evaluate and demonstrate the +effectiveness of PhantomLiDAR with both simulated and real-world experiments on +five COTS LiDAR systems. We also conduct feasibility experiments in real-world +moving scenarios. We provide potential defense measures that can be implemented +at both the sensor level and the vehicle system level to mitigate the risks +associated with IEMI attacks. Video demonstrations can be viewed at +https://sites.google.com/view/phantomlidar. + +
+
+
+
+
+ + ☆ Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV + Attitude Control Under Varying Wind Conditions + + +
+ This paper evaluates and compares the performance of model-free and +model-based reinforcement learning for the attitude control of fixed-wing +unmanned aerial vehicles using PID as a reference point. The comparison focuses +on their ability to handle varying flight dynamics and wind disturbances in a +simulated environment. Our results show that the Temporal Difference Model +Predictive Control agent outperforms both the PID controller and other +model-free reinforcement learning methods in terms of tracking accuracy and +robustness over different reference difficulties, particularly in nonlinear +flight regimes. Furthermore, we introduce actuation fluctuation as a key metric +to assess energy efficiency and actuator wear, and we test two different +approaches from the literature: action variation penalty and conditioning for +action policy smoothness. We also evaluate all control methods when subject to +stochastic turbulence and gusts separately, so as to measure their effects on +tracking performance, observe their limitations and outline their implications +on the Markov decision process formalism. + +
+
+ comment: Published at ICINCO 2024 +
+
+
+
+
+ + ☆ Discontinuous Reception with Adjustable Inactivity Timer for IIoT + + +
+ Discontinuous reception (DRX) is a key technology for reducing the energy +consumption of industrial Internet of Things (IIoT) devices. Specifically, DRX +allows the devices to operate in a low-power mode when no data reception is +scheduled, and its effectiveness depends on the proper configuration of the DRX +parameters. In this paper, we characterize the DRX process departing from a +semi-Markov chain modeling. We detail two ways to set DRX parameters to +minimize the device power consumption while meeting a mean delay constraint. +The first method exhaustively searches for the optimal configuration. In +contrast, the second method uses a low-complexity metaheuristic to find a +sub-optimal configuration, thus considering ideal and practical DRX +configurations. Notably, within the DRX parameters, the inactivity timer (IT) +is a caution time that specifies how long a device remains active after the +last information exchange. Traditionally, a device implementing DRX will +restart the IT after each data reception as a precedent to a low-power mode. +The usual approach lies in restarting the IT whenever new data is received +during this cautious period, which might sometimes needlessly extend the active +time. Herein, we propose a more efficient method in which the transmit base +station (BS) explicitly indicates restarting the timer through the control +channel only when appropriate. The decision is taken based on the BS's +knowledge about its buffer status. We consider Poisson and bursty traffic +models, which are typical in IIoT setups, and verify the suitability of our +proposal for reducing the energy consumption of the devices without +significantly compromising the communication latency through extensive +numerical simulations. Specifically, energy-saving gains of up to 30% can be +obtained regardless of the arrival rate and delay constraints. + +
+
+ comment: IEEE Transactions on Industrial Informatics (2024) +
+
+
+
+
+ + ☆ Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations + Between Initial and Final Scenes + + +
+ With robots increasingly collaborating with humans in everyday tasks, it is +important to take steps toward robotic systems capable of understanding the +environment. This work focuses on scene understanding to detect pick and place +tasks given initial and final images from the scene. To this end, a dataset is +collected for object detection and pick and place task detection. A YOLOv5 +network is subsequently trained to detect the objects in the initial and final +scenes. Given the detected objects and their bounding boxes, two methods are +proposed to detect the pick and place tasks which transform the initial scene +into the final scene. A geometric method is proposed which tracks objects' +movements in the two scenes and works based on the intersection of the bounding +boxes which moved within scenes. Contrarily, the CNN-based method utilizes a +Convolutional Neural Network to classify objects with intersected bounding +boxes into 5 classes, showing the spatial relationship between the involved +objects. The performed pick and place tasks are then derived from analyzing the +experiments with both scenes. Results show that the CNN-based method, using a +VGG16 backbone, outscores the geometric method by roughly 12 percentage points +in certain scenarios, with an overall success rate of 84.3%. + +
+
+ comment: Conference Paper, ICEE 2024, 7 pages, 5 figures +
+
+
+
+
+ + ☆ On the Output Redundancy of LTI Systems: A Geometric Approach with + Application to Privacy + + +
+ This paper examines the properties of output-redundant systems, that is, +systems possessing a larger number of outputs than inputs, through the lenses +of the geometric approach of Wonham et al. We begin by formulating a simple +output allocation synthesis problem, which involves ``concealing" input +information from a malicious eavesdropper having access to the system output, +while still allowing for a legitimate user to reconstruct it. It is shown that +the solvability of this problem requires the availability of a redundant set of +outputs. This very problem is instrumental to unveiling the fundamental +geometric properties of output-redundant systems, which form the basis for our +subsequent constructions and results. As a direct application, we demonstrate +how output allocation can be employed to effectively protect the information of +input information from certain output eavesdroppers with guaranteed results. + +
+
+
+
+
+ + ☆ Semantic model for the description of energy data in the Module Type + Package + + +
+ Modular production systems that employ the Module Type Package (MTP) to +describe module interfaces can, at present, only communicate energy data +through proprietary solutions. Due to this limitation, users face additional +effort when calculating energy KPIs for modules or determining the energy +efficiency of modules. To address this issue, we present a model that +facilitates energy data to be described semantically and uniformly in the MTP +on the basis of an industrial standard (OPC 34100). MTPs incorporating this +model can transmit semantically consistent energy data from modules to the +process control system, making the data available for further applications, +such as monitoring or optimization. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Stereographic Projection of Probabilistic Frequency-Domain Uncertainty + + +
+ This paper investigates the stereographic projection of points along the +Nyquist plots of single input single output (SISO) linear time invariant (LTI) +systems subject to probabilistic uncertainty. At each frequency, there +corresponds a complex-valued random variable with given probability +distribution in the complex plane. The chordal distance between the +stereographic projections of this complex value and the corresponding value for +a nominal model, as per the well-known Nu-Gap metric of Vinnicombe, is also a +random quantity. The main result provides the cumulative density function (CDF) +of the chordal distance at a given frequency. Such a stochastic distance +framework opens up a fresh and a fertile research direction on probabilistic +robust control theory. + +
+
+
+
+
+ + ☆ GLinSAT: The General Linear Satisfiability Neural Network Layer By + Accelerated Gradient Descent + + +
+ Ensuring that the outputs of neural networks satisfy specific constraints is +crucial for applying neural networks to real-life decision-making problems. In +this paper, we consider making a batch of neural network outputs satisfy +bounded and general linear constraints. We first reformulate the neural network +output projection problem as an entropy-regularized linear programming problem. +We show that such a problem can be equivalently transformed into an +unconstrained convex optimization problem with Lipschitz continuous gradient +according to the duality theorem. Then, based on an accelerated gradient +descent algorithm with numerical performance enhancement, we present our +architecture, GLinSAT, to solve the problem. To the best of our knowledge, this +is the first general linear satisfiability layer in which all the operations +are differentiable and matrix-factorization-free. Despite the fact that we can +explicitly perform backpropagation based on automatic differentiation +mechanism, we also provide an alternative approach in GLinSAT to calculate the +derivatives based on implicit differentiation of the optimality condition. +Experimental results on constrained traveling salesman problems, partial graph +matching with outliers, predictive portfolio allocation and power system unit +commitment demonstrate the advantages of GLinSAT over existing satisfiability +layers. + +
+
+
+
+
+ + ☆ Optimal control of stochastic reaction networks with entropic control + cost and emergence of mode-switching strategies + + +
+ Controlling the stochastic dynamics of biological populations is a challenge +that arises across various biological contexts. However, these dynamics are +inherently nonlinear and involve a discrete state space, i.e., the number of +molecules, cells, or organisms. Additionally, the possibility of extinction has +a significant impact on both the dynamics and control strategies, particularly +when the population size is small. These factors hamper the direct application +of conventional control theories to biological systems. To address these +challenges, we formulate the optimal control problem for stochastic population +dynamics by utilizing a control cost function based on the Kullback-Leibler +divergence. This approach naturally accounts for population-specific factors +and simplifies the complex nonlinear Hamilton-Jacobi-Bellman equation into a +linear form, facilitating efficient computation of optimal solutions. We +demonstrate the effectiveness of our approach by applying it to the control of +interacting random walkers, Moran processes, and SIR models, and observe the +mode-switching phenomena in the control strategies. Our approach provides new +opportunities for applying control theory to a wide range of biological +problems. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Survey of Moving Target Defense in Power Grids: Design Principles, + Tradeoffs, and Future Directions + + +
+ Moving target defense (MTD) in power grids is an emerging defense technique +that has gained prominence in the recent past. It aims to solve the +long-standing problem of securing the power grid against stealthy attacks. The +key idea behind MTD is to introduce periodic/event-triggered controlled changes +to the power grid's SCADA network/physical plant, thereby invalidating the +knowledge attackers use for crafting stealthy attacks. In this paper, we +provide a comprehensive overview of this topic and classify the different ways +in which MTD is implemented in power grids. We further introduce the guiding +principles behind the design of MTD, key performance metrics, and the +associated trade-offs in MTD and identify the future development of MTD for +power grid security. + +
+
+ comment: 10 pages, 3 figures, survey +
+
+
+
+
+ + ☆ Multi-platoon car-following models with flexible platoon sizes and + communication levels + + +
+ In this paper, we extend a single platoon car-following (CF) model to some +multi-platoon CF models for connected and autonomous vehicles (CAVs) with +flexible platoon size and communication level. Specifically, we consider +forward and backward communication methods between platoons with delays. Some +general results of linear stability are mathematically proven, and numerical +simulations are performed to illustrate the effects of platoon sizes and +communication levels, as well as to demonstrate the potential for stabilizing +human-driven vehicles (HDVs) in mixed traffic conditions. The simulation +results are consistent with theoretical analysis, and demonstrate that in the +ring road scenario, CAV platoons can stabilize certain percentage of HDVs. This +paper can provide suggestions for the design of communication system of +autonomous vehicles (AVs), and management of mixed traffic flow of CAVs and +HDVs. + +
+
+ comment: Preprint for IEEE +
+
+
+
+
+ + ☆ Causality-based Subject and Task Fingerprints using fMRI Time-series + Data + + +
+ Recently, there has been a revived interest in system neuroscience causation +models due to their unique capability to unravel complex relationships in +multi-scale brain networks. In this paper, our goal is to verify the +feasibility and effectiveness of using a causality-based approach for fMRI +fingerprinting. Specifically, we propose an innovative method that utilizes the +causal dynamics activities of the brain to identify the unique cognitive +patterns of individuals (e.g., subject fingerprint) and fMRI tasks (e.g., task +fingerprint). The key novelty of our approach stems from the development of a +two-timescale linear state-space model to extract 'spatio-temporal' (aka +causal) signatures from an individual's fMRI time series data. To the best of +our knowledge, we pioneer and subsequently quantify, in this paper, the concept +of 'causal fingerprint.' Our method is well-separated from other fingerprint +studies as we quantify fingerprints from a cause-and-effect perspective, which +are then incorporated with a modal decomposition and projection method to +perform subject identification and a GNN-based (Graph Neural Network) model to +perform task identification. Finally, we show that the experimental results and +comparisons with non-causality-based methods demonstrate the effectiveness of +the proposed methods. We visualize the obtained causal signatures and discuss +their biological relevance in light of the existing understanding of brain +functionalities. Collectively, our work paves the way for further studies on +causal fingerprints with potential applications in both healthy controls and +neurodegenerative diseases. + +
+
+
+
+
+ + ☆ Criticality and Safety Margins for Reinforcement Learning + + +
+ State of the art reinforcement learning methods sometimes encounter unsafe +situations. Identifying when these situations occur is of interest both for +post-hoc analysis and during deployment, where it might be advantageous to call +out to a human overseer for help. Efforts to gauge the criticality of different +points in time have been developed, but their accuracy is not well established +due to a lack of ground truth, and they are not designed to be easily +interpretable by end users. Therefore, we seek to define a criticality +framework with both a quantifiable ground truth and a clear significance to +users. We introduce true criticality as the expected drop in reward when an +agent deviates from its policy for n consecutive random actions. We also +introduce the concept of proxy criticality, a low-overhead metric that has a +statistically monotonic relationship to true criticality. Safety margins make +these interpretable, when defined as the number of random actions for which +performance loss will not exceed some tolerance with high confidence. We +demonstrate this approach in several environment-agent combinations; for an A3C +agent in an Atari Beamrider environment, the lowest 5% of safety margins +contain 47% of agent losses; i.e., supervising only 5% of decisions could +potentially prevent roughly half of an agent's errors. This criticality +framework measures the potential impacts of bad decisions, even before those +decisions are made, allowing for more effective debugging and oversight of +autonomous agents. + +
+
+ comment: 17 pages, 10 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Optimizing Downlink C-NOMA Transmission with Movable Antennas: A + DDPG-based Approach + + +
+ This paper analyzes a downlink C-NOMA scenario where a base station (BS) is +deployed to serve a pair of users equipped with movable antenna (MA) +technology. The user with better channel conditions with the BS will be able to +transmit the signal to the other user providing an extra transmission resource +and enhancing performance. Both users are equipped with a receiving MA each and +a transmitting MA for the relaying user. In this regard, we formulate an +optimization problem with the objective of maximizing the achievable sum rate +by jointly determining the beamforming vector at the BS, the transmit power at +the device and the positions of the MAs while meeting the quality of service +(QoS) constraints. Due to the non-convex structure of the formulated problem +and the randomness in the channels we adopt a deep deterministic policy +gradient (DDPG) approach, a reinforcement learning (RL) algorithm capable of +dealing with continuous state and action spaces. Numerical results demonstrate +the superiority of the presented model compared to the other benchmark schemes +showing gains reaching 45% compared to the NOMA enabled MA scheme and 60% +compared to C-NOMA model with fixed antennas. The solution approach showed 93% +accuracy compared to the optimal solution. + +
+
+
+
+
+ + ♻ ☆ Network-aware Recommender System via Online Feedback Optimization + + +
+ Personalized content on social platforms can exacerbate negative phenomena +such as polarization, partly due to the feedback interactions between +recommendations and the users. In this paper, we present a control-theoretic +recommender system that explicitly accounts for this feedback loop to mitigate +polarization. Our approach extends online feedback optimization - a control +paradigm for steady-state optimization of dynamical systems - to develop a +recommender system that trades off users engagement and polarization reduction, +while relying solely on online click data. We establish theoretical guarantees +for optimality and stability of the proposed design and validate its +effectiveness via numerical experiments with a user population governed by +Friedkin-Johnsen dynamics. Our results show these "network-aware" +recommendations can significantly reduce polarization while maintaining high +levels of user engagement. + +
+
+
+
+
+ + ♻ ☆ Data-based approaches to learning and control by similarity between + heterogeneous systems + + +
+ This paper proposes basic definitions of similarity and similarity indexes +between admissible behaviors of heterogeneous host and guest systems and +further presents a similarity-based learning control framework by exploiting +the offline sampled data. By exploring helpful geometric properties of the +admissible behavior and decomposing it into the subspace and offset components, +the similarity indexes between two admissible behaviors are defined as the +principal angles between their corresponding subspace components. By +reconstructing the admissible behaviors leveraging sampled data, an efficient +strategy for calculating the similarity indexes is developed, based on which a +similarity-based learning control framework is proposed. It is shown that, with +the application of similarity-based learning control, the host system can +directly accomplish the same control tasks by utilizing the successful +experience provided by the guest system, without having to undergo the +trial-and-error process. All results in this paper are supported by simulation +examples. + +
+
+
+
+
+ + ♻ ☆ Data-Driven Abstractions for Control Systems via Random Exploration + + +
+ At the intersection of dynamical systems, control theory, and formal methods +lies the construction of symbolic abstractions: these typically represent +simpler, finite-state models whose behavior mimics that of an underlying +concrete system but are easier to analyse. Building an abstraction usually +requires an accurate knowledge of the underlying model: this knowledge may be +costly to gather, especially in real-life applications. We aim to bridge this +gap by building abstractions based on sampling finite length trajectories. To +refine a controller built for the abstraction to one for the concrete system, +we newly define a notion of probabilistic alternating simulation, and provide +Probably Approximately Correct (PAC) guarantees that the constructed +abstraction includes all behaviors of the concrete system and that it is +suitable for control design, for arbitrarily long time horizons, leveraging +scenario theory. Our method is then tested on several numerical benchmarks. + +
+
+
+
+
+ + ♻ ☆ Adaptive Control of an Inverted Pendulum by a Reinforcement + Learning-based LQR Method + + +
+ Inverted pendulums constitute one of the popular systems for benchmarking +control algorithms. Several methods have been proposed for the control of this +system, the majority of which rely on the availability of a mathematical model. +However, deriving a mathematical model using physical parameters or system +identification techniques requires manual effort. Moreover, the designed +controllers may perform poorly if system parameters change. To mitigate these +problems, recently, some studies used Reinforcement Learning (RL) based +approaches for the control of inverted pendulum systems. Unfortunately, these +methods suffer from slow convergence and local minimum problems. Moreover, they +may require hyperparameter tuning which complicates the design process +significantly. To alleviate these problems, the present study proposes an +LQR-based RL method for adaptive balancing control of an inverted pendulum. As +shown by numerical experiments, the algorithm stabilizes the system very fast +without requiring a mathematical model or extensive hyperparameter tuning. In +addition, it can adapt to parametric changes online. + +
+
+
+
+
+ + ♻ ☆ Convection-Enabled Boundary Control of a 2D Channel Flow + + +
+ Nonlinear convection, the source of turbulence in fluid flows, may hold the +key to stabilizing turbulence by solving a specific cubic polynomial equation. +We consider the incompressible Navier-Stokes equations in a two-dimensional +channel. The tangential and normal velocities are assumed to be periodic in the +streamwise direction. The pressure difference between the left and right ends +of the channel is constant. Moreover, we consider no-slip boundary conditions, +that is, zero tangential velocity, at the top and bottom walls of the channel, +and normal velocity actuation at the top and bottom walls. We design the +boundary control inputs to achieve global exponential stabilization, in the L2 +sense, of a chosen Poiseuille equilibrium profile for an arbitrarily large +Reynolds number. The key idea behind our approach is to select the boundary +controllers such that they have zero spatial mean (to guarantee mass +conservation) but non-zero spatial cubic mean. We reveal that, because of +convection, the time derivative of the L2 energy of the regulation error is a +cubic polynomial in the cubic mean of the boundary inputs. Regulation is then +achieved by solving a specific cubic equation, using the Cardano root formula. +The results are illustrated via a numerical example. + +
+
+ comment: To be presented at the 63rd IEEE Conference on Decision and Control + (CDC 2024) +
+
+
+
+
+ + ♻ ☆ Safe stabilization using generalized Lyapunov barrier function + + +
+ This paper addresses the safe stabilization problem, focusing on controlling +the system state to the origin while avoiding entry into unsafe state sets. The +current methods for solving this issue rely on smooth Lyapunov and barrier +functions, which do not always ensure the existence of an effective controller +even when such smooth functions are created. To tackle this challenge, we +introduce the concept of a generalized (nonsmooth) Lyapunov barrier function +(GenLBF), which guarantees the existence of a safe and stable controller. We +outline a systematic approach for constructing a GenLBF, including a technique +for efficiently calculating the upper generalized derivative of the GenLBF. +Using the constructed GenLBF, we propose a method for certifying safe +stabilization of autonomous systems and design a piecewise continuous feedback +control to achieve safe stabilization of non-autonomous systems. A general +controller refinement strategy is further proposed to help the state trajectory +escape from undesired local points occurring in systems with special physical +structure. A thorough theoretical analysis demonstrates the effectiveness of +our method in addressing the safe stabilization problem for systems with single +or multiple bounded unsafe state sets. Extensive simulations of linear and +nonlinear systems further illustrate the efficacy of the proposed method and +its superiority over the smooth control Lyapunov barrier function method. + +
+
+ comment: 19 pages, 14 figures, under review by a journal +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ The Top Manifold Connectedness of Quantum Control Landscapes + + +
+ The control of quantum systems has been proven to possess trap-free +optimization landscapes under the satisfaction of proper assumptions. However, +many details of the landscape geometry and their influence on search efficiency +still need to be fully understood. This paper numerically explores the +path-connectedness of globally optimal control solutions forming the top +manifold of the landscape. We randomly sample a plurality of optimal controls +in the top manifold to assess the existence of a continuous path at the top of +the landscape that connects two arbitrary optimal solutions. It is shown that +for different quantum control objectives including state-to-state transition +probabilities, observable expectation values and unitary transformations, such +a continuous path can be readily found, implying that these top manifolds are +fundamentally path-connected. The significance of the latter conjecture lies in +seeking locations in the top manifold where an ancillary objective can also be +optimized while maintaining the full optimality of the original objective that +defined the landscape. + +
+
+ comment: 34 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Distributed Quasi-Newton Method for Multi-Agent Optimization + + +
+ We present a distributed quasi-Newton (DQN) method, which enables a group of +agents to compute an optimal solution of a separable multi-agent optimization +problem locally using an approximation of the curvature of the aggregate +objective function. Each agent computes a descent direction from its local +estimate of the aggregate Hessian, obtained from quasi-Newton approximation +schemes using the gradient of its local objective function. Moreover, we +introduce a distributed quasi-Newton method for equality-constrained +optimization (EC-DQN), where each agent takes Karush-Kuhn-Tucker-like update +steps to compute an optimal solution. In our algorithms, each agent +communicates with its one-hop neighbors over a peer-to-peer communication +network to compute a common solution. We prove convergence of our algorithms to +a stationary point of the optimization problem. In addition, we demonstrate the +competitive empirical convergence of our algorithm in both well-conditioned and +ill-conditioned optimization problems, in terms of the computation time and +communication cost incurred by each agent for convergence, compared to existing +distributed first-order and second-order methods. Particularly, in +ill-conditioned problems, our algorithms achieve a faster computation time for +convergence, while requiring a lower communication cost, across a range of +communication networks with different degrees of connectedness. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 172 + +
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ AI-Powered Augmented Reality for Satellite Assembly, Integration and + Test + + +
+ The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is +set to transform satellite Assembly, Integration, and Testing (AIT) processes +by enhancing precision, minimizing human error, and improving operational +efficiency in cleanroom environments. This paper presents a technical +description of the European Space Agency's (ESA) project "AI for AR in +Satellite AIT," which combines real-time computer vision and AR systems to +assist technicians during satellite assembly. Leveraging Microsoft HoloLens 2 +as the AR interface, the system delivers context-aware instructions and +real-time feedback, tackling the complexities of object recognition and 6D pose +estimation in AIT workflows. All AI models demonstrated over 70% accuracy, with +the detection model exceeding 95% accuracy, indicating a high level of +performance and reliability. A key contribution of this work lies in the +effective use of synthetic data for training AI models in AR applications, +addressing the significant challenges of obtaining real-world datasets in +highly dynamic satellite environments, as well as the creation of the Segmented +Anything Model for Automatic Labelling (SAMAL), which facilitates the automatic +annotation of real data, achieving speeds up to 20 times faster than manual +human annotation. The findings demonstrate the efficacy of AI-driven AR systems +in automating critical satellite assembly tasks, setting a foundation for +future innovations in the space industry. + +
+
+
+
+
+ + ☆ EfficientCrackNet: A Lightweight Model for Crack Segmentation + + +
+ Crack detection, particularly from pavement images, presents a formidable +challenge in the domain of computer vision due to several inherent complexities +such as intensity inhomogeneity, intricate topologies, low contrast, and noisy +backgrounds. Automated crack detection is crucial for maintaining the +structural integrity of essential infrastructures, including buildings, +pavements, and bridges. Existing lightweight methods often face challenges +including computational inefficiency, complex crack patterns, and difficult +backgrounds, leading to inaccurate detection and impracticality for real-world +applications. To address these limitations, we propose EfficientCrackNet, a +lightweight hybrid model combining Convolutional Neural Networks (CNNs) and +transformers for precise crack segmentation. EfficientCrackNet integrates +depthwise separable convolutions (DSC) layers and MobileViT block to capture +both global and local features. The model employs an Edge Extraction Method +(EEM) and for efficient crack edge detection without pretraining, and +Ultra-Lightweight Subspace Attention Module (ULSAM) to enhance feature +extraction. Extensive experiments on three benchmark datasets Crack500, +DeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior +performance compared to existing lightweight models, while requiring only 0.26M +parameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance +between accuracy and computational efficiency, outperforming state-of-the-art +lightweight models, and providing a robust and adaptable solution for +real-world crack segmentation. + +
+
+
+
+
+ + ☆ DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion + Probabilistic Models + + +
+ Perception systems play a crucial role in autonomous driving, incorporating +multiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors +are widely used to capture sparse point clouds of the vehicle's surroundings. +However, such systems struggle to perceive occluded areas and gaps in the scene +due to the sparsity of these point clouds and their lack of semantics. To +address these challenges, Semantic Scene Completion (SSC) jointly predicts +unobserved geometry and semantics in the scene given raw LiDAR measurements, +aiming for a more complete scene representation. Building on promising results +of diffusion models in image generation and super-resolution tasks, we propose +their extension to SSC by implementing the noising and denoising diffusion +processes in the point and semantic spaces individually. To control the +generation, we employ semantic LiDAR point clouds as conditional input and +design local and global regularization losses to stabilize the denoising +process. We evaluate our approach on autonomous driving datasets and our +approach outperforms the state-of-the-art for SSC. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ GSON: A Group-based Social Navigation Framework with Large Multimodal + Model + + +
+ As the number of service robots and autonomous vehicles in human-centered +environments grows, their requirements go beyond simply navigating to a +destination. They must also take into account dynamic social contexts and +ensure respect and comfort for others in shared spaces, which poses significant +challenges for perception and planning. In this paper, we present a group-based +social navigation framework GSON to enable mobile robots to perceive and +exploit the social group of their surroundings by leveling the visual reasoning +capability of the Large Multimodal Model (LMM). For perception, we apply visual +prompting techniques to zero-shot extract the social relationship among +pedestrians and combine the result with a robust pedestrian detection and +tracking pipeline to alleviate the problem of low inference speed of the LMM. +Given the perception result, the planning system is designed to avoid +disrupting the current social structure. We adopt a social structure-based +mid-level planner as a bridge between global path planning and local motion +planning to preserve the global context and reactive response. The proposed +method is validated on real-world mobile robot navigation tasks involving +complex social structure understanding and reasoning. Experimental results +demonstrate the effectiveness of the system in these scenarios compared with +several baselines. + +
+
+
+
+
+ + ☆ SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language + Models for Robotic Garment Manipulation + + +
+ Automating garment manipulation poses a significant challenge for assistive +robotics due to the diverse and deformable nature of garments. Traditional +approaches typically require separate models for each garment type, which +limits scalability and adaptability. In contrast, this paper presents a unified +approach using vision-language models (VLMs) to improve keypoint prediction +across various garment categories. By interpreting both visual and semantic +information, our model enables robots to manage different garment states with a +single model. We created a large-scale synthetic dataset using advanced +simulation techniques, allowing scalable training without extensive real-world +data. Experimental results indicate that the VLM-based method significantly +enhances keypoint detection accuracy and task success rates, providing a more +flexible and general solution for robotic garment manipulation. In addition, +this research also underscores the potential of VLMs to unify various garment +manipulation tasks within a single framework, paving the way for broader +applications in home automation and assistive robotics for future. + +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ FreeEdit: Mask-free Reference-based Image Editing with Multi-modal + Instruction + + +
+ Introducing user-specified visual concepts in image editing is highly +practical as these concepts convey the user's intent more precisely than +text-based descriptions. We propose FreeEdit, a novel approach for achieving +such reference-based image editing, which can accurately reproduce the visual +concept from the reference image based on user-friendly language instructions. +Our approach leverages the multi-modal instruction encoder to encode language +instructions to guide the editing process. This implicit way of locating the +editing area eliminates the need for manual editing masks. To enhance the +reconstruction of reference details, we introduce the Decoupled Residual +ReferAttention (DRRA) module. This module is designed to integrate fine-grained +reference features extracted by a detail extractor into the image editing +process in a residual way without interfering with the original self-attention. +Given that existing datasets are unsuitable for reference-based image editing +tasks, particularly due to the difficulty in constructing image triplets that +include a reference image, we curate a high-quality dataset, FreeBench, using a +newly developed twice-repainting scheme. FreeBench comprises the images before +and after editing, detailed editing instructions, as well as a reference image +that maintains the identity of the edited object, encompassing tasks such as +object addition, replacement, and deletion. By conducting phased training on +FreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot +editing through convenient language instructions. We conduct extensive +experiments to evaluate the effectiveness of FreeEdit across multiple task +types, demonstrating its superiority over existing methods. The code will be +available at: https://freeedit.github.io/. + +
+
+ comment: 14 pages, 14 figures, project website: https://freeedit.github.io/ +
+
+
+
+
+ + ☆ Visual Data Diagnosis and Debiasing with Concept Graphs + + +
+ The widespread success of deep learning models today is owed to the curation +of extensive datasets significant in size and complexity. However, such models +frequently pick up inherent biases in the data during the training process, +leading to unreliable predictions. Diagnosing and debiasing datasets is thus a +necessity to ensure reliable model performance. In this paper, we present +CONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence +Biases in visual datasets. CONBIAS represents visual datasets as knowledge +graphs of concepts, enabling meticulous analysis of spurious concept +co-occurrences to uncover concept imbalances across the whole dataset. +Moreover, we show that by employing a novel clique-based concept balancing +strategy, we can mitigate these imbalances, leading to enhanced performance on +downstream tasks. Extensive experiments show that data augmentation based on a +balanced concept distribution augmented by CONBIAS improves generalization +performance across multiple datasets compared to state-of-the-art methods. We +will make our code and data publicly available. + +
+
+
+
+
+ + ☆ DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving + + +
+ We present a novel autonomous driving framework, DualAD, designed to imitate +human reasoning during driving. DualAD comprises two layers: a rule-based +motion planner at the bottom layer that handles routine driving tasks requiring +minimal reasoning, and an upper layer featuring a rule-based text encoder that +converts driving scenarios from absolute states into text description. This +text is then processed by a large language model (LLM) to make driving +decisions. The upper layer intervenes in the bottom layer's decisions when +potential danger is detected, mimicking human reasoning in critical situations. +Closed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained +model, significantly outperforms rule-based motion planners that lack reasoning +abilities. Our experiments also highlight the effectiveness of the text +encoder, which considerably enhances the model's scenario understanding. +Additionally, the integrated DualAD model improves with stronger LLMs, +indicating the framework's potential for further enhancement. We make code and +benchmarks publicly available. + +
+
+ comment: Autonomous Driving, Large Language Models (LLMs), Human Reasoning, + Critical Scenario +
+
+
+
+
+ + ☆ Explaining Explaining + + +
+ Explanation is key to people having confidence in high-stakes AI systems. +However, machine-learning-based systems - which account for almost all current +AI - can't explain because they are usually black boxes. The explainable AI +(XAI) movement hedges this problem by redefining "explanation". The +human-centered explainable AI (HCXAI) movement identifies the +explanation-oriented needs of users but can't fulfill them because of its +commitment to machine learning. In order to achieve the kinds of explanations +needed by real people operating in critical domains, we must rethink how to +approach AI. We describe a hybrid approach to developing cognitive agents that +uses a knowledge-based infrastructure supplemented by data obtained through +machine learning when applicable. These agents will serve as assistants to +humans who will bear ultimate responsibility for the decisions and actions of +the human-robot team. We illustrate the explanatory potential of such agents +using the under-the-hood panels of a demonstration system in which a team of +simulated robots collaborates on a search task assigned by a human. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams ICRA 2025 + + +
+ This paper presents a novel approach to multi-robot planning and +collaboration. We demonstrate a cognitive strategy for robots in human-robot +teams that incorporates metacognition, natural language communication, and +explainability. The system is embodied using the HARMONIC architecture that +flexibly integrates cognitive and control capabilities across the team. We +evaluate our approach through simulation experiments involving a joint search +task by a team of heterogeneous robots (a UGV and a drone) and a human. We +detail the system's handling of complex, real-world scenarios, effective action +coordination between robots with different capabilities, and natural +human-robot communication. This work demonstrates that the robots' ability to +reason about plans, goals, and attitudes, and to provide explanations for +actions and decisions are essential prerequisites for realistic human-robot +teaming. + +
+
+ comment: Submitted to ICRA 2025 Conference, Atlanta, GA, USA +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ HARMONIC: A Framework for Explanatory Cognitive Robots ICRA + + +
+ We present HARMONIC, a framework for implementing cognitive robots that +transforms general-purpose robots into trusted teammates capable of complex +decision-making, natural communication and human-level explanation. The +framework supports interoperability between a strategic (cognitive) layer for +high-level decision-making and a tactical (robot) layer for low-level control +and execution. We describe the core features of the framework and our initial +implementation, in which HARMONIC was deployed on a simulated UGV and drone +involved in a multi-robot search and retrieval task. + +
+
+ comment: Accepted for presentation at ICRA@40. 23-26 September 2024, + Rotterdam, Netherlands +
+
+
+
+
+ + ☆ Compositional Hardness of Code in Large Language Models -- A + Probabilistic Perspective + + +
+ A common practice in large language model (LLM) usage for complex analytical +tasks such as code generation, is to sample a solution for the entire task +within the model's context window. Previous works have shown that subtask +decomposition within the model's context (chain of thought), is beneficial for +solving such tasks. In this work, we point a limitation of LLMs' ability to +perform several sub-tasks within the same context window - an in-context +hardness of composition, pointing to an advantage for distributing a decomposed +problem in a multi-agent system of LLMs. The hardness of composition is +quantified by a generation complexity metric, i.e., the number of LLM +generations required to sample at least one correct solution. We find a gap +between the generation complexity of solving a compositional problem within the +same context relative to distributing it among multiple agents, that increases +exponentially with the solution's length. We prove our results theoretically +and demonstrate them empirically. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ Transferring disentangled representations: bridging the gap between + synthetic and real images + + +
+ Developing meaningful and efficient representations that separate the +fundamental structure of the data generation mechanism is crucial in +representation learning. However, Disentangled Representation Learning has not +fully shown its potential on real images, because of correlated generative +factors, their resolution and limited access to ground truth labels. +Specifically on the latter, we investigate the possibility of leveraging +synthetic data to learn general-purpose disentangled representations applicable +to real data, discussing the effect of fine-tuning and what properties of +disentanglement are preserved after the transfer. We provide an extensive +empirical study to address these issues. In addition, we propose a new +interpretable intervention-based metric, to measure the quality of factors +encoding in the representation. Our results indicate that some level of +disentanglement, transferring a representation from synthetic to real data, is +possible and effective. + +
+
+
+
+
+ + ☆ Role-RL: Online Long-Context Processing with Role Reinforcement Learning + for Distinct LLMs in Their Optimal Roles + + +
+ Large language models (LLMs) with long-context processing are still +challenging because of their implementation complexity, training efficiency and +data sparsity. To address this issue, a new paradigm named Online Long-context +Processing (OLP) is proposed when we process a document of unlimited length, +which typically occurs in the information reception and organization of diverse +streaming media such as automated news reporting, live e-commerce, and viral +short videos. Moreover, a dilemma was often encountered when we tried to select +the most suitable LLM from a large number of LLMs amidst explosive growth +aiming for outstanding performance, affordable prices, and short response +delays. In view of this, we also develop Role Reinforcement Learning (Role-RL) +to automatically deploy different LLMs in their respective roles within the OLP +pipeline according to their actual performance. Extensive experiments are +conducted on our OLP-MINI dataset and it is found that OLP with Role-RL +framework achieves OLP benchmark with an average recall rate of 93.2% and the +LLM cost saved by 79.4%. The code and dataset are publicly available at: +https://anonymous.4open.science/r/Role-RL. + +
+
+
+
+
+ + ☆ Control Industrial Automation System with Large Language Models + + +
+ Traditional industrial automation systems require specialized expertise to +operate and complex reprogramming to adapt to new processes. Large language +models offer the intelligence to make them more flexible and easier to use. +However, LLMs' application in industrial settings is underexplored. This paper +introduces a framework for integrating LLMs to achieve end-to-end control of +industrial automation systems. At the core of the framework are an agent system +designed for industrial tasks, a structured prompting method, and an +event-driven information modeling mechanism that provides real-time data for +LLM inference. The framework supplies LLMs with real-time events on different +context semantic levels, allowing them to interpret the information, generate +production plans, and control operations on the automation system. It also +supports structured dataset creation for fine-tuning on this downstream +application of LLMs. Our contribution includes a formal system design, +proof-of-concept implementation, and a method for generating task-specific +datasets for LLM fine-tuning and testing. This approach enables a more adaptive +automation system that can respond to spontaneous events, while allowing easier +operation and configuration through natural language for more intuitive +human-machine interaction. We provide demo videos and detailed data on GitHub: +https://github.com/YuchenXia/LLM4IAS + +
+
+
+
+
+ + ☆ Joint Localization and Planning using Diffusion ICRA 2025 + + +
+ Diffusion models have been successfully applied to robotics problems such as +manipulation and vehicle path planning. In this work, we explore their +application to end-to-end navigation -- including both perception and planning +-- by considering the problem of jointly performing global localization and +path planning in known but arbitrary 2D environments. In particular, we +introduce a diffusion model which produces collision-free paths in a global +reference frame given an egocentric LIDAR scan, an arbitrary map, and a desired +goal position. To this end, we implement diffusion in the space of paths in +SE(2), and describe how to condition the denoising process on both obstacles +and sensor observations. In our evaluation, we show that the proposed +conditioning techniques enable generalization to realistic maps of considerably +different appearance than the training environment, demonstrate our model's +ability to accurately describe ambiguous solutions, and run extensive +simulation experiments showcasing our model's use as a real-time, end-to-end +localization and planning stack. + +
+
+ comment: 7 pages, 9 figures. Submitted to ICRA 2025, under review +
+
+
+
+
+ + ☆ CRoP: Context-wise Robust Static Human-Sensing Personalization + + +
+ The advancement in deep learning and internet-of-things have led to diverse +human sensing applications. However, distinct patterns in human sensing, +influenced by various factors or contexts, challenge generic neural network +model's performance due to natural distribution shifts. To address this, +personalization tailors models to individual users. Yet most personalization +studies overlook intra-user heterogeneity across contexts in sensory data, +limiting intra-user generalizability. This limitation is especially critical in +clinical applications, where limited data availability hampers both +generalizability and personalization. Notably, intra-user sensing attributes +are expected to change due to external factors such as treatment progression, +further complicating the challenges.This work introduces CRoP, a novel static +personalization approach using an off-the-shelf pre-trained model and pruning +to optimize personalization and generalization. CRoP shows superior +personalization effectiveness and intra-user robustness across four +human-sensing datasets, including two from real-world health domains, +highlighting its practical and social impact. Additionally, to support CRoP's +generalization ability and design choices, we provide empirical justification +through gradient inner product analysis, ablation studies, and comparisons +against state-of-the-art baselines. + +
+
+ comment: 31 pages, 10 figues and 13 tables +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ Enhancing elusive clues in knowledge learning by contrasting attention + of language models + + +
+ Causal language models acquire vast amount of knowledge from general text +corpus during pretraining, but the efficiency of knowledge learning is known to +be unsatisfactory, especially when learning from knowledge-dense and +small-sized corpora. The deficiency can come from long-distance dependencies +which are hard to capture by language models, and overfitting to co-occurrence +patterns and distracting clues in the training text. To address these issues, +the paper proposes a method to enhance knowledge learning during language model +pretraining, by enhancing elusive but important clues in text discovered by the +language model themselves. We found that larger language models pay more +attention to non-obvious but important clues, which are often overlooked by +smaller language models. Therefore, we can identify these clues by contrasting +the attention weights of large and small language models. We use the identified +clues as a guide to perform token-dropout data augmentation on the training +text, and observed a significant boost in both small and large models' +performance in fact memorization. This shows that the behavior contrast between +more and less-performant language models contains important clues for knowledge +learning, and it can be ``amplified" for a straight-forward improvement in +knowledge learning efficiency. + +
+
+ comment: 7 pages and 17 figures +
+
+
+
+
+ + ☆ Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge + Distillation + + +
+ Despite being widely applied due to their exceptional capabilities, Large +Language Models (LLMs) have been proven to be vulnerable to backdoor attacks. +These attacks introduce targeted vulnerabilities into LLMs by poisoning +training samples and full-parameter fine-tuning. However, this kind of backdoor +attack is limited since they require significant computational resources, +especially as the size of LLMs increases. Besides, parameter-efficient +fine-tuning (PEFT) offers an alternative but the restricted parameter updating +may impede the alignment of triggers with target labels. In this study, we +first verify that backdoor attacks with PEFT may encounter challenges in +achieving feasible performance. To address these issues and improve the +effectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack +algorithm from weak to strong based on contrastive knowledge distillation +(W2SAttack). Specifically, we poison small-scale language models through +full-parameter fine-tuning to serve as the teacher model. The teacher model +then covertly transfers the backdoor to the large-scale student model through +contrastive knowledge distillation, which employs PEFT. Theoretical analysis +reveals that W2SAttack has the potential to augment the effectiveness of +backdoor attacks. We demonstrate the superior performance of W2SAttack on +classification tasks across four language models, four backdoor attack +algorithms, and two different architectures of teacher models. Experimental +results indicate success rates close to 100% for backdoor attacks targeting +PEFT. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Intelligent Energy Management: Remaining Useful Life Prediction and + Charging Automation System Comprised of Deep Learning and the Internet of + Things + + +
+ Remaining Useful Life (RUL) of battery is an important parameter to know the +battery's remaining life and need for recharge. The goal of this research +project is to develop machine learning-based models for the battery RUL +dataset. Different ML models are developed to classify the RUL of the vehicle, +and the IoT (Internet of Things) concept is simulated for automating the +charging system and managing any faults aligning. The graphs plotted depict the +relationship between various vehicle parameters using the Blynk IoT platform. +Results show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent +Unit (GRU), and hybrid model developed could classify RUL into three classes +with 99% more accuracy. The data is fed using the tkinter GUI for simulating +artificial intelligence (AI)-based charging, and with a pyserial backend, data +can be entered into the Esp-32 microcontroller for making charge discharge +possible with the model's predictions. Also, with an IoT system, the charging +can be disconnected, monitored, and analyzed for automation. The results show +that an accuracy of 99% can be obtained on models MLP, catboost model and +similar accuracy on GRU model can be obtained, and finally relay-based +triggering can be made by prediction through the model used for automating the +charging and energy-saving mechanism. By showcasing an exemplary Blynk +platform-based monitoring and automation phenomenon, we further present +innovative ways of monitoring parameters and automating the system. + +
+
+
+
+
+ + ☆ Pioneering Reliable Assessment in Text-to-Image Knowledge Editing: + Leveraging a Fine-Grained Dataset and an Innovative Criterion EMNLP24 + + +
+ During pre-training, the Text-to-Image (T2I) diffusion models encode factual +knowledge into their parameters. These parameterized facts enable realistic +image generation, but they may become obsolete over time, thereby +misrepresenting the current state of the world. Knowledge editing techniques +aim to update model knowledge in a targeted way. However, facing the dual +challenges posed by inadequate editing datasets and unreliable evaluation +criterion, the development of T2I knowledge editing encounter difficulties in +effectively generalizing injected knowledge. In this work, we design a T2I +knowledge editing framework by comprehensively spanning on three phases: First, +we curate a dataset \textbf{CAKE}, comprising paraphrase and multi-object test, +to enable more fine-grained assessment on knowledge generalization. Second, we +propose a novel criterion, \textbf{adaptive CLIP threshold}, to effectively +filter out false successful images under the current criterion and achieve +reliable editing evaluation. Finally, we introduce \textbf{MPE}, a simple but +effective approach for T2I knowledge editing. Instead of tuning parameters, MPE +precisely recognizes and edits the outdated part of the conditioning +text-prompt to accommodate the up-to-date knowledge. A straightforward +implementation of MPE (Based on in-context learning) exhibits better overall +performance than previous model editors. We hope these efforts can further +promote faithful evaluation of T2I knowledge editing methods. + +
+
+ comment: EMNLP24 Findings +
+
+
+
+
+ + ☆ Navigation in a simplified Urban Flow through Deep Reinforcement + Learning + + +
+ The increasing number of unmanned aerial vehicles (UAVs) in urban +environments requires a strategy to minimize their environmental impact, both +in terms of energy efficiency and noise reduction. In order to reduce these +concerns, novel strategies for developing prediction models and optimization of +flight planning, for instance through deep reinforcement learning (DRL), are +needed. Our goal is to develop DRL algorithms capable of enabling the +autonomous navigation of UAVs in urban environments, taking into account the +presence of buildings and other UAVs, optimizing the trajectories in order to +reduce both energetic consumption and noise. This is achieved using fluid-flow +simulations which represent the environment in which UAVs navigate and training +the UAV as an agent interacting with an urban environment. In this work, we +consider a domain domain represented by a two-dimensional flow field with +obstacles, ideally representing buildings, extracted from a three-dimensional +high-fidelity numerical simulation. The presented methodology, using PPO+LSTM +cells, was validated by reproducing a simple but fundamental problem in +navigation, namely the Zermelo's problem, which deals with a vessel navigating +in a turbulent flow, travelling from a starting point to a target location, +optimizing the trajectory. The current method shows a significant improvement +with respect to both a simple PPO and a TD3 algorithm, with a success rate (SR) +of the PPO+LSTM trained policy of 98.7%, and a crash rate (CR) of 0.1%, +outperforming both PPO (SR = 75.6%, CR=18.6%) and TD3 (SR=77.4% and CR=14.5%). +This is the first step towards DRL strategies which will guide UAVs in a +three-dimensional flow field using real-time signals, making the navigation +efficient in terms of flight time and avoiding damages to the vehicle. + +
+
+
+
+
+ + ☆ PhantomLiDAR: Cross-modality Signal Injection Attacks against LiDAR + + +
+ LiDAR (Light Detection and Ranging) is a pivotal sensor for autonomous +driving, offering precise 3D spatial information. Previous signal attacks +against LiDAR systems mainly exploit laser signals. In this paper, we +investigate the possibility of cross-modality signal injection attacks, i.e., +injecting intentional electromagnetic interference (IEMI) to manipulate LiDAR +output. Our insight is that the internal modules of a LiDAR, i.e., the laser +receiving circuit, the monitoring sensors, and the beam-steering modules, even +with strict electromagnetic compatibility (EMC) testing, can still couple with +the IEMI attack signals and result in the malfunction of LiDAR systems. Based +on the above attack surfaces, we propose the PhantomLiDAR attack, which +manipulates LiDAR output in terms of Points Interference, Points Injection, +Points Removal, and even LiDAR Power-Off. We evaluate and demonstrate the +effectiveness of PhantomLiDAR with both simulated and real-world experiments on +five COTS LiDAR systems. We also conduct feasibility experiments in real-world +moving scenarios. We provide potential defense measures that can be implemented +at both the sensor level and the vehicle system level to mitigate the risks +associated with IEMI attacks. Video demonstrations can be viewed at +https://sites.google.com/view/phantomlidar. + +
+
+
+
+
+ + ☆ Learning to Love Edge Cases in Formative Math Assessment: Using the + AMMORE Dataset and Chain-of-Thought Prompting to Improve Grading Accuracy + + +
+ This paper introduces AMMORE, a new dataset of 53,000 math open-response +question-answer pairs from Rori, a learning platform used by students in +several African countries and conducts two experiments to evaluate the use of +large language models (LLM) for grading particularly challenging student +answers. The AMMORE dataset enables various potential analyses and provides an +important resource for researching student math acquisition in understudied, +real-world, educational contexts. In experiment 1 we use a variety of +LLM-driven approaches, including zero-shot, few-shot, and chain-of-thought +prompting, to grade the 1% of student answers that a rule-based classifier +fails to grade accurately. We find that the best-performing approach -- +chain-of-thought prompting -- accurately scored 92% of these edge cases, +effectively boosting the overall accuracy of the grading from 98.7% to 99.9%. +In experiment 2, we aim to better understand the consequential validity of the +improved grading accuracy, by passing grades generated by the best-performing +LLM-based approach to a Bayesian Knowledge Tracing (BKT) model, which estimated +student mastery of specific lessons. We find that relatively modest +improvements in model accuracy at the individual question level can lead to +significant changes in the estimation of student mastery. Where the rules-based +classifier currently used to grade student, answers misclassified the mastery +status of 6.9% of students across their completed lessons, using the LLM +chain-of-thought approach this misclassification rate was reduced to 2.6% of +students. Taken together, these findings suggest that LLMs could be a valuable +tool for grading open-response questions in K-12 mathematics education, +potentially enabling encouraging wider adoption of open-ended questions in +formative assessment. + +
+
+
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ Why Companies "Democratise" Artificial Intelligence: The Case of Open + Source Software Donations + + +
+ Companies claim to "democratise" artificial intelligence (AI) when they +donate AI open source software (OSS) to non-profit foundations or release AI +models, among others, but what does this term mean and why do they do it? As +the impact of AI on society and the economy grows, understanding the commercial +incentives behind AI democratisation efforts is crucial for ensuring these +efforts serve broader interests beyond commercial agendas. Towards this end, +this study employs a mixed-methods approach to investigate commercial +incentives for 43 AI OSS donations to the Linux Foundation. It makes +contributions to both research and practice. It contributes a taxonomy of both +individual and organisational social, economic, and technological incentives +for AI democratisation. In particular, it highlights the role of democratising +the governance and control rights of an OSS project (i.e., from one company to +open governance) as a structural enabler for downstream goals, such as +attracting external contributors, reducing development costs, and influencing +industry standards, among others. Furthermore, OSS donations are often +championed by individual developers within companies, highlighting the +importance of the bottom-up incentives for AI democratisation. The taxonomy +provides a framework and toolkit for discerning incentives for other AI +democratisation efforts, such as the release of AI models. The paper concludes +with a discussion of future research directions. + +
+
+ comment: 30 pages, 1 figure, 5 tables +
+
+
+
+
+ + ☆ DarkSAM: Fooling Segment Anything Model to Segment Nothing NeurIPS'24 + + +
+ Segment Anything Model (SAM) has recently gained much attention for its +outstanding generalization to unseen data and tasks. Despite its promising +prospect, the vulnerabilities of SAM, especially to universal adversarial +perturbation (UAP) have not been thoroughly investigated yet. In this paper, we +propose DarkSAM, the first prompt-free universal attack framework against SAM, +including a semantic decoupling-based spatial attack and a texture +distortion-based frequency attack. We first divide the output of SAM into +foreground and background. Then, we design a shadow target strategy to obtain +the semantic blueprint of the image as the attack target. DarkSAM is dedicated +to fooling SAM by extracting and destroying crucial object features from images +in both spatial and frequency domains. In the spatial domain, we disrupt the +semantics of both the foreground and background in the image to confuse SAM. In +the frequency domain, we further enhance the attack effectiveness by distorting +the high-frequency components (i.e., texture information) of the image. +Consequently, with a single UAP, DarkSAM renders SAM incapable of segmenting +objects across diverse images with varying prompts. Experimental results on +four datasets for SAM and its two variant models demonstrate the powerful +attack capability and transferability of DarkSAM. + +
+
+ comment: This paper has been accepted by the 38th Annual Conference on Neural + Information Processing Systems (NeurIPS'24) +
+
+
+
+
+ + ☆ Efficient Arbitrary Precision Acceleration for Large Language Models on + GPU Tensor Cores + + +
+ Large language models (LLMs) have been widely applied but face challenges in +efficient inference. While quantization methods reduce computational demands, +ultra-low bit quantization with arbitrary precision is hindered by limited GPU +Tensor Core support and inefficient memory management, leading to suboptimal +acceleration. To address these challenges, we propose a comprehensive +acceleration scheme for arbitrary precision LLMs. At its core, we introduce a +novel bipolar-INT data format that facilitates parallel computing and supports +symmetric quantization, effectively reducing data redundancy. Building on this, +we implement an arbitrary precision matrix multiplication scheme that +decomposes and recovers matrices at the bit level, enabling flexible precision +while maximizing GPU Tensor Core utilization. Furthermore, we develop an +efficient matrix preprocessing method that optimizes data layout for subsequent +computations. Finally, we design a data recovery-oriented memory management +system that strategically utilizes fast shared memory, significantly enhancing +kernel execution speed and minimizing memory access latency. Experimental +results demonstrate our approach's effectiveness, with up to 13\times speedup +in matrix multiplication compared to NVIDIA's CUTLASS. When integrated into +LLMs, we achieve up to 6.7\times inference acceleration. These improvements +significantly enhance LLM inference efficiency, enabling broader and more +responsive applications of LLMs. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Machine Learning-based vs Deep Learning-based Anomaly Detection in + Multivariate Time Series for Spacecraft Attitude Sensors + + +
+ In the framework of Failure Detection, Isolation and Recovery (FDIR) on +spacecraft, new AI-based approaches are emerging in the state of the art to +overcome the limitations commonly imposed by traditional threshold checking. + The present research aims at characterizing two different approaches to the +problem of stuck values detection in multivariate time series coming from +spacecraft attitude sensors. The analysis reveals the performance differences +in the two approaches, while commenting on their interpretability and +generalization to different scenarios. + +
+
+ comment: Accepted for the ESA SPAICE Conference 2024 +
+
+
+
+
+ + ☆ Detecting and Measuring Confounding Using Causal Mechanism Shifts + + +
+ Detecting and measuring confounding effects from data is a key challenge in +causal inference. Existing methods frequently assume causal sufficiency, +disregarding the presence of unobserved confounding variables. Causal +sufficiency is both unrealistic and empirically untestable. Additionally, +existing methods make strong parametric assumptions about the underlying causal +generative process to guarantee the identifiability of confounding variables. +Relaxing the causal sufficiency and parametric assumptions and leveraging +recent advancements in causal discovery and confounding analysis with +non-i.i.d. data, we propose a comprehensive approach for detecting and +measuring confounding. We consider various definitions of confounding and +introduce tailored methodologies to achieve three objectives: (i) detecting and +measuring confounding among a set of variables, (ii) separating observed and +unobserved confounding effects, and (iii) understanding the relative strengths +of confounding bias between different sets of variables. We present useful +properties of a confounding measure and present measures that satisfy those +properties. Empirical results support the theoretical analysis. + +
+
+
+
+
+ + ☆ Language Models as Zero-shot Lossless Gradient Compressors: Towards + General Neural Parameter Prior Models NeurIPS 2024 + + +
+ Despite the widespread use of statistical prior models in various fields, +such models for neural network gradients have long been overlooked. The +inherent challenge stems from their high-dimensional structures and complex +interdependencies, which complicate effective modeling. In this work, we +demonstrate the potential of large language models (LLMs) to act as gradient +priors in a zero-shot setting. We examine the property by considering lossless +gradient compression -- a critical application in distributed learning -- that +depends heavily on precise probability modeling. To achieve this, we introduce +LM-GC, a novel method that integrates LLMs with arithmetic coding. Our +technique converts plain gradients into text-like formats, enhancing token +efficiency by up to 38 times compared to their plain representations. We ensure +that this data conversion maintains a close alignment with the structure of +plain gradients and the symbols commonly recognized by LLMs. Our experiments +indicate that LM-GC surpasses existing state-of-the-art lossless compression +methods, improving compression rates by 10\% up to 17.2\% across various +datasets and architectures. Additionally, our approach shows promising +compatibility with lossy compression techniques such as quantization and +sparsification. These findings highlight the significant potential of LLMs as a +model for effectively handling gradients. We will release the source code upon +publication. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ Inference-Time Language Model Alignment via Integrated Value Guidance EMNLP 2024 + + +
+ Large language models are typically fine-tuned to align with human +preferences, but tuning large models is computationally intensive and complex. +In this work, we introduce $\textit{Integrated Value Guidance}$ (IVG), a method +that uses implicit and explicit value functions to guide language model +decoding at token and chunk-level respectively, efficiently aligning large +language models purely at inference time. This approach circumvents the +complexities of direct fine-tuning and outperforms traditional methods. +Empirically, we demonstrate the versatility of IVG across various tasks. In +controlled sentiment generation and summarization tasks, our method +significantly improves the alignment of large models using inference-time +guidance from $\texttt{gpt2}$-based value functions. Moreover, in a more +challenging instruction-following benchmark AlpacaEval 2.0, we show that both +specifically tuned and off-the-shelf value functions greatly improve the +length-controlled win rates of large models against $\texttt{gpt-4-turbo}$ +(e.g., $19.51\% \rightarrow 26.51\%$ for $\texttt{Mistral-7B-Instruct-v0.2}$ +and $25.58\% \rightarrow 33.75\%$ for $\texttt{Mixtral-8x7B-Instruct-v0.1}$ +with Tulu guidance). + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ☆ DREAMS: A python framework to train deep learning models with model card + reporting for medical and health applications + + +
+ Electroencephalography (EEG) data provides a non-invasive method for +researchers and clinicians to observe brain activity in real time. The +integration of deep learning techniques with EEG data has significantly +improved the ability to identify meaningful patterns, leading to valuable +insights for both clinical and research purposes. However, most of the +frameworks so far, designed for EEG data analysis, are either too focused on +pre-processing or in deep learning methods per, making their use for both +clinician and developer communities problematic. Moreover, critical issues such +as ethical considerations, biases, uncertainties, and the limitations inherent +in AI models for EEG data analysis are frequently overlooked, posing challenges +to the responsible implementation of these technologies. In this paper, we +introduce a comprehensive deep learning framework tailored for EEG data +processing, model training and report generation. While constructed in way to +be adapted and developed further by AI developers, it enables to report, +through model cards, the outcome and specific information of use for both +developers and clinicians. In this way, we discuss how this framework can, in +the future, provide clinical researchers and developers with the tools needed +to create transparent and accountable AI models for EEG data analysis and +diagnosis. + +
+
+
+
+
+ + ☆ Self-supervised Preference Optimization: Enhance Your Language Model + with Preference Degree Awareness EMNLP 2024 + + +
+ Recently, there has been significant interest in replacing the reward model +in Reinforcement Learning with Human Feedback (RLHF) methods for Large Language +Models (LLMs), such as Direct Preference Optimization (DPO) and its variants. +These approaches commonly use a binary cross-entropy mechanism on pairwise +samples, i.e., minimizing and maximizing the loss based on preferred or +dis-preferred responses, respectively. However, while this training strategy +omits the reward model, it also overlooks the varying preference degrees within +different responses. We hypothesize that this is a key factor hindering LLMs +from sufficiently understanding human preferences. To address this problem, we +propose a novel Self-supervised Preference Optimization (SPO) framework, which +constructs a self-supervised preference degree loss combined with the alignment +loss, thereby helping LLMs improve their ability to understand the degree of +preference. Extensive experiments are conducted on two widely used datasets of +different tasks. The results demonstrate that SPO can be seamlessly integrated +with existing preference optimization methods and significantly boost their +performance to achieve state-of-the-art performance. We also conduct detailed +analyses to offer comprehensive insights into SPO, which verifies its +effectiveness. The code is available at https://github.com/lijian16/SPO. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Ophthalmic Biomarker Detection with Parallel Prediction of Transformer + and Convolutional Architecture + + +
+ Ophthalmic diseases represent a significant global health issue, +necessitating the use of advanced precise diagnostic tools. Optical Coherence +Tomography (OCT) imagery which offers high-resolution cross-sectional images of +the retina has become a pivotal imaging modality in ophthalmology. +Traditionally physicians have manually detected various diseases and biomarkers +from such diagnostic imagery. In recent times, deep learning techniques have +been extensively used for medical diagnostic tasks enabling fast and precise +diagnosis. This paper presents a novel approach for ophthalmic biomarker +detection using an ensemble of Convolutional Neural Network (CNN) and Vision +Transformer. While CNNs are good for feature extraction within the local +context of the image, transformers are known for their ability to extract +features from the global context of the image. Using an ensemble of both +techniques allows us to harness the best of both worlds. Our method has been +implemented on the OLIVES dataset to detect 6 major biomarkers from the OCT +images and shows significant improvement of the macro averaged F1 score on the +dataset. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Harnessing Shared Relations via Multimodal Mixup Contrastive Learning + for Multimodal Classification + + +
+ Deep multimodal learning has shown remarkable success by leveraging +contrastive learning to capture explicit one-to-one relations across +modalities. However, real-world data often exhibits shared relations beyond +simple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive +Learning approach to capture nuanced shared relations inherent in multimodal +data. Our key contribution is a Mixup-based contrastive loss that learns robust +representations by aligning mixed samples from one modality with their +corresponding samples from other modalities thereby capturing shared relations +between them. For multimodal classification tasks, we introduce a framework +that integrates a fusion module with unimodal prediction modules for auxiliary +supervision during training, complemented by our proposed Mixup-based +contrastive loss. Through extensive experiments on diverse datasets (N24News, +ROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures +shared multimodal relations and generalizes across domains. It outperforms +state-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving +comparable performance on Food-101. Our work highlights the significance of +learning shared relations for robust multimodal learning, opening up promising +avenues for future research. + +
+
+ comment: RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9 + Tables +
+
+
+
+
+ + ☆ Faithfulness and the Notion of Adversarial Sensitivity in NLP + Explanations EMNLP 2024 + + +
+ Faithfulness is arguably the most critical metric to assess the reliability +of explainable AI. In NLP, current methods for faithfulness evaluation are +fraught with discrepancies and biases, often failing to capture the true +reasoning of models. We introduce Adversarial Sensitivity as a novel approach +to faithfulness evaluation, focusing on the explainer's response when the model +is under adversarial attack. Our method accounts for the faithfulness of +explainers by capturing sensitivity to adversarial input changes. This work +addresses significant limitations in existing evaluation techniques, and +furthermore, quantifies faithfulness from a crucial yet underexplored paradigm. + +
+
+ comment: Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP +
+
+
+
+
+ + ☆ Federated Learning under Attack: Improving Gradient Inversion for Batch + of Images + + +
+ Federated Learning (FL) has emerged as a machine learning approach able to +preserve the privacy of user's data. Applying FL, clients train machine +learning models on a local dataset and a central server aggregates the learned +parameters coming from the clients, training a global machine learning model +without sharing user's data. However, the state-of-the-art shows several +approaches to promote attacks on FL systems. For instance, inverting or leaking +gradient attacks can find, with high precision, the local dataset used during +the training phase of the FL. This paper presents an approach, called Deep +Leakage from Gradients with Feedback Blending (DLG-FB), which is able to +improve the inverting gradient attack, considering the spatial correlation that +typically exists in batches of images. The performed evaluation shows an +improvement of 19.18% and 48,82% in terms of attack success rate and the number +of iterations per attacked image, respectively. + +
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ Integrating Hierarchical Semantic into Iterative Generation Model for + Entailment Tree Explanation + + +
+ Manifestly and logically displaying the line of reasoning from evidence to +answer is significant to explainable question answering (QA). The entailment +tree exhibits the lines structurally, which is different from the +self-explanation principle in large-scale language models. Existing methods +rarely consider the semantic association of sentences between and within +hierarchies within the tree structure, which is prone to apparent mistakes in +combinations. In this work, we propose an architecture of integrating the +Hierarchical Semantics of sentences under the framework of Controller-Generator +(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between +hypotheses and facts, discriminates the facts involved in tree constructions, +and optimizes single-step entailments. To the best of our knowledge, We are the +first to notice hierarchical semantics of sentences between the same layer and +adjacent layers to yield improvements. The proposed method achieves comparable +performance on all three settings of the EntailmentBank dataset. The +generalization results on two out-of-domain datasets also demonstrate the +effectiveness of our method. + +
+
+
+
+
+ + ☆ SECURE: Semantics-aware Embodied Conversation under Unawareness for + Lifelong Robot Learning + + +
+ This paper addresses a challenging interactive task learning scenario we call +rearrangement under unawareness: to manipulate a rigid-body environment in a +context where the robot is unaware of a concept that's key to solving the +instructed task. We propose SECURE, an interactive task learning framework +designed to solve such problems by fixing a deficient domain model using +embodied conversation. Through dialogue, the robot discovers and then learns to +exploit unforeseen possibilities. Using SECURE, the robot not only learns from +the user's corrective feedback when it makes a mistake, but it also learns to +make strategic dialogue decisions for revealing useful evidence about novel +concepts for solving the instructed task. Together, these abilities allow the +robot to generalise to subsequent tasks using newly acquired knowledge. We +demonstrate that a robot that is semantics-aware -- that is, it exploits the +logical consequences of both sentence and discourse semantics in the learning +and inference process -- learns to solve rearrangement under unawareness more +effectively than a robot that lacks such capabilities. + +
+
+ comment: 10 pages,4 figures, 2 tables +
+
+
+
+
+ + ☆ Byzantine-Robust Aggregation for Securing Decentralized Federated + Learning + + +
+ Federated Learning (FL) emerges as a distributed machine learning approach +that addresses privacy concerns by training AI models locally on devices. +Decentralized Federated Learning (DFL) extends the FL paradigm by eliminating +the central server, thereby enhancing scalability and robustness through the +avoidance of a single point of failure. However, DFL faces significant +challenges in optimizing security, as most Byzantine-robust algorithms proposed +in the literature are designed for centralized scenarios. In this paper, we +present a novel Byzantine-robust aggregation algorithm to enhance the security +of Decentralized Federated Learning environments, coined WFAgg. This proposal +handles the adverse conditions and strength robustness of dynamic decentralized +topologies at the same time by employing multiple filters to identify and +mitigate Byzantine attacks. Experimental results demonstrate the effectiveness +of the proposed algorithm in maintaining model accuracy and convergence in the +presence of various Byzantine attack scenarios, outperforming state-of-the-art +centralized Byzantine-robust aggregation schemes (such as Multi-Krum or +Clustering). These algorithms are evaluated on an IID image classification +problem in both centralized and decentralized scenarios. + +
+
+ comment: 18 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with + Alternative Modality Masking NeurIPS 2024 + + +
+ Camera-LiDAR fusion models significantly enhance perception performance in +autonomous driving. The fusion mechanism leverages the strengths of each +modality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR +fusion models utilize pre-trained backbones for efficient training. However, we +argue that directly loading single-modal pre-trained camera and LiDAR backbones +into camera-LiDAR fusion models introduces similar feature redundancy across +modalities due to the nature of the fusion mechanism. Unfortunately, existing +pruning methods are developed explicitly for single-modal models, and thus, +they struggle to effectively identify these specific redundant parameters in +camera-LiDAR fusion models. In this paper, to address the issue above on +camera-LiDAR fusion models, we propose a novelty pruning framework Alternative +Modality Masking Pruning (AlterMOMA), which employs alternative masking on each +modality and identifies the redundant parameters. Specifically, when one +modality parameters are masked (deactivated), the absence of features from the +masked backbone compels the model to reactivate previous redundant features of +the other modality backbone. Therefore, these redundant features and relevant +redundant parameters can be identified via the reactivation process. The +redundant parameters can be pruned by our proposed importance score evaluation +function, Alternative Evaluation (AlterEva), which is based on the observation +of the loss changes when certain modality parameters are activated and +deactivated. Extensive experiments on the nuScene and KITTI datasets +encompassing diverse tasks, baseline models, and pruning algorithms showcase +that AlterMOMA outperforms existing pruning methods, attaining state-of-the-art +performance. + +
+
+ comment: 17 pages, 3 figures, Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Episodic Memory Verbalization using Hierarchical Representations of + Life-Long Robot Experience + + +
+ Verbalization of robot experience, i.e., summarization of and question +answering about a robot's past, is a crucial ability for improving human-robot +interaction. Previous works applied rule-based systems or fine-tuned deep +models to verbalize short (several-minute-long) streams of episodic data, +limiting generalization and transferability. In our work, we apply large +pretrained models to tackle this task with zero or few examples, and +specifically focus on verbalizing life-long experiences. For this, we derive a +tree-like data structure from episodic memory (EM), with lower levels +representing raw perception and proprioception data, and higher levels +abstracting events to natural language concepts. Given such a hierarchical +representation built from the experience stream, we apply a large language +model as an agent to interactively search the EM given a user's query, +dynamically expanding (initially collapsed) tree nodes to find the relevant +information. The approach keeps computational costs low even when scaling to +months of robot experience data. We evaluate our method on simulated household +robot data, human egocentric videos, and real-world robot recordings, +demonstrating its flexibility and scalability. + +
+
+ comment: Code, data and demo videos at https://hierarchical-emv.github.io +
+
+
+
+
+ + ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ☆ The application of GPT-4 in grading design university students' + assignment and providing feedback: An exploratory study + + +
+ This study aims to investigate whether GPT-4 can effectively grade +assignments for design university students and provide useful feedback. In +design education, assignments do not have a single correct answer and often +involve solving an open-ended design problem. This subjective nature of design +projects often leads to grading problems,as grades can vary between different +raters,for instance instructor from engineering background or architecture +background. This study employs an iterative research approach in developing a +Custom GPT with the aim of achieving more reliable results and testing whether +it can provide design students with constructive feedback. The findings +include: First,through several rounds of iterations the inter-reliability +between GPT and human raters reached a level that is generally accepted by +educators. This indicates that by providing accurate prompts to GPT,and +continuously iterating to build a Custom GPT, it can be used to effectively +grade students' design assignments, serving as a reliable complement to human +raters. Second, the intra-reliability of GPT's scoring at different times is +between 0.65 and 0.78. This indicates that, with adequate instructions, a +Custom GPT gives consistent results which is a precondition for grading +students. As consistency and comparability are the two main rules to ensure the +reliability of educational assessment, this study has looked at whether a +Custom GPT can be developed that adheres to these two rules. We finish the +paper by testing whether Custom GPT can provide students with useful feedback +and reflecting on how educators can develop and iterate a Custom GPT to serve +as a complementary rater. + +
+
+ comment: 25 pages, 5 figures +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Efficient Bias Mitigation Without Privileged Information ECCV + 2024 + + +
+ Deep neural networks trained via empirical risk minimisation often exhibit +significant performance disparities across groups, particularly when group and +task labels are spuriously correlated (e.g., "grassy background" and "cows"). +Existing bias mitigation methods that aim to address this issue often either +rely on group labels for training or validation, or require an extensive +hyperparameter search. Such data and computational requirements hinder the +practical deployment of these methods, especially when datasets are too large +to be group-annotated, computational resources are limited, and models are +trained through already complex pipelines. In this paper, we propose Targeted +Augmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework +that leverages the entire training history of a helper model to identify +spurious samples, and generate a group-balanced training set from which a +robust model can be trained. We show that TAB improves worst-group performance +without any group information or model selection, outperforming existing +methods while maintaining overall accuracy. + +
+
+ comment: Accepted at the 18th European Conference on Computer Vision (ECCV + 2024) as an Oral presentation +
+
+
+
+
+ + ☆ Graph Edit Distance with General Costs Using Neural Set Divergence NeurIPS 2024 + + +
+ Graph Edit Distance (GED) measures the (dis-)similarity between two given +graphs, in terms of the minimum-cost edit sequence that transforms one graph to +the other. However, the exact computation of GED is NP-Hard, which has recently +motivated the design of neural methods for GED estimation. However, they do not +explicitly account for edit operations with different costs. In response, we +propose GRAPHEDX, a neural GED estimator that can work with general costs +specified for the four edit operations, viz., edge deletion, edge addition, +node deletion and node addition. We first present GED as a quadratic assignment +problem (QAP) that incorporates these four costs. Then, we represent each graph +as a set of node and edge embeddings and use them to design a family of neural +set divergence surrogates. We replace the QAP terms corresponding to each +operation with their surrogates. Computing such neural set divergence require +aligning nodes and edges of the two graphs. We learn these alignments using a +Gumbel-Sinkhorn permutation generator, additionally ensuring that the node and +edge alignments are consistent with each other. Moreover, these alignments are +cognizant of both the presence and absence of edges between node-pairs. +Experiments on several datasets, under a variety of edit cost settings, show +that GRAPHEDX consistently outperforms state-of-the-art methods and heuristics +in terms of prediction error. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ Artificial Data Point Generation in Clustered Latent Space for Small + Medical Datasets + + +
+ One of the growing trends in machine learning is the use of data generation +techniques, since the performance of machine learning models is dependent on +the quantity of the training dataset. However, in many medical applications, +collecting large datasets is challenging due to resource constraints, which +leads to overfitting and poor generalization. This paper introduces a novel +method, Artificial Data Point Generation in Clustered Latent Space (AGCL), +designed to enhance classification performance on small medical datasets +through synthetic data generation. The AGCL framework involves feature +extraction, K-means clustering, cluster evaluation based on a class separation +metric, and the generation of synthetic data points from clusters with distinct +class representations. This method was applied to Parkinson's disease +screening, utilizing facial expression data, and evaluated across multiple +machine learning classifiers. Experimental results demonstrate that AGCL +significantly improves classification accuracy compared to baseline, GN and +kNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and +cross-validation accuracy of 90.90% in majority voting over different emotions, +confirming its effectiveness in augmenting small datasets. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Preserving logical and functional dependencies in synthetic tabular data + + +
+ Dependencies among attributes are a common aspect of tabular data. However, +whether existing tabular data generation algorithms preserve these dependencies +while generating synthetic data is yet to be explored. In addition to the +existing notion of functional dependencies, we introduce the notion of logical +dependencies among the attributes in this article. Moreover, we provide a +measure to quantify logical dependencies among attributes in tabular data. +Utilizing this measure, we compare several state-of-the-art synthetic data +generation algorithms and test their capability to preserve logical and +functional dependencies on several publicly available datasets. We demonstrate +that currently available synthetic tabular data generation algorithms do not +fully preserve functional dependencies when they generate synthetic datasets. +In addition, we also showed that some tabular synthetic data generation models +can preserve inter-attribute logical dependencies. Our review and comparison of +the state-of-the-art reveal research needs and opportunities to develop +task-specific synthetic tabular data generation models. + +
+
+ comment: Submitted to Pattern Recognition Journal +
+
+
+
+
+ + ☆ Zero- and Few-shot Named Entity Recognition and Text Expansion in + Medication Prescriptions using ChatGPT + + +
+ Introduction: Medication prescriptions are often in free text and include a +mix of two languages, local brand names, and a wide range of idiosyncratic +formats and abbreviations. Large language models (LLMs) have shown promising +ability to generate text in response to input prompts. We use ChatGPT 3.5 to +automatically structure and expand medication statements in discharge summaries +and thus make them easier to interpret for people and machines. Methods: +Named-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and +few-shot setting with different prompt strategies. 100 medication statements +were manually annotated and curated. NER performance was measured by using +strict and partial matching. For the task EX, two experts interpreted the +results by assessing semantic equivalence between original and expanded +statements. The model performance was measured by precision, recall, and F1 +score. Results: For NER, the best-performing prompt reached an average F1 score +of 0.94 in the test set. For EX, the few-shot prompt showed superior +performance among other prompts, with an average F1 score of 0.87. Conclusion: +Our study demonstrates good performance for NER and EX tasks in free-text +medication statements using ChatGPT. Compared to a zero-shot baseline, a +few-shot approach prevented the system from hallucinating, which would be +unacceptable when processing safety-relevant medication data. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ A Fuzzy-based Approach to Predict Human Interaction by Functional + Near-Infrared Spectroscopy + + +
+ The paper introduces a Fuzzy-based Attention (Fuzzy Attention Layer) +mechanism, a novel computational approach to enhance the interpretability and +efficacy of neural models in psychological research. The proposed Fuzzy +Attention Layer mechanism is integrated as a neural network layer within the +Transformer Encoder model to facilitate the analysis of complex psychological +phenomena through neural signals, such as those captured by functional +Near-Infrared Spectroscopy (fNIRS). By leveraging fuzzy logic, the Fuzzy +Attention Layer is capable of learning and identifying interpretable patterns +of neural activity. This capability addresses a significant challenge when +using Transformer: the lack of transparency in determining which specific brain +activities most contribute to particular predictions. Our experimental results +demonstrated on fNIRS data from subjects engaged in social interactions +involving handholding reveal that the Fuzzy Attention Layer not only learns +interpretable patterns of neural activity but also enhances model performance. +Additionally, the learned patterns provide deeper insights into the neural +correlates of interpersonal touch and emotional exchange. The application of +our model shows promising potential in deciphering the subtle complexities of +human social behaviors, thereby contributing significantly to the fields of +social neuroscience and psychological AI. + +
+
+
+
+
+ + ☆ Hierarchical End-to-End Autonomous Driving: Integrating BEV Perception + with Deep Reinforcement Learning + + +
+ End-to-end autonomous driving offers a streamlined alternative to the +traditional modular pipeline, integrating perception, prediction, and planning +within a single framework. While Deep Reinforcement Learning (DRL) has recently +gained traction in this domain, existing approaches often overlook the critical +connection between feature extraction of DRL and perception. In this paper, we +bridge this gap by mapping the DRL feature extraction network directly to the +perception phase, enabling clearer interpretation through semantic +segmentation. By leveraging Bird's-Eye-View (BEV) representations, we propose a +novel DRL-based end-to-end driving framework that utilizes multi-sensor inputs +to construct a unified three-dimensional understanding of the environment. This +BEV-based system extracts and translates critical environmental features into +high-level abstract states for DRL, facilitating more informed control. +Extensive experimental evaluations demonstrate that our approach not only +enhances interpretability but also significantly outperforms state-of-the-art +methods in autonomous driving control tasks, reducing the collision rate by +20%. + +
+
+
+
+
+ + ☆ Prototype based Masked Audio Model for Self-Supervised Learning of Sound + Event Detection ICASSP2025 + + +
+ A significant challenge in sound event detection (SED) is the effective +utilization of unlabeled data, given the limited availability of labeled data +due to high annotation costs. Semi-supervised algorithms rely on labeled data +to learn from unlabeled data, and the performance is constrained by the quality +and size of the former. In this paper, we introduce the Prototype based Masked +Audio Model~(PMAM) algorithm for self-supervised representation learning in +SED, to better exploit unlabeled data. Specifically, semantically rich +frame-level pseudo labels are constructed from a Gaussian mixture model (GMM) +based prototypical distribution modeling. These pseudo labels supervise the +learning of a Transformer-based masked audio model, in which binary +cross-entropy loss is employed instead of the widely used InfoNCE loss, to +provide independent loss contributions from different prototypes, which is +important in real scenarios in which multiple labels may apply to unsupervised +data frames. A final stage of fine-tuning with just a small amount of labeled +data yields a very high performing SED model. On like-for-like tests using the +DESED task, our method achieves a PSDS1 score of 62.5\%, surpassing current +state-of-the-art models and demonstrating the superiority of the proposed +technique. + +
+
+ comment: Submitted to ICASSP2025; The code for this paper will be available at + https://github.com/cai525/Transformer4SED after the paper is accepted +
+
+
+
+
+ + ☆ AssistantX: An LLM-Powered Proactive Assistant in Collaborative + Human-Populated Environment + + +
+ The increasing demand for intelligent assistants in human-populated +environments has motivated significant research in autonomous robotic systems. +Traditional service robots and virtual assistants, however, struggle with +real-world task execution due to their limited capacity for dynamic reasoning +and interaction, particularly when human collaboration is required. Recent +developments in Large Language Models have opened new avenues for improving +these systems, enabling more sophisticated reasoning and natural interaction +capabilities. In this paper, we introduce AssistantX, an LLM-powered proactive +assistant designed to operate autonomously in a physical office environment. +Unlike conventional service robots, AssistantX leverages a novel multi-agent +architecture, PPDR4X, which provides advanced inference capabilities and +comprehensive collaboration awareness. By effectively bridging the gap between +virtual operations and physical interactions, AssistantX demonstrates robust +performance in managing complex real-world scenarios. Our evaluation highlights +the architecture's effectiveness, showing that AssistantX can respond to clear +instructions, actively retrieve supplementary information from memory, and +proactively seek collaboration from team members to ensure successful task +completion. More details and videos can be found at +https://assistantx-agent.github.io/AssistantX/. + +
+
+ comment: 6 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ FactorSim: Generative Simulation via Factorized Representation + + +
+ Generating simulations to train intelligent agents in game-playing and +robotics from natural language input, from user input or task documentation, +remains an open-ended challenge. Existing approaches focus on parts of this +challenge, such as generating reward functions or task hyperparameters. Unlike +previous work, we introduce FACTORSIM that generates full simulations in code +from language input that can be used to train agents. Exploiting the structural +modularity specific to coded simulations, we propose to use a factored +partially observable Markov decision process representation that allows us to +reduce context dependence during each step of the generation. For evaluation, +we introduce a generative simulation benchmark that assesses the generated +simulation code's accuracy and effectiveness in facilitating zero-shot +transfers in reinforcement learning settings. We show that FACTORSIM +outperforms existing methods in generating simulations regarding prompt +alignment (e.g., accuracy), zero-shot transfer abilities, and human evaluation. +We also demonstrate its effectiveness in generating robotic tasks. + +
+
+ comment: neurips 2024, project website: + https://cs.stanford.edu/~sunfanyun/factorsim/ +
+
+
+
+
+ + ☆ Digital Twin Ecosystem for Oncology Clinical Operations + + +
+ Artificial Intelligence (AI) and Large Language Models (LLMs) hold +significant promise in revolutionizing healthcare, especially in clinical +applications. Simultaneously, Digital Twin technology, which models and +simulates complex systems, has gained traction in enhancing patient care. +However, despite the advances in experimental clinical settings, the potential +of AI and digital twins to streamline clinical operations remains largely +untapped. This paper introduces a novel digital twin framework specifically +designed to enhance oncology clinical operations. We propose the integration of +multiple specialized digital twins, such as the Medical Necessity Twin, Care +Navigator Twin, and Clinical History Twin, to enhance workflow efficiency and +personalize care for each patient based on their unique data. Furthermore, by +synthesizing multiple data sources and aligning them with the National +Comprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care +Path, a continuously evolving knowledge base that enables these digital twins +to provide precise, tailored clinical recommendations. + +
+
+ comment: Pre Print +
+
+
+
+
+ + ☆ AI Delegates with a Dual Focus: Ensuring Privacy and Strategic + Self-Disclosure + + +
+ Large language model (LLM)-based AI delegates are increasingly utilized to +act on behalf of users, assisting them with a wide range of tasks through +conversational interfaces. Despite their advantages, concerns arise regarding +the potential risk of privacy leaks, particularly in scenarios involving social +interactions. While existing research has focused on protecting privacy by +limiting the access of AI delegates to sensitive user information, many social +scenarios require disclosing private details to achieve desired outcomes, +necessitating a balance between privacy protection and disclosure. To address +this challenge, we conduct a pilot study to investigate user preferences for AI +delegates across various social relations and task scenarios, and then propose +a novel AI delegate system that enables privacy-conscious self-disclosure. Our +user study demonstrates that the proposed AI delegate strategically protects +privacy, pioneering its use in diverse and dynamic social interactions. + +
+
+
+
+
+ + ☆ T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training + on an Assistant Task for a Target Task + + +
+ Long text summarization, gradually being essential for efficiently processing +large volumes of information, stays challenging for Large Language Models +(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced +training datasets and the high requirement of contextual details dealing. To +address the issue, we design a novel zero-shot transfer learning framework, +abbreviated as T3, to iteratively training a baseline LLM on an assistant task +for the target task, where the former should own richer data resources and +share structural or semantic similarity with the latter. In practice, T3 is +approached to deal with the long text summarization task by utilizing question +answering as the assistant task, and further validated its effectiveness on the +BBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14% +improvement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore +compared to three baseline LLMs, demonstrating its potential for more +assistant-target task combinations. + +
+
+
+
+
+ + ☆ P4Q: Learning to Prompt for Quantization in Visual-language Models + + +
+ Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence +in various visual and multimodal tasks, yet the deployment of VLMs on +downstream application platforms remains challenging due to their prohibitive +requirements of training samples and computing resources. Fine-tuning and +quantization of VLMs can substantially reduce the sample and computation costs, +which are in urgent need. There are two prevailing paradigms in quantization, +Quantization-Aware Training (QAT) can effectively quantize large-scale VLMs but +incur a huge training cost, while low-bit Post-Training Quantization (PTQ) +suffers from a notable performance drop. We propose a method that balances +fine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which +we design a lightweight architecture to leverage contrastive loss supervision +to enhance the recognition performance of a PTQ model. Our method can +effectively reduce the gap between image features and text features caused by +low-bit quantization, based on learnable prompts to reorganize textual +representations and a low-bit adapter to realign the distributions of image and +text features. We also introduce a distillation loss based on cosine similarity +predictions to distill the quantized model using a full-precision teacher. +Extensive experimental results demonstrate that our P4Q method outperforms +prior arts, even achieving comparable results to its full-precision +counterparts. For instance, our 8-bit P4Q can theoretically compress the +CLIP-ViT/B-32 by 4 $\times$ while achieving 66.94\% Top-1 accuracy, +outperforming the learnable prompt fine-tuned full-precision model by 2.24\% +with negligible additional parameters on the ImageNet dataset. + +
+
+
+
+
+ + ☆ Hand-object reconstruction via interaction-aware graph attention + mechanism ICIP 2024 + + +
+ Estimating the poses of both a hand and an object has become an important +area of research due to the growing need for advanced vision computing. The +primary challenge involves understanding and reconstructing how hands and +objects interact, such as contact and physical plausibility. Existing +approaches often adopt a graph neural network to incorporate spatial +information of hand and object meshes. However, these approaches have not fully +exploited the potential of graphs without modification of edges within and +between hand- and object-graphs. We propose a graph-based refinement method +that incorporates an interaction-aware graph-attention mechanism to account for +hand-object interactions. Using edges, we establish connections among closely +correlated nodes, both within individual graphs and across different graphs. +Experiments demonstrate the effectiveness of our proposed method with notable +improvements in the realm of physical plausibility. + +
+
+ comment: 7 pages, Accepted by ICIP 2024 +
+
+
+
+
+ + ☆ Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric + GNNs NeurIPS 2024 + + +
+ Geometric graph neural networks (GNNs) have emerged as powerful tools for +modeling molecular geometry. However, they encounter limitations in effectively +capturing long-range interactions in large molecular systems. To address this +challenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs +to expand the scope of their capabilities by incorporating mesh points +alongside atoms and reimaging traditional mathematical operations in a +trainable manner. Neural P$^3$M exhibits flexibility across a wide range of +molecular systems and demonstrates remarkable accuracy in predicting energies +and forces, outperforming on benchmarks such as the MD22 dataset. It also +achieves an average improvement of 22% on the OE62 dataset while integrating +with various architectures. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ☆ Dirichlet-Based Coarse-to-Fine Example Selection For Open-Set Annotation + + +
+ Active learning (AL) has achieved great success by selecting the most +valuable examples from unlabeled data. However, they usually deteriorate in +real scenarios where open-set noise gets involved, which is studied as open-set +annotation (OSA). In this paper, we owe the deterioration to the unreliable +predictions arising from softmax-based translation invariance and propose a +Dirichlet-based Coarse-to-Fine Example Selection (DCFS) strategy accordingly. +Our method introduces simplex-based evidential deep learning (EDL) to break +translation invariance and distinguish known and unknown classes by considering +evidence-based data and distribution uncertainty simultaneously. Furthermore, +hard known-class examples are identified by model discrepancy generated from +two classifier heads, where we amplify and alleviate the model discrepancy +respectively for unknown and known classes. Finally, we combine the discrepancy +with uncertainties to form a two-stage strategy, selecting the most informative +examples from known classes. Extensive experiments on various openness ratio +datasets demonstrate that DCFS achieves state-of-art performance. + +
+
+
+
+
+ + ☆ Open Digital Rights Enforcement Framework (ODRE): from descriptive to + enforceable policies + + +
+ From centralised platforms to decentralised ecosystems, like Data Spaces, +sharing data has become a paramount challenge. For this reason, the definition +of data usage policies has become crucial in these domains, highlighting the +necessity of effective policy enforcement mechanisms. The Open Digital Rights +Language (ODRL) is a W3C standard ontology designed to describe data usage +policies, however, it lacks built-in enforcement capabilities, limiting its +practical application. This paper introduces the Open Digital Rights +Enforcement (ODRE) framework, whose goal is to provide ODRL with enforcement +capabilities. The ODRE framework proposes a novel approach to express ODRL +policies that integrates the descriptive ontology terms of ODRL with other +languages that allow behaviour specification, such as dynamic data handling or +function evaluation. The framework includes an enforcement algorithm for ODRL +policies and two open-source implementations in Python and Java. The ODRE +framework is also designed to support future extensions of ODRL to specific +domain scenarios. In addition, current limitations of ODRE, ODRL, and current +challenges are reported. Finally, to demonstrate the enforcement capabilities +of the implementations, their performance, and their extensibility features, +several experiments have been carried out with positive results. + +
+
+ comment: 20 pages, 3 Figures, Submitted to Computers & Security journal +
+
+
+
+
+ + ☆ TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for + Multimodal Contrastive Learning + + +
+ Pre-trained large models for multimodal contrastive learning, such as CLIP, +have been widely recognized in the industry as highly susceptible to +data-poisoned backdoor attacks. This poses significant risks to downstream +model training. In response to such potential threats, finetuning offers a +simpler and more efficient defense choice compared to retraining large models +with augmented data. In the supervised learning domain, fine-tuning defense +strategies can achieve excellent defense performance. However, in the +unsupervised and semi-supervised domain, we find that when CLIP faces some +complex attack techniques, the existing fine-tuning defense strategy, +CleanCLIP, has some limitations on defense performance. The synonym +substitution of its text-augmentation is insufficient to enhance the text +feature space. To compensate for this weakness, we improve it by proposing a +fine-grained \textbf{T}ext \textbf{A}lignment \textbf{C}leaner (TA-Cleaner) to +cut off feature connections of backdoor triggers. We randomly select a few +samples for positive and negative subtext generation at each epoch of +CleanCLIP, and align the subtexts to the images to strengthen the text +self-supervision. We evaluate the effectiveness of our TA-Cleaner against six +attack algorithms and conduct comprehensive zero-shot classification tests on +ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves +state-of-the-art defensiveness among finetuning-based defense techniques. Even +when faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms +CleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\% and 63.88\%, +respectively. + +
+
+
+
+
+ + ☆ Subjective and Objective Quality-of-Experience Evaluation Study for Live + Video Streaming + + +
+ In recent years, live video streaming has gained widespread popularity across +various social media platforms. Quality of experience (QoE), which reflects +end-users' satisfaction and overall experience, plays a critical role for media +service providers to optimize large-scale live compression and transmission +strategies to achieve perceptually optimal rate-distortion trade-off. Although +many QoE metrics for video-on-demand (VoD) have been proposed, there remain +significant challenges in developing QoE metrics for live video streaming. To +bridge this gap, we conduct a comprehensive study of subjective and objective +QoE evaluations for live video streaming. For the subjective QoE study, we +introduce the first live video streaming QoE dataset, TaoLive QoE, which +consists of $42$ source videos collected from real live broadcasts and $1,155$ +corresponding distorted ones degraded due to a variety of streaming +distortions, including conventional streaming distortions such as compression, +stalling, as well as live streaming-specific distortions like frame skipping, +variable frame rate, etc. Subsequently, a human study was conducted to derive +subjective QoE scores of videos in the TaoLive QoE dataset. For the objective +QoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well +as publicly available QoE datasets for VoD scenarios, highlighting that current +models struggle to accurately assess video QoE, particularly for live content. +Hence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates +multi-scale semantic features and optical flow-based motion features to +predicting a retrospective QoE score, eliminating reliance on statistical +quality of service (QoS) features. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Deep Manifold Part 1: Anatomy of Neural Network Manifold + + +
+ Based on the numerical manifold method principle, we developed a mathematical +framework of a neural network manifold: Deep Manifold and discovered that +neural networks: 1) is numerical computation combining forward and inverse; 2) +have near infinite degrees of freedom; 3) exponential learning capacity with +depth; 4) have self-progressing boundary conditions; 5) has training hidden +bottleneck. We also define two concepts: neural network learning space and deep +manifold space and introduce two concepts: neural network intrinsic pathway and +fixed point. We raise three fundamental questions: 1). What is the training +completion definition; 2). where is the deep learning convergence point (neural +network fixed point); 3). How important is token timestamp in training data +given negative time is critical in inverse problem. + +
+
+
+
+
+ + ☆ Improving Fast Adversarial Training via Self-Knowledge Guidance + + +
+ Adversarial training has achieved remarkable advancements in defending +against adversarial attacks. Among them, fast adversarial training (FAT) is +gaining attention for its ability to achieve competitive robustness with fewer +computing resources. Existing FAT methods typically employ a uniform strategy +that optimizes all training data equally without considering the influence of +different examples, which leads to an imbalanced optimization. However, this +imbalance remains unexplored in the field of FAT. In this paper, we conduct a +comprehensive study of the imbalance issue in FAT and observe an obvious class +disparity regarding their performances. This disparity could be embodied from a +perspective of alignment between clean and robust accuracy. Based on the +analysis, we mainly attribute the observed misalignment and disparity to the +imbalanced optimization in FAT, which motivates us to optimize different +training data adaptively to enhance robustness. Specifically, we take disparity +and misalignment into consideration. First, we introduce self-knowledge guided +regularization, which assigns differentiated regularization weights to each +class based on its training state, alleviating class disparity. Additionally, +we propose self-knowledge guided label relaxation, which adjusts label +relaxation according to the training accuracy, alleviating the misalignment and +improving robustness. By combining these methods, we formulate the +Self-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge +during training to enhance the adversarial robustness without compromising +training efficiency. Extensive experiments on four standard datasets +demonstrate that the SKG-FAT improves the robustness and preserves competitive +clean accuracy, outperforming the state-of-the-art methods. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Multimodal Banking Dataset: Understanding Client Needs through Event + Sequences + + +
+ Financial organizations collect a huge amount of data about clients that +typically has a temporal (sequential) structure and is collected from various +sources (modalities). Due to privacy issues, there are no large-scale +open-source multimodal datasets of event sequences, which significantly limits +the research in this area. In this paper, we present the industrial-scale +publicly available multimodal banking dataset, MBD, that contains more than +1.5M corporate clients with several modalities: 950M bank transactions, 1B geo +position events, 5M embeddings of dialogues with technical support and monthly +aggregated purchases of four bank's products. All entries are properly +anonymized from real proprietary bank data. Using this dataset, we introduce a +novel benchmark with two business tasks: campaigning (purchase prediction in +the next month) and matching of clients. We provide numerical results that +demonstrate the superiority of our multi-modal baselines over single-modal +techniques for each task. As a result, the proposed dataset can open new +perspectives and facilitate the future development of practically important +large-scale multimodal algorithms for event sequences. + HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD + Github Link: https://github.com/Dzhambo/MBD + +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ A Scalable Data-Driven Framework for Systematic Analysis of SEC 10-K + Filings Using Large Language Models + + +
+ The number of companies listed on the NYSE has been growing exponentially, +creating a significant challenge for market analysts, traders, and stockholders +who must monitor and assess the performance and strategic shifts of a large +number of companies regularly. There is an increasing need for a fast, +cost-effective, and comprehensive method to evaluate the performance and detect +and compare many companies' strategy changes efficiently. We propose a novel +data-driven approach that leverages large language models (LLMs) to +systematically analyze and rate the performance of companies based on their SEC +10-K filings. These filings, which provide detailed annual reports on a +company's financial performance and strategic direction, serve as a rich source +of data for evaluating various aspects of corporate health, including +confidence, environmental sustainability, innovation, and workforce management. +We also introduce an automated system for extracting and preprocessing 10-K +filings. This system accurately identifies and segments the required sections +as outlined by the SEC, while also isolating key textual content that contains +critical information about the company. This curated data is then fed into +Cohere's Command-R+ LLM to generate quantitative ratings across various +performance metrics. These ratings are subsequently processed and visualized to +provide actionable insights. The proposed scheme is then implemented on an +interactive GUI as a no-code solution for running the data pipeline and +creating the visualizations. The application showcases the rating results and +provides year-on-year comparisons of company performance. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case + Study + + +
+ Extracting meaningful insights from large and complex datasets poses +significant challenges, particularly in ensuring the accuracy and relevance of +retrieved information. Traditional data retrieval methods such as sequential +search and index-based retrieval often fail when handling intricate and +interconnected data structures, resulting in incomplete or misleading outputs. +To overcome these limitations, we introduce Structured-GraphRAG, a versatile +framework designed to enhance information retrieval across structured datasets +in natural language queries. Structured-GraphRAG utilizes multiple knowledge +graphs, which represent data in a structured format and capture complex +relationships between entities, enabling a more nuanced and comprehensive +retrieval of information. This graph-based approach reduces the risk of errors +in language model outputs by grounding responses in a structured format, +thereby enhancing the reliability of results. We demonstrate the effectiveness +of Structured-GraphRAG by comparing its performance with that of a recently +published method using traditional retrieval-augmented generation. Our findings +show that Structured-GraphRAG significantly improves query processing +efficiency and reduces response times. While our case study focuses on soccer +data, the framework's design is broadly applicable, offering a powerful tool +for data analysis and enhancing language model applications across various +structured domains. + +
+
+
+
+
+ + ☆ Dr. GPT in Campus Counseling: Understanding Higher Education Students' + Opinions on LLM-assisted Mental Health Services + + +
+ In response to the increasing mental health challenges faced by college +students, we sought to understand their perspectives on how AI applications, +particularly Large Language Models (LLMs), can be leveraged to enhance their +mental well-being. Through pilot interviews with ten diverse students, we +explored their opinions on the use of LLMs across five fictional scenarios: +General Information Inquiry, Initial Screening, Reshaping Patient-Expert +Dynamics, Long-term Care, and Follow-up Care. Our findings revealed that +students' acceptance of LLMs varied by scenario, with participants highlighting +both potential benefits, such as proactive engagement and personalized +follow-up care, and concerns, including limitations in training data and +emotional support. These insights inform how AI technology should be designed +and implemented to effectively support and enhance students' mental well-being, +particularly in scenarios where LLMs can complement traditional methods, while +maintaining empathy and respecting individual preferences. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Showing Many Labels in Multi-label Classification Models: An Empirical + Study of Adversarial Examples + + +
+ With the rapid development of Deep Neural Networks (DNNs), they have been +applied in numerous fields. However, research indicates that DNNs are +susceptible to adversarial examples, and this is equally true in the +multi-label domain. To further investigate multi-label adversarial examples, we +introduce a novel type of attacks, termed "Showing Many Labels". The objective +of this attack is to maximize the number of labels included in the classifier's +prediction results. In our experiments, we select nine attack algorithms and +evaluate their performance under "Showing Many Labels". Eight of the attack +algorithms were adapted from the multi-class environment to the multi-label +environment, while the remaining one was specifically designed for the +multi-label environment. We choose ML-LIW and ML-GCN as target models and train +them on four popular multi-label datasets: VOC2007, VOC2012, NUS-WIDE, and +COCO. We record the success rate of each algorithm when it shows the expected +number of labels in eight different scenarios. Experimental results indicate +that under the "Showing Many Labels", iterative attacks perform significantly +better than one-step attacks. Moreover, it is possible to show all labels in +the dataset. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ Triple Point Masking + + +
+ Existing 3D mask learning methods encounter performance bottlenecks under +limited data, and our objective is to overcome this limitation. In this paper, +we introduce a triple point masking scheme, named TPM, which serves as a +scalable framework for pre-training of masked autoencoders to achieve +multi-mask learning for 3D point clouds. Specifically, we augment the baselines +with two additional mask choices (i.e., medium mask and low mask) as our core +insight is that the recovery process of an object can manifest in diverse ways. +Previous high-masking schemes focus on capturing the global representation but +lack the fine-grained recovery capability, so that the generated pre-trained +weights tend to play a limited role in the fine-tuning process. With the +support of the proposed TPM, available methods can exhibit more flexible and +accurate completion capabilities, enabling the potential autoencoder in the +pre-training stage to consider multiple representations of a single 3D object. +In addition, an SVM-guided weight selection module is proposed to fill the +encoder parameters for downstream networks with the optimal weight during the +fine-tuning stage, maximizing linear accuracy and facilitating the acquisition +of intricate representations for new objects. Extensive experiments show that +the four baselines equipped with the proposed TPM achieve comprehensive +performance improvements on various downstream tasks. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ Just say what you want: only-prompting self-rewarding online preference + optimization + + +
+ We address the challenge of online Reinforcement Learning from Human Feedback +(RLHF) with a focus on self-rewarding alignment methods. In online RLHF, +obtaining feedback requires interaction with the environment, which can be +costly when using additional reward models or the GPT-4 API. Current +self-rewarding approaches rely heavily on the discriminator's judgment +capabilities, which are effective for large-scale models but challenging to +transfer to smaller ones. To address these limitations, we propose a novel, +only-prompting self-rewarding online algorithm that generates preference +datasets without relying on judgment capabilities. Additionally, we employ +fine-grained arithmetic control over the optimality gap between positive and +negative examples, generating more hard negatives in the later stages of +training to help the model better capture subtle human preferences. Finally, we +conduct extensive experiments on two base models, Mistral-7B and +Mistral-Instruct-7B, which significantly bootstrap the performance of the +reference model, achieving 34.5% in the Length-controlled Win Rates of +AlpacaEval 2.0. + +
+
+
+
+
+ + ☆ SimVG: A Simple Framework for Visual Grounding with Decoupled + Multi-modal Fusion NeurIPS2024 + + +
+ Visual grounding is a common vision task that involves grounding descriptive +sentences to the corresponding regions of an image. Most existing methods use +independent image-text encoding and apply complex hand-crafted modules or +encoder-decoder architectures for modal interaction and query reasoning. +However, their performance significantly drops when dealing with complex +textual expressions. This is because the former paradigm only utilizes limited +downstream data to fit the multi-modal feature fusion. Therefore, it is only +effective when the textual expressions are relatively simple. In contrast, +given the wide diversity of textual expressions and the uniqueness of +downstream training data, the existing fusion module, which extracts multimodal +content from a visual-linguistic context, has not been fully investigated. In +this paper, we present a simple yet robust transformer-based framework, SimVG, +for visual grounding. Specifically, we decouple visual-linguistic feature +fusion from downstream tasks by leveraging existing multimodal pre-trained +models and incorporating additional object tokens to facilitate deep +integration of downstream and pre-training tasks. Furthermore, we design a +dynamic weight-balance distillation method in the multi-branch synchronous +learning process to enhance the representation capability of the simpler +branch. This branch only consists of a lightweight MLP, which simplifies the +structure and improves reasoning speed. Experiments on six widely used VG +datasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the +superiority of SimVG. Finally, the proposed method not only achieves +improvements in efficiency and convergence speed but also attains new +state-of-the-art performance on these benchmarks. Codes and models will be +available at \url{https://github.com/Dmmm1997/SimVG}. + +
+
+ comment: 21pages, 11figures, NeurIPS2024 +
+
+
+
+
+ + ☆ Drone Stereo Vision for Radiata Pine Branch Detection and Distance + Measurement: Integrating SGBM and Segmentation Models + + +
+ Manual pruning of radiata pine trees presents significant safety risks due to +their substantial height and the challenging terrains in which they thrive. To +address these risks, this research proposes the development of a drone-based +pruning system equipped with specialized pruning tools and a stereo vision +camera, enabling precise detection and trimming of branches. Deep learning +algorithms, including YOLO and Mask R-CNN, are employed to ensure accurate +branch detection, while the Semi-Global Matching algorithm is integrated to +provide reliable distance estimation. The synergy between these techniques +facilitates the precise identification of branch locations and enables +efficient, targeted pruning. Experimental results demonstrate that the combined +implementation of YOLO and SGBM enables the drone to accurately detect branches +and measure their distances from the drone. This research not only improves the +safety and efficiency of pruning operations but also makes a significant +contribution to the advancement of drone technology in the automation of +agricultural and forestry practices, laying a foundational framework for +further innovations in environmental management. + +
+
+
+
+
+ + ☆ A Generalized LLM-Augmented BIM Framework: Application to a + Speech-to-BIM system + + +
+ Performing building information modeling (BIM) tasks is a complex process +that imposes a steep learning curve and a heavy cognitive load due to the +necessity of remembering sequences of numerous commands. With the rapid +advancement of large language models (LLMs), it is foreseeable that BIM tasks, +including querying and managing BIM data, 4D and 5D BIM, design compliance +checking, or authoring a design, using written or spoken natural language +(i.e., text-to-BIM or speech-to-BIM), will soon supplant traditional graphical +user interfaces. This paper proposes a generalized LLM-augmented BIM framework +to expedite the development of LLM-enhanced BIM applications by providing a +step-by-step development process. The proposed framework consists of six steps: +interpret-fill-match-structure-execute-check. The paper demonstrates the +applicability of the proposed framework through implementing a speech-to-BIM +application, NADIA-S (Natural-language-based Architectural Detailing through +Interaction with Artificial Intelligence via Speech), using exterior wall +detailing as an example. + +
+
+
+
+
+ + ☆ Improving Agent Behaviors with RL Fine-tuning for Autonomous Driving + + +
+ A major challenge in autonomous vehicle research is modeling agent behaviors, +which has critical applications including constructing realistic and reliable +simulations for off-board evaluation and forecasting traffic agents motion for +onboard planning. While supervised learning has shown success in modeling +agents across various domains, these models can suffer from distribution shift +when deployed at test-time. In this work, we improve the reliability of agent +behaviors by closed-loop fine-tuning of behavior models with reinforcement +learning. Our method demonstrates improved overall performance, as well as +improved targeted metrics such as collision rate, on the Waymo Open Sim Agents +challenge. Additionally, we present a novel policy evaluation benchmark to +directly assess the ability of simulated agents to measure the quality of +autonomous vehicle planners and demonstrate the effectiveness of our approach +on this new benchmark. + +
+
+
+
+
+ + ☆ DRL-STNet: Unsupervised Domain Adaptation for Cross-modality Medical + Image Segmentation via Disentangled Representation Learning MICCAI 2024 + + +
+ Unsupervised domain adaptation (UDA) is essential for medical image +segmentation, especially in cross-modality data scenarios. UDA aims to transfer +knowledge from a labeled source domain to an unlabeled target domain, thereby +reducing the dependency on extensive manual annotations. This paper presents +DRL-STNet, a novel framework for cross-modality medical image segmentation that +leverages generative adversarial networks (GANs), disentangled representation +learning (DRL), and self-training (ST). Our method leverages DRL within a GAN +to translate images from the source to the target modality. Then, the +segmentation model is initially trained with these translated images and +corresponding source labels and then fine-tuned iteratively using a combination +of synthetic and real images with pseudo-labels and real labels. The proposed +framework exhibits superior performance in abdominal organ segmentation on the +FLARE challenge dataset, surpassing state-of-the-art methods by 11.4% in the +Dice similarity coefficient and by 13.1% in the Normalized Surface Dice metric, +achieving scores of 74.21% and 80.69%, respectively. The average running time +is 41 seconds, and the area under the GPU memory-time curve is 11,292 MB. These +results indicate the potential of DRL-STNet for enhancing cross-modality +medical image segmentation tasks. + +
+
+ comment: MICCAI 2024 Challenge, FLARE Challenge, Unsupervised domain + adaptation, Organ segmentation, Feature disentanglement, Self-training +
+
+
+
+
+ + ☆ AER-LLM: Ambiguity-aware Emotion Recognition Leveraging Large Language + Models + + +
+ Recent advancements in Large Language Models (LLMs) have demonstrated great +success in many Natural Language Processing (NLP) tasks. In addition to their +cognitive intelligence, exploring their capabilities in emotional intelligence +is also crucial, as it enables more natural and empathetic conversational AI. +Recent studies have shown LLMs' capability in recognizing emotions, but they +often focus on single emotion labels and overlook the complex and ambiguous +nature of human emotions. This study is the first to address this gap by +exploring the potential of LLMs in recognizing ambiguous emotions, leveraging +their strong generalization capabilities and in-context learning. We design +zero-shot and few-shot prompting and incorporate past dialogue as context +information for ambiguous emotion recognition. Experiments conducted using +three datasets indicate significant potential for LLMs in recognizing ambiguous +emotions, and highlight the substantial benefits of including context +information. Furthermore, our findings indicate that LLMs demonstrate a high +degree of effectiveness in recognizing less ambiguous emotions and exhibit +potential for identifying more ambiguous emotions, paralleling human perceptual +capabilities. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ A Fairness-Driven Method for Learning Human-Compatible Negotiation + Strategies EMNLP + + +
+ Despite recent advancements in AI and NLP, negotiation remains a difficult +domain for AI agents. Traditional game theoretic approaches that have worked +well for two-player zero-sum games struggle in the context of negotiation due +to their inability to learn human-compatible strategies. On the other hand, +approaches that only use human data tend to be domain-specific and lack the +theoretical guarantees provided by strategies grounded in game theory. +Motivated by the notion of fairness as a criterion for optimality in general +sum games, we propose a negotiation framework called FDHC which incorporates +fairness into both the reward design and search to learn human-compatible +negotiation strategies. Our method includes a novel, RL+search technique called +LGM-Zero which leverages a pre-trained language model to retrieve +human-compatible offers from large action spaces. Our results show that our +method is able to achieve more egalitarian negotiation outcomes and improve +negotiation quality. + +
+
+ comment: EMNLP Findings 2024 +
+
+
+
+
+ + ☆ Input-Dependent Power Usage in GPUs + + +
+ GPUs are known to be power-hungry, and due to the boom in artificial +intelligence, they are currently the major contributors to the high power +demands of upcoming datacenters. Most GPU usage in these popular workloads +consist of large general matrix-matrix multiplications (GEMMs), which have +therefore been optimized to achieve high utilization of hardware resources. In +this work, we show that modifying the input data to GEMMs, while maintaining +the matrix shapes and sizes can notably change the power consumption of these +kernels. We experiment with four kinds of input variations: value distribution, +bit similarity, placement, and sparsity, across different data types. Our +findings indicate that these variations can change the GPU power usage during +GEMM by almost 40%. We hypothesize that input-dependent power usage variations +occur due to changes in the number of bit flips in the GPUs. We propose +leveraging this property through compiler and scheduler optimizations to manage +power and reduce energy consumption. + +
+
+
+
+
+ + ☆ Cross-Institutional Structured Radiology Reporting for Lung Cancer + Screening Using a Dynamic Template-Constrained Large Language Model + + +
+ Structured radiology reporting is advantageous for optimizing clinical +workflows and patient outcomes. Current LLMs in creating structured reports +face the challenges of formatting errors, content hallucinations, and privacy +leakage concerns when uploaded to external servers. We aim to develop an +enhanced open-source LLM for creating structured and standardized LCS reports +from free-text descriptions. After institutional IRB approvals, 5,442 +de-identified LCS reports from two institutions were retrospectively analyzed. +500 reports were randomly selected from the two institutions evenly and then +manually labeled for evaluation. Two radiologists from the two institutions +developed a standardized template including 29 features for lung nodule +reporting. We proposed template-constrained decoding to enhance +state-of-the-art open-source LLMs, including LLAMA, Qwen, and Mistral. The LLM +performance was extensively evaluated in terms of F1 score, confidence +interval, McNemar test, and z-test. Based on the structured reports created +from the large-scale dataset, a nodule-level retrieval system was prototyped +and an automatic statistical analysis was performed. Our software, +vLLM-structure, is publicly available for local deployment with enhanced LLMs. +Our template-constrained decoding approach consistently enhanced the LLM +performance on multi-institutional datasets, with neither formatting errors nor +content hallucinations. Our method improved the best open-source LLAMA-3.1 405B +by up to 10.42%, and outperformed GPT-4o by 17.19%. A novel nodule retrieval +system was successfully prototyped and demonstrated on a large-scale multimodal +database using our enhanced LLM technologies. The automatically derived +statistical distributions were closely consistent with the prior findings in +terms of nodule type, location, size, status, and Lung-RADS. + +
+
+
+
+
+ + ☆ Embodied-RAG: General non-parametric Embodied Memory for Retrieval and + Generation + + +
+ There is no limit to how much a robot might explore and learn, but all of +that knowledge needs to be searchable and actionable. Within language research, +retrieval augmented generation (RAG) has become the workhouse of large-scale +non-parametric knowledge, however existing techniques do not directly transfer +to the embodied domain, which is multimodal, data is highly correlated, and +perception requires abstraction. + To address these challenges, we introduce Embodied-RAG, a framework that +enhances the foundational model of an embodied agent with a non-parametric +memory system capable of autonomously constructing hierarchical knowledge for +both navigation and language generation. Embodied-RAG handles a full range of +spatial and semantic resolutions across diverse environments and query types, +whether for a specific object or a holistic description of ambiance. At its +core, Embodied-RAG's memory is structured as a semantic forest, storing +language descriptions at varying levels of detail. This hierarchical +organization allows the system to efficiently generate context-sensitive +outputs across different robotic platforms. We demonstrate that Embodied-RAG +effectively bridges RAG to the robotics domain, successfully handling over 200 +explanation and navigation queries across 19 environments, highlighting its +promise for general-purpose non-parametric system for embodied agents. + +
+
+ comment: Web: https://quanting-xie.github.io/Embodied-RAG-web/ +
+
+
+
+
+ + ☆ Harnessing Wavelet Transformations for Generalizable Deepfake Forgery + Detection + + +
+ The evolution of digital image manipulation, particularly with the +advancement of deep generative models, significantly challenges existing +deepfake detection methods, especially when the origin of the deepfake is +obscure. To tackle the increasing complexity of these forgeries, we propose +\textbf{Wavelet-CLIP}, a deepfake detection framework that integrates wavelet +transforms with features derived from the ViT-L/14 architecture, pre-trained in +the CLIP fashion. Wavelet-CLIP utilizes Wavelet Transforms to deeply analyze +both spatial and frequency features from images, thus enhancing the model's +capability to detect sophisticated deepfakes. To verify the effectiveness of +our approach, we conducted extensive evaluations against existing +state-of-the-art methods for cross-dataset generalization and detection of +unseen images generated by standard diffusion models. Our method showcases +outstanding performance, achieving an average AUC of 0.749 for cross-data +generalization and 0.893 for robustness against unseen deepfakes, outperforming +all compared methods. The code can be reproduced from the repo: +\url{https://github.com/lalithbharadwajbaru/Wavelet-CLIP} + +
+
+
+
+
+ + ☆ SOAR: Self-supervision Optimized UAV Action Recognition with Efficient + Object-Aware Pretraining + + +
+ We introduce SOAR, a novel Self-supervised pretraining algorithm for aerial +footage captured by Unmanned Aerial Vehicles (UAVs). We incorporate human +object knowledge throughout the pretraining process to enhance UAV video +pretraining efficiency and downstream action recognition performance. This is +in contrast to prior works that primarily incorporate object information during +the fine-tuning stage. Specifically, we first propose a novel object-aware +masking strategy designed to retain the visibility of certain patches related +to objects throughout the pretraining phase. Second, we introduce an +object-aware loss function that utilizes object information to adjust the +reconstruction loss, preventing bias towards less informative background +patches. In practice, SOAR with a vanilla ViT backbone, outperforms best UAV +action recognition models, recording a 9.7% and 21.4% boost in top-1 accuracy +on the NEC-Drone and UAV-Human datasets, while delivering an inference speed of +18.7ms per video, making it 2x to 5x faster. Additionally, SOAR obtains +comparable accuracy to prior self-supervised learning (SSL) methods while +requiring 87.5% less pretraining time and 25% less memory usage + +
+
+
+
+
+ + ☆ Flat'n'Fold: A Diverse Multi-Modal Dataset for Garment Perception and + Manipulation + + +
+ We present Flat'n'Fold, a novel large-scale dataset for garment manipulation +that addresses critical gaps in existing datasets. Comprising 1,212 human and +887 robot demonstrations of flattening and folding 44 unique garments across 8 +categories, Flat'n'Fold surpasses prior datasets in size, scope, and diversity. +Our dataset uniquely captures the entire manipulation process from crumpled to +folded states, providing synchronized multi-view RGB-D images, point clouds, +and action data, including hand or gripper positions and rotations. We quantify +the dataset's diversity and complexity compared to existing benchmarks and show +that our dataset features natural and diverse manipulations of real-world +demonstrations of human and robot demonstrations in terms of visual and action +information. To showcase Flat'n'Fold's utility, we establish new benchmarks for +grasping point prediction and subtask decomposition. Our evaluation of +state-of-the-art models on these tasks reveals significant room for +improvement. This underscores Flat'n'Fold's potential to drive advances in +robotic perception and manipulation of deformable objects. Our dataset can be +downloaded at https://cvas-ug.github.io/flat-n-fold + +
+
+
+
+
+ + ☆ Enhancing Lossy Compression Through Cross-Field Information for + Scientific Applications + + +
+ Lossy compression is one of the most effective methods for reducing the size +of scientific data containing multiple data fields. It reduces information +density through prediction or transformation techniques to compress the data. +Previous approaches use local information from a single target field when +predicting target data points, limiting their potential to achieve higher +compression ratios. In this paper, we identified significant cross-field +correlations within scientific datasets. We propose a novel hybrid prediction +model that utilizes CNN to extract cross-field information and combine it with +existing local field information. Our solution enhances the prediction accuracy +of lossy compressors, leading to improved compression ratios without +compromising data quality. We evaluate our solution on three scientific +datasets, demonstrating its ability to improve compression ratios by up to 25% +under specific error bounds. Additionally, our solution preserves more data +details and reduces artifacts compared to baseline approaches. + +
+
+ comment: 9 pages, 9 figures, accepted by DRBSD-10 +
+
+
+
+
+ + ☆ Retrospective Comparative Analysis of Prostate Cancer In-Basket + Messages: Responses from Closed-Domain LLM vs. Clinical Teams + + +
+ In-basket message interactions play a crucial role in physician-patient +communication, occurring during all phases (pre-, during, and post) of a +patient's care journey. However, responding to these patients' inquiries has +become a significant burden on healthcare workflows, consuming considerable +time for clinical care teams. To address this, we introduce RadOnc-GPT, a +specialized Large Language Model (LLM) powered by GPT-4 that has been designed +with a focus on radiotherapeutic treatment of prostate cancer with advanced +prompt engineering, and specifically designed to assist in generating +responses. We integrated RadOnc-GPT with patient electronic health records +(EHR) from both the hospital-wide EHR database and an internal, +radiation-oncology-specific database. RadOnc-GPT was evaluated on 158 +previously recorded in-basket message interactions. Quantitative natural +language processing (NLP) analysis and two grading studies with clinicians and +nurses were used to assess RadOnc-GPT's responses. Our findings indicate that +RadOnc-GPT slightly outperformed the clinical care team in "Clarity" and +"Empathy," while achieving comparable scores in "Completeness" and +"Correctness." RadOnc-GPT is estimated to save 5.2 minutes per message for +nurses and 2.4 minutes for clinicians, from reading the inquiry to sending the +response. Employing RadOnc-GPT for in-basket message draft generation has the +potential to alleviate the workload of clinical care teams and reduce +healthcare costs by producing high-quality, timely responses. + +
+
+
+
+
+ + ☆ Criticality and Safety Margins for Reinforcement Learning + + +
+ State of the art reinforcement learning methods sometimes encounter unsafe +situations. Identifying when these situations occur is of interest both for +post-hoc analysis and during deployment, where it might be advantageous to call +out to a human overseer for help. Efforts to gauge the criticality of different +points in time have been developed, but their accuracy is not well established +due to a lack of ground truth, and they are not designed to be easily +interpretable by end users. Therefore, we seek to define a criticality +framework with both a quantifiable ground truth and a clear significance to +users. We introduce true criticality as the expected drop in reward when an +agent deviates from its policy for n consecutive random actions. We also +introduce the concept of proxy criticality, a low-overhead metric that has a +statistically monotonic relationship to true criticality. Safety margins make +these interpretable, when defined as the number of random actions for which +performance loss will not exceed some tolerance with high confidence. We +demonstrate this approach in several environment-agent combinations; for an A3C +agent in an Atari Beamrider environment, the lowest 5% of safety margins +contain 47% of agent losses; i.e., supervising only 5% of decisions could +potentially prevent roughly half of an agent's errors. This criticality +framework measures the potential impacts of bad decisions, even before those +decisions are made, allowing for more effective debugging and oversight of +autonomous agents. + +
+
+ comment: 17 pages, 10 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ UDC: A Unified Neural Divide-and-Conquer Framework for Large-Scale + Combinatorial Optimization Problems + + +
+ Single-stage neural combinatorial optimization solvers have achieved +near-optimal results on various small-scale combinatorial optimization (CO) +problems without needing expert knowledge. However, these solvers exhibit +significant performance degradation when applied to large-scale CO problems. +Recently, two-stage neural methods with divide-and-conquer strategies have +shown efficiency in addressing large-scale CO problems. Nevertheless, the +performance of these methods highly relies on problem-specific heuristics in +either the divide or the conquer procedure, which limits their applicability to +general CO problems. Moreover, these methods employ separate training schemes +and ignore the interdependencies between the dividing and conquering +strategies, which often leads to sub-optimal solutions. To tackle these +drawbacks, this article develops a unified neural divide-and-conquer framework +(i.e., UDC) for solving general large-scale CO problems. UDC offers a +Divide-Conquer-Reunion (DCR) training method to eliminate the negative impact +of a sub-optimal dividing policy. Employing a high-efficiency Graph Neural +Network (GNN) for global instance dividing and a fixed-length sub-path solver +for conquering divided sub-problems, the proposed UDC framework demonstrates +extensive applicability, achieving superior performance in 10 representative +large-scale CO problems. The code is available at +https://github.com/CIAM-Group/NCO_code/tree/main/single_objective/UDC-Large-scale-CO-master. + +
+
+
+
+
+ + ♻ ☆ Is It Good Data for Multilingual Instruction Tuning or Just Bad + Multilingual Evaluation for Large Language Models? EMNLP 2024 + + +
+ Multilingual large language models are designed, claimed, and expected to +cater to speakers of varied languages. We hypothesise that the current +practices of fine-tuning and evaluating these models may not perfectly align +with this objective owing to a heavy reliance on translation, which cannot +cover language-specific knowledge but can introduce translation defects. It +remains unknown whether the nature of the instruction data has an impact on the +model output; conversely, it is questionable whether translated test sets can +capture such nuances. Due to the often coupled practices of using translated +data in both stages, such imperfections could have been overlooked. This work +investigates these issues using controlled native or translated data during the +instruction tuning and evaluation stages. We show that native or generation +benchmarks reveal a notable difference between native and translated +instruction data especially when model performance is high, whereas other types +of test sets cannot. The comparison between round-trip and single-pass +translations reflects the importance of knowledge from language-native +resources. Finally, we demonstrate that regularization is beneficial to +bridging this gap on structured but not generative tasks. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ AI-driven View Guidance System in Intra-cardiac Echocardiography Imaging + + +
+ Intra-cardiac Echocardiography (ICE) is a crucial imaging modality used in +electrophysiology (EP) and structural heart disease (SHD) interventions, +providing real-time, high-resolution views from within the heart. Despite its +advantages, effective manipulation of the ICE catheter requires significant +expertise, which can lead to inconsistent outcomes, particularly among less +experienced operators. To address this challenge, we propose an AI-driven +closed-loop view guidance system with human-in-the-loop feedback, designed to +assist users in navigating ICE imaging without requiring specialized knowledge. +Our method models the relative position and orientation vectors between +arbitrary views and clinically defined ICE views in a spatial coordinate +system, guiding users on how to manipulate the ICE catheter to transition from +the current view to the desired view over time. Operating in a closed-loop +configuration, the system continuously predicts and updates the necessary +catheter manipulations, ensuring seamless integration into existing clinical +workflows. The effectiveness of the proposed system is demonstrated through a +simulation-based evaluation, achieving an 89% success rate with the 6532 test +dataset, highlighting its potential to improve the accuracy and efficiency of +ICE imaging procedures. + +
+
+
+
+
+ + ♻ ☆ Learning Interactive Real-World Simulators + + +
+ Generative models trained on internet data have revolutionized how text, +image, and video content can be created. Perhaps the next milestone for +generative models is to simulate realistic experience in response to actions +taken by humans, robots, and other interactive agents. Applications of a +real-world simulator range from controllable content creation in games and +movies, to training embodied agents purely in simulation that can be directly +deployed in the real world. We explore the possibility of learning a universal +simulator (UniSim) of real-world interaction through generative modeling. We +first make the important observation that natural datasets available for +learning a real-world simulator are often rich along different dimensions +(e.g., abundant objects in image data, densely sampled actions in robotics +data, and diverse movements in navigation data). With careful orchestration of +diverse datasets, each providing a different aspect of the overall experience, +we can simulate the visual outcome of both high-level instructions such as +"open the drawer" and low-level controls from otherwise static scenes and +objects. We use the simulator to train both high-level vision-language policies +and low-level reinforcement learning policies, each of which can be deployed in +the real world in zero shot after training purely in simulation. We also show +that other types of intelligence such as video captioning models can benefit +from training with simulated experience, opening up even wider applications. +Video demos can be found at https://universal-simulator.github.io. + +
+
+ comment: https://universal-simulator.github.io +
+
+
+
+
+ + ♻ ☆ Ascend HiFloat8 Format for Deep Learning + + +
+ This preliminary white paper proposes a novel 8-bit floating-point data +format HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered +precision. For normal value encoding, it provides 7 exponent values with 3-bit +mantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with +1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7 +extra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades). +Meanwhile, HiF8 encodes all the special values except that positive zero and +negative zero are represented by only one bit-pattern. Thanks to the better +balance between precision and dynamic range, HiF8 can be simultaneously used in +both forward and backward passes of AI training. In this paper, we will +describe the definition and rounding methods of HiF8, as well as the tentative +training and inference solutions. To demonstrate the efficacy of HiF8, massive +simulation results on various neural networks, including traditional neural +networks and large language models (LLMs), will also be presented. + +
+
+ comment: 13 Pages, 4 Figures, 9 Tables +
+
+
+
+
+ + ♻ ☆ KAG: Boosting LLMs in Professional Domains via Knowledge Augmented + Generation + + +
+ The recently developed retrieval-augmented generation (RAG) technology has +enabled the efficient construction of domain-specific applications. However, it +also has limitations, including the gap between vector similarity and the +relevance of knowledge reasoning, as well as insensitivity to knowledge logic, +such as numerical values, temporal relations, expert rules, and others, which +hinder the effectiveness of professional knowledge services. In this work, we +introduce a professional domain knowledge service framework called Knowledge +Augmented Generation (KAG). KAG is designed to address the aforementioned +challenges with the motivation of making full use of the advantages of +knowledge graph(KG) and vector retrieval, and to improve generation and +reasoning performance by bidirectionally enhancing large language models (LLMs) +and KGs through five key aspects: (1) LLM-friendly knowledge representation, +(2) mutual-indexing between knowledge graphs and original chunks, (3) +logical-form-guided hybrid reasoning engine, (4) knowledge alignment with +semantic reasoning, and (5) model capability enhancement for KAG. We compared +KAG with existing RAG methods in multihop question answering and found that it +significantly outperforms state-of-theart methods, achieving a relative +improvement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We +have successfully applied KAG to two professional knowledge Q&A tasks of Ant +Group, including E-Government Q&A and E-Health Q&A, achieving significant +improvement in professionalism compared to RAG methods. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ HAICOSYSTEM: An Ecosystem for Sandboxing Safety Risks in Human-AI + Interactions + + +
+ AI agents are increasingly autonomous in their interactions with human users +and tools, leading to increased interactional safety risks. We present +HAICOSYSTEM, a framework examining AI agent safety within diverse and complex +social interactions. HAICOSYSTEM features a modular sandbox environment that +simulates multi-turn interactions between human users and AI agents, where the +AI agents are equipped with a variety of tools (e.g., patient management +platforms) to navigate diverse scenarios (e.g., a user attempting to access +other patients' profiles). To examine the safety of AI agents in these +interactions, we develop a comprehensive multi-dimensional evaluation framework +that uses metrics covering operational, content-related, societal, and legal +risks. Through running 1840 simulations based on 92 scenarios across seven +domains (e.g., healthcare, finance, education), we demonstrate that HAICOSYSTEM +can emulate realistic user-AI interactions and complex tool use by AI agents. +Our experiments show that state-of-the-art LLMs, both proprietary and +open-sourced, exhibit safety risks in over 50\% cases, with models generally +showing higher risks when interacting with simulated malicious users. Our +findings highlight the ongoing challenge of building agents that can safely +navigate complex interactions, particularly when faced with malicious users. To +foster the AI agent safety ecosystem, we release a code platform that allows +practitioners to create custom scenarios, simulate interactions, and evaluate +the safety and performance of their agents. + +
+
+ comment: Both the second and third authors contributed equally +
+
+
+
+
+ + ♻ ☆ TypeFly: Flying Drones with Large Language Model + + +
+ Recent advancements in robot control using large language models (LLMs) have +demonstrated significant potential, primarily due to LLMs' capabilities to +understand natural language commands and generate executable plans in various +languages. However, in real-time and interactive applications involving mobile +robots, particularly drones, the sequential token generation process inherent +to LLMs introduces substantial latency, i.e. response time, in control plan +generation. + In this paper, we present a system called ChatFly that tackles this problem +using a combination of a novel programming language called MiniSpec and its +runtime to reduce the plan generation time and drone response time. That is, +instead of asking an LLM to write a program (robotic plan) in the popular but +verbose Python, ChatFly gets it to do it in MiniSpec specially designed for +token efficiency and stream interpretation. Using a set of challenging drone +tasks, we show that design choices made by ChatFly can reduce up to 62% +response time and provide a more consistent user experience, enabling +responsive and intelligent LLM-based drone control with efficient completion. + +
+
+
+
+
+ + ♻ ☆ LingoQA: Visual Question Answering for Autonomous Driving ECCV 2024 + + +
+ We introduce LingoQA, a novel dataset and benchmark for visual question +answering in autonomous driving. The dataset contains 28K unique short video +scenarios, and 419K annotations. Evaluating state-of-the-art vision-language +models on our benchmark shows that their performance is below human +capabilities, with GPT-4V responding truthfully to 59.6% of the questions +compared to 96.6% for humans. For evaluation, we propose a truthfulness +classifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation +coefficient to human evaluations, surpassing existing techniques like METEOR, +BLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run +extensive ablation studies to understand its performance. We release our +dataset and benchmark as an evaluation platform for vision-language models in +autonomous driving. + +
+
+ comment: Accepted to ECCV 2024. Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with + Tabular Node Features + + +
+ Tabular machine learning is an important field for industry and science. In +this field, table rows are usually treated as independent data samples, but +additional information about relations between them is sometimes available and +can be used to improve predictive performance. Such information can be +naturally modeled with a graph, thus tabular machine learning may benefit from +graph machine learning methods. However, graph machine learning models are +typically evaluated on datasets with homogeneous node features, which have +little in common with heterogeneous mixtures of numerical and categorical +features present in tabular datasets. Thus, there is a critical difference +between the data used in tabular and graph machine learning studies, which does +not allow one to understand how successfully graph models can be transferred to +tabular data. To bridge this gap, we propose a new benchmark of diverse graphs +with heterogeneous tabular node features and realistic prediction tasks. We use +this benchmark to evaluate a vast set of models, including simple methods +previously overlooked in the literature. Our experiments show that graph neural +networks (GNNs) can indeed often bring gains in predictive performance for +tabular data, but standard tabular models also can be adapted to work with +graph data by using simple feature preprocessing, which sometimes enables them +to compete with and even outperform GNNs. Based on our empirical study, we +provide insights for researchers and practitioners in both tabular and graph +machine learning fields. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Framework for Evaluating API-oriented Code Generation in + Large Language Models + + +
+ Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as +powerful tools for code generation, significantly enhancing productivity and +accelerating software development. However, existing benchmarks primarily focus +on general code generation without considering API-oriented code generation, +i.e., generating code that invokes APIs from specific libraries. Given the +growing demand for API-oriented code generation, there is a pressing need for a +systematic and automated approach to evaluate LLM on API-oriented code +generation. To address this gap, we propose AutoAPIEval, a lightweight and +automated framework designed to evaluate the capabilities of LLMs in +API-oriented code generation. Our framework works with any library that +provides API documentation and focuses on two unit tasks: API recommendation +and code example generation, along with four metrics to evaluate the generated +APIs and code examples, such as the proportion of incorrect API recommendations +for Task 1, and the proportion of code examples where no specific API is +invoked and uncompilable/unexecutable code examples for Task 2. In addition, we +conducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder) +and Java Runtime Environment 8 to demonstrate the framework's effectiveness. +Our findings reveal substantial variability in LLM performance across tasks, +with ChatGPT adhering better to instructions, while sharing similar +effectiveness in code example generation with its counterparts (i.e., MagiCoder +and DeekSeek Coder). We also identify key factors associated with code quality, +such as API popularity and model confidence, and build classifiers that achieve +high accuracy in detecting incorrect API recommendations and erroneous code +examples. Retrieval-augmented generation enhances the quality of code generated +by LLMs, though its effectiveness varies across different LLMs. + +
+
+
+
+
+ + ♻ ☆ Transformers, Contextualism, and Polysemy + + +
+ The transformer architecture, introduced by Vaswani et al. (2017), is at the +heart of the remarkable recent progress in the development of language models, +including widely-used chatbots such as Chat-GPT and Claude. In this paper, I +argue that we can extract from the way the transformer architecture works a +theory of the relationship between context and meaning. I call this the +transformer theory, and I argue that it is novel with regard to two related +philosophical debates: the contextualism debate regarding the extent of +context-sensitivity across natural language, and the polysemy debate regarding +how polysemy should be captured within an account of word meaning. + +
+
+
+
+
+ + ♻ ☆ Opponent Shaping for Antibody Development + + +
+ Anti-viral therapies are typically designed to target the current strains of +a virus. Game theoretically, this corresponds to a short-sighted, or myopic, +response. However, therapy-induced selective pressures act on viral antigens to +drive the emergence of mutated strains, against which initial therapies have +reduced efficacy. Building on a computational model of binding between +antibodies and viral antigens (the Absolut! framework), we design and implement +a genetic simulation of such viral evolutionary escape. Crucially, this allows +our antibody optimisation algorithm to consider and influence the entire escape +curve of the virus, i.e. to guide (or ''shape'') the viral evolution. This is +inspired by opponent shaping which, in general-sum learning, accounts for the +adaptation of the co-player rather than playing a myopic best response. Hence +we call the optimised antibodies shapers. Within our simulations, we +demonstrate that our shapers target both current and simulated future viral +variants, outperforming the antibodies chosen in a myopic way. Furthermore, we +show that shapers exert specific evolutionary pressure on the virus compared to +myopic antibodies. Altogether, shapers modify the evolutionary trajectories of +viral strains and minimise the viral escape compared to their myopic +counterparts. While this is a simplified model, we hope that our proposed +paradigm will enable the discovery of better long-lived vaccines and antibody +therapies in the future, enabled by rapid advancements in the capabilities of +simulation tools. Our code is available at +https://github.com/olakalisz/antibody-shapers. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ Discrete, compositional, and symbolic representations through attractor + dynamics + + +
+ Symbolic systems are powerful frameworks for modeling cognitive processes as +they encapsulate the rules and relationships fundamental to many aspects of +human reasoning and behavior. Central to these models are systematicity, +compositionality, and productivity, making them invaluable in both cognitive +science and artificial intelligence. However, certain limitations remain. For +instance, the integration of structured symbolic processes and latent +sub-symbolic processes has been implemented at the computational level through +fiat methods such as quantization or softmax sampling, which assume, rather +than derive, the operations underpinning discretization and symbolicization. In +this work, we introduce a novel neural stochastic dynamical systems model that +integrates attractor dynamics with symbolic representations to model cognitive +processes akin to the probabilistic language of thought (PLoT). Our model +segments the continuous representational space into discrete basins, with +attractor states corresponding to symbolic sequences, that reflect the +semanticity and compositionality characteristic of symbolic systems through +unsupervised learning, rather than relying on pre-defined primitives. Moreover, +like PLoT, our model learns to sample a diverse distribution of attractor +states that reflect the mutual information between the input data and the +symbolic encodings. This approach establishes a unified framework that +integrates both symbolic and sub-symbolic processing through neural dynamics, a +neuro-plausible substrate with proven expressivity in AI, offering a more +comprehensive model that mirrors the complex duality of cognitive operations. + +
+
+
+
+
+ + ♻ ☆ ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot + Coordination NeurIPS 2024 + + +
+ Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement +learning (MARL) challenge that aims to train an ego agent to work with diverse, +unseen partners during deployment. The significant difference between the +deployment-time partners' distribution and the training partners' distribution +determined by the training algorithm makes ZSC a unique out-of-distribution +(OOD) generalization challenge. The potential distribution gap between +evaluation and deployment-time partners leads to inadequate evaluation, which +is exacerbated by the lack of appropriate evaluation metrics. In this paper, we +present ZSC-Eval, the first evaluation toolkit and benchmark for ZSC +algorithms. ZSC-Eval consists of: 1) Generation of evaluation partner +candidates through behavior-preferring rewards to approximate deployment-time +partners' distribution; 2) Selection of evaluation partners by Best-Response +Diversity (BR-Div); 3) Measurement of generalization performance with various +evaluation partners via the Best-Response Proximity (BR-Prox) metric. We use +ZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football +environments and get novel empirical findings. We also conduct a human +experiment of current ZSC algorithms to verify the ZSC-Eval's consistency with +human evaluation. ZSC-Eval is now available at +https://github.com/sjtu-marl/ZSC-Eval. + +
+
+ comment: Accepted in NeurIPS 2024 Dataset and Benchmark Track +
+
+
+
+
+ + ♻ ☆ Empowering Agrifood System with Artificial Intelligence: A Survey of the + Progress, Challenges and Opportunities + + +
+ With the world population rapidly increasing, transforming our agrifood +systems to be more productive, efficient, safe, and sustainable is crucial to +mitigate potential food shortages. Recently, artificial intelligence (AI) +techniques such as deep learning (DL) have demonstrated their strong abilities +in various areas, including language, vision, remote sensing (RS), and agrifood +systems applications. However, the overall impact of AI on agrifood systems +remains unclear. In this paper, we thoroughly review how AI techniques can +transform agrifood systems and contribute to the modern agrifood industry. +Firstly, we summarize the data acquisition methods in agrifood systems, +including acquisition, storage, and processing techniques. Secondly, we present +a progress review of AI methods in agrifood systems, specifically in +agriculture, animal husbandry, and fishery, covering topics such as agrifood +classification, growth monitoring, yield prediction, and quality assessment. +Furthermore, we highlight potential challenges and promising research +opportunities for transforming modern agrifood systems with AI. We hope this +survey could offer an overall picture to newcomers in the field and serve as a +starting point for their further research. The project website is +https://github.com/Frenkie14/Agrifood-Survey. + +
+
+ comment: Accepted by ACM Computing Surveys +
+
+
+
+
+ + ♻ ☆ Investigating OCR-Sensitive Neurons to Improve Entity Recognition in + Historical Documents + + +
+ This paper investigates the presence of OCR-sensitive neurons within the +Transformer architecture and their influence on named entity recognition (NER) +performance on historical documents. By analysing neuron activation patterns in +response to clean and noisy text inputs, we identify and then neutralise +OCR-sensitive neurons to improve model performance. Based on two open access +large language models (Llama2 and Mistral), experiments demonstrate the +existence of OCR-sensitive regions and show improvements in NER performance on +historical newspapers and classical commentaries, highlighting the potential of +targeted neuron modulation to improve models' performance on noisy text. + +
+
+
+
+
+ + ♻ ☆ Scenario of Use Scheme: Threat Model Specification for Speaker Privacy + Protection in the Medical Domain + + +
+ Speech recordings are being more frequently used to detect and monitor +disease, leading to privacy concerns. Beyond cryptography, protection of speech +can be addressed by approaches, such as perturbation, disentanglement, and +re-synthesis, that eliminate sensitive information of the speaker, leaving the +information necessary for medical analysis purposes. In order for such privacy +protective approaches to be developed, clear and systematic specifications of +assumptions concerning medical settings and the needs of medical professionals +are necessary. In this paper, we propose a Scenario of Use Scheme that +incorporates an Attacker Model, which characterizes the adversary against whom +the speaker's privacy must be defended, and a Protector Model, which specifies +the defense. We discuss the connection of the scheme with previous work on +speech privacy. Finally, we present a concrete example of a specified Scenario +of Use and a set of experiments about protecting speaker data against gender +inference attacks while maintaining utility for Parkinson's detection. + +
+
+ comment: Accepted and published at SPSC Symposium 2024 4th Symposium on + Security and Privacy in Speech Communication. Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation CoRL 2024 + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ Explainable AI needs formal notions of explanation correctness + + +
+ The use of machine learning (ML) in critical domains such as medicine poses +risks and requires regulation. One requirement is that decisions of ML systems +in high-risk applications should be human-understandable. The field of +"explainable artificial intelligence" (XAI) seemingly addresses this need. +However, in its current form, XAI is unfit to provide quality control for ML; +it itself needs scrutiny. Popular XAI methods cannot reliably answer important +questions about ML models, their training data, or a given test input. We +recapitulate results demonstrating that popular XAI methods systematically +attribute importance to input features that are independent of the prediction +target. This limits their utility for purposes such as model and data +(in)validation, model improvement, and scientific discovery. We argue that the +fundamental reason for this limitation is that current XAI methods do not +address well-defined problems and are not evaluated against objective criteria +of explanation correctness. Researchers should formally define the problems +they intend to solve first and then design methods accordingly. This will lead +to notions of explanation correctness that can be theoretically verified and +objective metrics of explanation performance that can be assessed using +ground-truth data. + +
+
+
+
+
+ + ♻ ☆ Learning to Receive Help: Intervention-Aware Concept Embedding Models NeurIPS 2023 + + +
+ Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures +by constructing and explaining their predictions using a set of high-level +concepts. A special property of these models is that they permit concept +interventions, wherein users can correct mispredicted concepts and thus improve +the model's performance. Recent work, however, has shown that intervention +efficacy can be highly dependent on the order in which concepts are intervened +on and on the model's architecture and training hyperparameters. We argue that +this is rooted in a CBM's lack of train-time incentives for the model to be +appropriately receptive to concept interventions. To address this, we propose +Intervention-aware Concept Embedding models (IntCEMs), a novel CBM-based +architecture and training paradigm that improves a model's receptiveness to +test-time interventions. Our model learns a concept intervention policy in an +end-to-end fashion from where it can sample meaningful intervention +trajectories at train-time. This conditions IntCEMs to effectively select and +receive concept interventions when deployed at test-time. Our experiments show +that IntCEMs significantly outperform state-of-the-art concept-interpretable +models when provided with test-time concept interventions, demonstrating the +effectiveness of our approach. + +
+
+ comment: Accepted as a spotlight at the Thirty-seventh Conference on Neural + Information Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ♻ ☆ EfficientRAG: Efficient Retriever for Multi-Hop Question Answering + + +
+ Retrieval-augmented generation (RAG) methods encounter difficulties when +addressing complex questions like multi-hop queries. While iterative retrieval +methods improve performance by gathering additional information, current +approaches often rely on multiple calls of large language models (LLMs). In +this paper, we introduce EfficientRAG, an efficient retriever for multi-hop +question answering. EfficientRAG iteratively generates new queries without the +need for LLM calls at each iteration and filters out irrelevant information. +Experimental results demonstrate that EfficientRAG surpasses existing RAG +methods on three open-domain multi-hop question-answering datasets. + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Unsupervisedly Learned Representations: Should the Quest be Over? + + +
+ After four decades of research there still exists a Classification accuracy +gap of about 20% between our best Unsupervisedly Learned Representations +methods and the accuracy rates achieved by intelligent animals. It thus may +well be that we are looking in the wrong direction. A possible solution to this +puzzle is presented. We demonstrate that Reinforcement Learning can learn +representations which achieve the same accuracy as that of animals. Our main +modest contribution lies in the observations that: a. when applied to a real +world environment Reinforcement Learning does not require labels, and thus may +be legitimately considered as Unsupervised Learning, and b. in contrast, when +Reinforcement Learning is applied in a simulated environment it does inherently +require labels and should thus be generally be considered as Supervised +Learning. The corollary of these observations is that further search for +Unsupervised Learning competitive paradigms which may be trained in simulated +environments may be futile. + +
+
+ comment: To be published at The 6th International Conference on Machine + Learning, Optimization and Data Science - LOD 2020 +
+
+
+
+
+ + ♻ ☆ Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models + + +
+ Text-to-image diffusion models have demonstrated unprecedented capabilities +for flexible and realistic image synthesis. Nevertheless, these models rely on +a time-consuming sampling procedure, which has motivated attempts to reduce +their latency. When improving efficiency, researchers often use the original +diffusion model to train an additional network designed specifically for fast +image generation. In contrast, our approach seeks to reduce latency directly, +without any retraining, fine-tuning, or knowledge distillation. In particular, +we find the repeated calculation of attention maps to be costly yet redundant, +and instead suggest reusing them during sampling. Our specific reuse strategies +are based on ODE theory, which implies that the later a map is reused, the +smaller the distortion in the final image. We empirically compare these reuse +strategies with few-step sampling procedures of comparable latency, finding +that reuse generates images that are closer to those produced by the original +high-latency diffusion model. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient + Language Model Inference EMNLP 2024 + + +
+ The development of state-of-the-art generative large language models (LLMs) +disproportionately relies on English-centric tokenizers, vocabulary and +pre-training data. Despite the fact that some LLMs have multilingual +capabilities, recent studies have shown that their inference efficiency +deteriorates when generating text in languages other than English. This results +in increased inference time and costs. Cross-lingual vocabulary adaptation +(CVA) methods have been proposed for adapting models to a target language +aiming to improve downstream performance. However, the effectiveness of these +methods on increasing inference efficiency of generative LLMs has yet to be +explored. In this paper, we perform an empirical study of five CVA methods on +four generative LLMs (including monolingual and multilingual models) across +four typologically-diverse languages and four natural language understanding +tasks. We find that CVA substantially contributes to LLM inference speedups of +up to 271.5\%. We also show that adapting LLMs that have been pre-trained on +more balanced multilingual data results in downstream performance comparable to +the original models. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Abstraction-of-Thought Makes Language Models Better Reasoners EMNLP 2024 + + +
+ Abstract reasoning, the ability to reason from the abstract essence of a +problem, serves as a key to generalization in human reasoning. However, +eliciting language models to perform reasoning with abstraction remains +unexplored. This paper seeks to bridge this gap by introducing a novel +structured reasoning format called Abstraction-of-Thought (AoT). The uniqueness +of AoT lies in its explicit requirement for varying levels of abstraction +within the reasoning process. This approach could elicit language models to +first contemplate on the abstract level before incorporating concrete details, +which is overlooked by the prevailing step-by-step Chain-of-Thought (CoT) +method. To align models with the AoT format, we present AoT Collection, a +generic finetuning dataset consisting of 348k high-quality samples with AoT +reasoning processes, collected via an automated and scalable pipeline. We +finetune a wide range of language models with AoT Collection and conduct +extensive evaluations on 23 unseen tasks from the challenging benchmark +Big-Bench Hard. Experimental results indicate that models aligned to AoT +reasoning format substantially outperform those aligned to CoT in many +reasoning tasks. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ General-purpose Clothes Manipulation with Semantic Keypoints + + +
+ Clothes manipulation is a critical skill for household robots. Recent +advancements have been made in task-specific clothes manipulation, such as +folding, flattening, and hanging. However, due to clothes' complex geometries +and deformability, creating a general-purpose robot system that can manipulate +a diverse range of clothes in many ways remains challenging. Since clothes are +typically designed with specific structures, we propose identifying these +specific features like ``left sleeve'' as semantic keypoints. Semantic +keypoints can provide semantic cues for task planning and geometric cues for +low-level action generation. With this insight, we develop a hierarchical +learning framework using the large language model (LLM) for general-purpose +CLothes mAnipulation with Semantic keyPoints (CLASP). Extensive simulation +experiments show that CLASP outperforms baseline methods on both seen and +unseen tasks across various clothes manipulation tasks. Real-world experiments +show that CLASP can be directly deployed in the real world and applied to a +wide variety of clothes. + +
+
+
+
+
+ + ♻ ☆ A Concept-Value Network as a Brain Model + + +
+ This paper suggests a statistical framework for describing the relations +between the physical and conceptual entities of a brain-like model. Features +and concept instances are put into context, where the paper suggests that +features may be the electrical wiring, although chemical connections are also +possible. With this idea, the actual length of the connection is important, +because it is related to firing rates and neuron synchronization, but the +signal type is less important. The paper then suggests that concepts are neuron +groups that link feature sets and concept instances are determined by chemical +signals from those groups. Therefore, features become the static horizontal +framework of the neural system and concepts are vertically interconnected +combinations of these. With regards to functionality, the neuron is then +considered to be functional and the more horizontal memory structures can even +be glial. This would also suggest that features can be distributed entities and +not concentrated to a single area. Another aspect could be signal 'breaks' that +compartmentalise a pattern and may help with neural binding. + +
+
+
+
+
+ + ♻ ☆ Augmented neural forms with parametric boundary-matching operators for + solving ordinary differential equations + + +
+ Approximating solutions of ordinary and partial differential equations +constitutes a significant challenge. Based on functional expressions that +inherently depend on neural networks, neural forms are specifically designed to +precisely satisfy the prescribed initial or boundary conditions of the problem, +while providing the approximate solutions in closed form. Departing from the +important class of ordinary differential equations, the present work aims to +refine and validate the neural forms methodology, paving the ground for further +developments in more challenging fields. The main contributions are as follows. +First, it introduces a formalism for systematically crafting proper neural +forms with adaptable boundary matches that are amenable to optimization. +Second, it describes a novel technique for converting problems with Neumann or +Robin conditions into equivalent problems with parametric Dirichlet conditions. +Third, it outlines a method for determining an upper bound on the absolute +deviation from the exact solution. The proposed augmented neural forms approach +was tested on a set of diverse problems, encompassing first- and second-order +ordinary differential equations, as well as first-order systems. Stiff +differential equations have been considered as well. The resulting solutions +were subjected to assessment against existing exact solutions, solutions +derived through the common penalized neural method, and solutions obtained via +contemporary numerical analysis methods. The reported results demonstrate that +the augmented neural forms not only satisfy the boundary and initial conditions +exactly, but also provide closed-form solutions that facilitate high-quality +interpolation and controllable overall precision. These attributes are +essential for expanding the application field of neural forms to more +challenging problems that are described by partial differential equations. + +
+
+
+
+
+ + ♻ ☆ SR-CurvANN: Advancing 3D Surface Reconstruction through Curvature-Aware + Neural Networks + + +
+ Incomplete or missing data in three-dimensional (3D) models can lead to +erroneous or flawed renderings, limiting their usefulness in applications such +as visualization, geometric computation, and 3D printing. Conventional +surface-repair techniques often fail to infer complex geometric details in +missing areas. Neural networks successfully address hole-filling tasks in 2D +images using inpainting techniques. The combination of surface reconstruction +algorithms, guided by the model's curvature properties and the creativity of +neural networks in the inpainting processes should provide realistic results in +the hole completion task. In this paper, we propose a novel method entitled +SR-CurvANN (Surface Reconstruction Based on Curvature-Aware Neural Networks) +that incorporates neural network-based 2D inpainting to effectively reconstruct +3D surfaces. We train the neural networks with images that represent planar +representations of the curvature at vertices of hundreds of 3D models. Once the +missing areas have been inferred, a coarse-to-fine surface deformation process +ensures that the surface fits the reconstructed curvature image. Our proposal +makes it possible to learn and generalize patterns from a wide variety of +training 3D models, generating comprehensive inpainted curvature images and +surfaces. Experiments conducted on 959 models with several holes have +demonstrated that SR-CurvANN excels in the shape completion process, filling +holes with a remarkable level of realism and precision. + +
+
+ comment: Major changes in title, paper structure, text and figures. Improved + results. 23 pages, 14 figures. Decision about submission not taken yet +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ In-Context Ensemble Improves Video-Language Models for Low-Level + Workflow Understanding from Human Demonstrations + + +
+ A Standard Operating Procedure (SOP) defines a low-level, step-by-step +written guide for a business software workflow based on a video demonstration. +SOPs are a crucial step toward automating end-to-end software workflows. +Manually creating SOPs can be time-consuming. Recent advancements in large +video-language models offer the potential for automating SOP generation by +analyzing recordings of human demonstrations. However, current large +video-language models face challenges with zero-shot SOP generation. We explore +in-context learning with video-language models for SOP generation. We report +that in-context learning sometimes helps video-language models at SOP +generation. We then propose an in-context ensemble learning to further enhance +the capabilities of the models in SOP generation. + +
+
+ comment: multimodal in-context ensemble learning, video-language models, SOP + generation, pseudo-labels, in-context learning, prompt engineering +
+
+
+
+
+ + ♻ ☆ Leveraging summary of radiology reports with transformers + + +
+ Two fundamental problems in health-care stem from patient handoff and triage. +Doctors are often required to perform complex findings summarization to +facilitate efficient communication with specialists and decision making on the +urgency of each case. To address these challenges, we present a state of the +art radiology report summarization model utilizing adjusted bidirectional +encoder representation from transformers BERTtoBERT encoder and decoder +architecture. We also provide a data processing pipeline for future models +developed on the the MIMIC CXR dataset. Our approach includes a novel method +for augmenting medical data and a comprehensive performance analysis. Our best +performing model achieved a recall oriented understudy for gisting evaluation L +F1 score of 58.75/100, outperforming specialized checkpoints with more +sophisticated attention mechanisms. We also provide a data processing pipeline +for future models developed on the MIMIC chest X-ray dataset. The model +introduced in this paper demonstrates significantly improved capacity in +radiology report summarization, highlighting the potential for ensuring better +clinical workflows and enhanced patient care. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable + Tensor Collections + + +
+ Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining +data, model, and pipeline parallelism, to use large GPU clusters efficiently. +Long-running jobs may experience changes to their GPU allocation: (i) resource +elasticity during training adds or removes GPUs; (ii) hardware maintenance may +require redeployment on different GPUs; and (iii) GPU failures force jobs to +run with fewer devices. Current DL frameworks tie jobs to a set of GPUs and +thus lack support for these scenarios. In particular, they cannot change the +multi-dimensional parallelism of an already-running job in an efficient and +model-independent way. + We describe Scalai, a state management library for DL systems that enables +jobs to change their parallelism dynamically after the GPU allocation is +updated at runtime. Scalai achieves this through a new abstraction, a +parallelizable tensor collection (PTC), that externalizes the job state during +training. After a GPU change, Scalai uses the PTC to transform the job state: +the PTC repartitions the dataset state under data parallelism and exposes it to +DL workers through a virtual file system; and the PTC obtains the model state +as partitioned checkpoints and transforms them to reflect the new +parallelization configuration. For efficiency, Scalai executes PTC +transformations in parallel with minimum data movement between workers. Our +experiments show that Scalai enables DL jobs to support dynamic parallelization +with low overhead. + +
+
+ comment: The 30th Symposium on Operating Systems Principles (SOSP24) +
+
+
+
+
+ + ♻ ☆ IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided + Feature Extraction + + +
+ Disruption prediction has made rapid progress in recent years, especially in +machine learning (ML)-based methods. Understanding why a predictor makes a +certain prediction can be as crucial as the prediction's accuracy for future +tokamak disruption predictors. The purpose of most disruption predictors is +accuracy or cross-machine capability. However, if a disruption prediction model +can be interpreted, it can tell why certain samples are classified as +disruption precursors. This allows us to tell the types of incoming disruption +and gives us insight into the mechanism of disruption. This paper designs a +disruption predictor called Interpretable Disruption Predictor based On +Physics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction +performance of the model is effectively improved by extracting physics-guided +features. A high-performance model is required to ensure the validity of the +interpretation results. The interpretability study of IDP-PGFE provides an +understanding of J-TEXT disruption and is generally consistent with existing +comprehension of disruption. IDP-PGFE has been applied to the disruption due to +continuously increasing density towards density limit experiments on J-TEXT. +The time evolution of the PGFE features contribution demonstrates that the +application of ECRH triggers radiation-caused disruption, which lowers the +density at disruption. While the application of RMP indeed raises the density +limit in J-TEXT. The interpretability study guides intuition on the physical +mechanisms of density limit disruption that RMPs affect not only the MHD +instabilities but also the radiation profile, which delays density limit +disruption. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ AI-enhanced Collective Intelligence + + +
+ Current societal challenges exceed the capacity of humans operating either +alone or collectively. As AI evolves, its role within human collectives will +vary from an assistive tool to a participatory member. Humans and AI possess +complementary capabilities that, together, can surpass the collective +intelligence of either humans or AI in isolation. However, the interactions in +human-AI systems are inherently complex, involving intricate processes and +interdependencies. This review incorporates perspectives from complex network +science to conceptualize a multilayer representation of human-AI collective +intelligence, comprising cognition, physical, and information layers. Within +this multilayer network, humans and AI agents exhibit varying characteristics; +humans differ in diversity from surface-level to deep-level attributes, while +AI agents range in degrees of functionality and anthropomorphism. We explore +how agents' diversity and interactions influence the system's collective +intelligence and analyze real-world instances of AI-enhanced collective +intelligence. We conclude by considering potential challenges and future +developments in this field. + +
+
+ comment: 43 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous + Federated Learning Framework + + +
+ Traditional federated learning (FL) frameworks rely heavily on terrestrial +networks, where coverage limitations and increasing bandwidth congestion +significantly hinder model convergence. Fortunately, the advancement of +low-Earth orbit (LEO) satellite networks offers promising new communication +avenues to augment traditional terrestrial FL. Despite this potential, the +limited satellite-ground communication bandwidth and the heterogeneous +operating environments of ground devices-including variations in data, +bandwidth, and computing power-pose substantial challenges for effective and +robust satellite-assisted FL. To address these challenges, we propose SatFed, a +resource-efficient satellite-assisted heterogeneous FL framework. SatFed +implements freshness-based model prioritization queues to optimize the use of +highly constrained satellite-ground bandwidth, ensuring the transmission of the +most critical models. Additionally, a multigraph is constructed to capture +real-time heterogeneous relationships between devices, including data +distribution, terrestrial bandwidth, and computing capability. This multigraph +enables SatFed to aggregate satellite-transmitted models into peer guidance, +enhancing local training in heterogeneous environments. Extensive experiments +with real-world LEO satellite networks demonstrate that SatFed achieves +superior performance and robustness compared to state-of-the-art benchmarks. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ AutoScraper: A Progressive Understanding Web Agent for Web Scraper + Generation EMNLP 2024 + + +
+ Web scraping is a powerful technique that extracts data from websites, +enabling automated data collection, enhancing data analysis capabilities, and +minimizing manual data entry efforts. Existing methods, wrappers-based methods +suffer from limited adaptability and scalability when faced with a new website, +while language agents, empowered by large language models (LLMs), exhibit poor +reusability in diverse web environments. In this work, we introduce the +paradigm of generating web scrapers with LLMs and propose AutoScraper, a +two-stage framework that can handle diverse and changing web environments more +efficiently. AutoScraper leverages the hierarchical structure of HTML and +similarity across different web pages for generating web scrapers. Besides, we +propose a new executability metric for better measuring the performance of web +scraper generation tasks. We conduct comprehensive experiments with multiple +LLMs and demonstrate the effectiveness of our framework. Resources of this +paper can be found at \url{https://github.com/EZ-hwh/AutoScraper} + +
+
+ comment: 19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ VARADE: a Variational-based AutoRegressive model for Anomaly Detection + on the Edge + + +
+ Detecting complex anomalies on massive amounts of data is a crucial task in +Industry 4.0, best addressed by deep learning. However, available solutions are +computationally demanding, requiring cloud architectures prone to latency and +bandwidth issues. This work presents VARADE, a novel solution implementing a +light autoregressive framework based on variational inference, which is best +suited for real-time execution on the edge. The proposed approach was validated +on a robotic arm, part of a pilot production line, and compared with several +state-of-the-art algorithms, obtaining the best trade-off between anomaly +detection accuracy, power consumption and inference frequency on two different +edge platforms. + +
+
+
+
+
+ + ♻ ☆ Fixed-length Dense Descriptor for Efficient Fingerprint Matching + + +
+ In fingerprint matching, fixed-length descriptors generally offer greater +efficiency compared to minutiae set, but the recognition accuracy is not as +good as that of the latter. Although much progress has been made in deep +learning based fixed-length descriptors recently, they often fall short when +dealing with incomplete or partial fingerprints, diverse fingerprint poses, and +significant background noise. In this paper, we propose a three-dimensional +representation called Fixed-length Dense Descriptor (FDD) for efficient +fingerprint matching. FDD features great spatial properties, enabling it to +capture the spatial relationships of the original fingerprints, thereby +enhancing interpretability and robustness. Our experiments on various +fingerprint datasets reveal that FDD outperforms other fixed-length +descriptors, especially in matching fingerprints of different areas, +cross-modal fingerprint matching, and fingerprint matching with background +noise. + +
+
+ comment: Accepted by WIFS 2024 +
+
+
+
+
+ + ♻ ☆ Time and State Dependent Neural Delay Differential Equations + + +
+ Discontinuities and delayed terms are encountered in the governing equations +of a large class of problems ranging from physics and engineering to medicine +and economics. These systems cannot be properly modelled and simulated with +standard Ordinary Differential Equations (ODE), or data-driven approximations +such as Neural Ordinary Differential Equations (NODE). To circumvent this +issue, latent variables are typically introduced to solve the dynamics of the +system in a higher dimensional space and obtain the solution as a projection to +the original space. However, this solution lacks physical interpretability. In +contrast, Delay Differential Equations (DDEs), and their data-driven +approximated counterparts, naturally appear as good candidates to characterize +such systems. In this work we revisit the recently proposed Neural DDE by +introducing Neural State-Dependent DDE (SDDDE), a general and flexible +framework that can model multiple and state- and time-dependent delays. We show +that our method is competitive and outperforms other continuous-class models on +a wide variety of delayed dynamical systems. Code is available at the +repository +\href{https://github.com/thibmonsel/Time-and-State-Dependent-Neural-Delay-Differential-Equations}{here}. + +
+
+
+
+
+ + ♻ ☆ Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs + + +
+ Two lines of approaches are adopted for complex reasoning with LLMs. One line +of work prompts LLMs with various reasoning structures, while the structural +outputs can be naturally regarded as intermediate reasoning steps. Another line +of work adopt LLM-free declarative solvers to do the reasoning task, rendering +higher reasoning accuracy but lacking interpretability due to the black-box +nature of the solvers. Aiming to resolve the trade-off between answer accuracy +and interpretability, we present a simple extension to the latter line of work. +Specifically, we showcase that the intermediate search logs generated by Prolog +interpreters can be accessed and interpreted into human-readable reasoning +proofs. As long as LLMs correctly translate problem descriptions into Prolog +representations, the corresponding reasoning proofs are ensured to be causal +and reliable. On two logical reasoning and one arithmetic reasoning datasets, +our framework obtains significant improvements in terms of both answer accuracy +and reasoning proof accuracy. Our code is released at +https://github.com/DAMO-NLP-SG/CaRing + +
+
+
+
+
+ + ♻ ☆ SeCoKD: Aligning Large Language Models for In-Context Learning with + Fewer Shots + + +
+ Previous studies have shown that demonstrations can significantly help Large +Language Models (LLMs ) perform better on the given tasks. However, this +so-called In-Context Learning ( ICL ) ability is very sensitive to the +presenting context, and often dozens of demonstrations are needed. In this +work, we investigate if we can reduce the shot number while still maintaining a +competitive performance. We present SeCoKD, a self-Knowledge Distillation ( KD +) training framework that aligns the student model with a heavily prompted +variation, thereby increasing the utilization of a single demonstration. We +experiment with the SeCoKD across three LLMs and six benchmarks focusing mainly +on reasoning tasks. Results show that our method outperforms the base model and +Supervised Fine-tuning ( SFT ), especially in zero-shot and one-shot settings +by 30% and 10%, respectively. Moreover, SeCoKD brings little negative artifacts +when evaluated on new tasks, which is more robust than Supervised Fine-tuning. + +
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Hybrid Spiking Neural Networks for Low-Power Intra-Cortical + Brain-Machine Interfaces + + +
+ Intra-cortical brain-machine interfaces (iBMIs) have the potential to +dramatically improve the lives of people with paraplegia by restoring their +ability to perform daily activities. However, current iBMIs suffer from +scalability and mobility limitations due to bulky hardware and wiring. Wireless +iBMIs offer a solution but are constrained by a limited data rate. To overcome +this challenge, we are investigating hybrid spiking neural networks for +embedded neural decoding in wireless iBMIs. The networks consist of a temporal +convolution-based compression followed by recurrent processing and a final +interpolation back to the original sequence length. As recurrent units, we +explore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons, +and a combination of both - spiking GRUs (sGRUs) and analyze their differences +in terms of accuracy, footprint, and activation sparsity. To that end, we train +decoders on the "Nonhuman Primate Reaching with Multichannel Sensorimotor +Cortex Electrophysiology" dataset and evaluate it using the NeuroBench +framework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural +Decoding. Our approach achieves high accuracy in predicting velocities of +primate reaching movements from multichannel primary motor cortex recordings +while maintaining a low number of synaptic operations, surpassing the current +baseline models in the NeuroBench framework. This work highlights the potential +of hybrid neural networks to facilitate wireless iBMIs with high decoding +precision and a substantial increase in the number of monitored neurons, paving +the way toward more advanced neuroprosthetic technologies. + +
+
+ comment: This work has been accepted at the 2024 IEEE Biomedical Circuits and + Systems Conference +
+
+
+
+
+ + ♻ ☆ Unused information in token probability distribution of generative LLM: + improving LLM reading comprehension through calculation of expected values + + +
+ LLM text decoding is key component for perceived LLM quality. We demonstrate +two experiments showing that decoding methods could be improved by manipulation +of token probabilities. First, we test few LLM on SummEval summary scoring +dataset, to measure reading comprehension. We compare scores from greedy +decoding to expected values over the next token distribution. We scale logits +by large temperature to increase the entropy of scores. This allows strong +improvement of performance on SummEval (in terms of correlations to human +judgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from +20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part +of the gain seems related to positional bias. Secondly, we use +probability-based tree sampling algorithm, to examine all most probable +generations for given prompt. + +
+
+ comment: 7 pages, 1 figure, presented at FEDCSIS 2024 conference, +
+
+
+
+
+ + ♻ ☆ ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor + Reconstruction + + +
+ Neural implicit reconstruction via volume rendering has demonstrated its +effectiveness in recovering dense 3D surfaces. However, it is non-trivial to +simultaneously recover meticulous geometry and preserve smoothness across +regions with differing characteristics. To address this issue, previous methods +typically employ geometric priors, which are often constrained by the +performance of the prior models. In this paper, we propose ND-SDF, which learns +a Normal Deflection field to represent the angular deviation between the scene +normal and the prior normal. Unlike previous methods that uniformly apply +geometric priors on all samples, introducing significant bias in accuracy, our +proposed normal deflection field dynamically learns and adapts the utilization +of samples based on their specific characteristics, thereby improving both the +accuracy and effectiveness of the model. Our method not only obtains smooth +weakly textured regions such as walls and floors but also preserves the +geometric details of complex structures. In addition, we introduce a novel ray +sampling strategy based on the deflection angle to facilitate the unbiased +rendering process, which significantly improves the quality and accuracy of +intricate surfaces, especially on thin structures. Consistent improvements on +various challenging datasets demonstrate the superiority of our method. + +
+
+
+
+
+ + ♻ ☆ INT-FlashAttention: Enabling Flash Attention for INT8 Quantization + + +
+ As the foundation of large language models (LLMs), self-attention module +faces the challenge of quadratic time and memory complexity with respect to +sequence length. FlashAttention accelerates attention computation and reduces +its memory usage by leveraging the GPU memory hierarchy. A promising research +direction is to integrate FlashAttention with quantization methods. This paper +introduces INT-FlashAttention, the first INT8 quantization architecture +compatible with the forward workflow of FlashAttention, which significantly +improves the inference speed of FlashAttention on Ampere GPUs. We implement our +INT-FlashAttention prototype with fully INT8 activations and general +matrix-multiplication (GEMM) kernels, making it the first attention operator +with fully INT8 input. As a general token-level post-training quantization +framework, INT-FlashAttention is also compatible with other data formats like +INT4, etc. Experimental results show INT-FlashAttention achieves 72% faster +inference speed and 82% smaller quantization error compared to standard +FlashAttention with FP16 and FP8 data format. + +
+
+
+
+
+ + ♻ ☆ SliceIt! -- A Dual Simulator Framework for Learning Robot Food Slicing ICRA 2024 + + +
+ Cooking robots can enhance the home experience by reducing the burden of +daily chores. However, these robots must perform their tasks dexterously and +safely in shared human environments, especially when handling dangerous tools +such as kitchen knives. This study focuses on enabling a robot to autonomously +and safely learn food-cutting tasks. More specifically, our goal is to enable a +collaborative robot or industrial robot arm to perform food-slicing tasks by +adapting to varying material properties using compliance control. Our approach +involves using Reinforcement Learning (RL) to train a robot to compliantly +manipulate a knife, by reducing the contact forces exerted by the food items +and by the cutting board. However, training the robot in the real world can be +inefficient, and dangerous, and result in a lot of food waste. Therefore, we +proposed SliceIt!, a framework for safely and efficiently learning robot +food-slicing tasks in simulation. Following a real2sim2real approach, our +framework consists of collecting a few real food slicing data, calibrating our +dual simulation environment (a high-fidelity cutting simulator and a robotic +simulator), learning compliant control policies on the calibrated simulation +environment, and finally, deploying the policies on the real robot. + +
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Learning Variable Compliance Control From a Few Demonstrations for + Bimanual Robot with Haptic Feedback Teleoperation System IROS 2024 + + +
+ Automating dexterous, contact-rich manipulation tasks using rigid robots is a +significant challenge in robotics. Rigid robots, defined by their actuation +through position commands, face issues of excessive contact forces due to their +inability to adapt to contact with the environment, potentially causing damage. +While compliance control schemes have been introduced to mitigate these issues +by controlling forces via external sensors, they are hampered by the need for +fine-tuning task-specific controller parameters. Learning from Demonstrations +(LfD) offers an intuitive alternative, allowing robots to learn manipulations +through observed actions. In this work, we introduce a novel system to enhance +the teaching of dexterous, contact-rich manipulations to rigid robots. Our +system is twofold: firstly, it incorporates a teleoperation interface utilizing +Virtual Reality (VR) controllers, designed to provide an intuitive and +cost-effective method for task demonstration with haptic feedback. Secondly, we +present Comp-ACT (Compliance Control via Action Chunking with Transformers), a +method that leverages the demonstrations to learn variable compliance control +from a few demonstrations. Our methods have been validated across various +complex contact-rich manipulation tasks using single-arm and bimanual robot +setups in simulated and real-world environments, demonstrating the +effectiveness of our system in teaching robots dexterous manipulations with +enhanced adaptability and safety. Code available at: +https://github.com/omron-sinicx/CompACT + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising NeurIPS 2024 + + +
+ Diffusion models have garnered significant interest from the community for +their great generative ability across various applications. However, their +typical multi-step sequential-denoising nature gives rise to high cumulative +latency, thereby precluding the possibilities of parallel computation. To +address this, we introduce AsyncDiff, a universal and plug-and-play +acceleration scheme that enables model parallelism across multiple devices. Our +approach divides the cumbersome noise prediction model into multiple +components, assigning each to a different device. To break the dependency chain +between these components, it transforms the conventional sequential denoising +into an asynchronous process by exploiting the high similarity between hidden +states in consecutive diffusion steps. Consequently, each component is +facilitated to compute in parallel on separate devices. The proposed strategy +significantly reduces inference latency while minimally impacting the +generative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff +achieves a 2.7x speedup with negligible degradation and a 4.0x speedup with +only a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our +experiments also demonstrate that AsyncDiff can be readily applied to video +diffusion models with encouraging performances. The code is available at +https://github.com/czg1225/AsyncDiff. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through + Semantic Comprehension in Retrieval-Augmented Generation Scenarios + + +
+ In Retrieval-Augmented Generation (RAG) tasks using Large Language Models +(LLMs), the quality of retrieved information is critical to the final output. +This paper introduces the IRSC benchmark for evaluating the performance of +embedding models in multilingual RAG tasks. The benchmark encompasses five +retrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval, +keyword retrieval, and summary retrieval. Our research addresses the current +lack of comprehensive testing and effective comparison methods for embedding +models in RAG scenarios. We introduced new metrics: the Similarity of Semantic +Comprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI), +and evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our +contributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and +3) insights into the cross-lingual limitations of embedding models. The IRSC +benchmark aims to enhance the understanding and development of accurate +retrieval systems in RAG tasks. All code and datasets are available at: +https://github.com/Jasaxion/IRSC_Benchmark + +
+
+
+
+
+ + ♻ ☆ Image Denoising with Machine Learning: A Novel Approach to Improve + Quantum Image Processing Quality and Reliability + + +
+ Quantum Image Processing (QIP) is a field that aims to utilize the benefits +of quantum computing for manipulating and analyzing images. However, QIP faces +two challenges: the limitation of qubits and the presence of noise in a quantum +machine. In this research, we propose a novel approach to address the issue of +noise in QIP. By training and employing a machine learning model that +identifies and corrects the noise in quantum-processed images, we can +compensate for the noisiness caused by the machine and retrieve a processing +result similar to that performed by a classical computer with higher +efficiency. The model is trained by learning a dataset consisting of both +existing processed images and quantum-processed images from open-access +datasets. This model will be capable of providing us with the confidence level +for each pixel and its potential original value. To assess the model's accuracy +in compensating for loss and decoherence in QIP, we evaluate it using three +metrics: Peak Signal to Noise Ratio (PSNR), Structural Similarity Index (SSIM), +and Mean Opinion Score (MOS). Additionally, we discuss the applicability of our +model across domains well as its cost effectiveness compared to alternative +methods. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Serving Deep Learning Model in Relational Databases + + +
+ Serving deep learning (DL) models on relational data has become a critical +requirement across diverse commercial and scientific domains, sparking growing +interest recently. In this visionary paper, we embark on a comprehensive +exploration of representative architectures to address the requirement. We +highlight three pivotal paradigms: The state-of-the-art DL-centric architecture +offloads DL computations to dedicated DL frameworks. The potential UDF-centric +architecture encapsulates one or more tensor computations into User Defined +Functions (UDFs) within the relational database management system (RDBMS). The +potential relation-centric architecture aims to represent a large-scale tensor +computation through relational operators. While each of these architectures +demonstrates promise in specific use scenarios, we identify urgent requirements +for seamless integration of these architectures and the middle ground +in-between these architectures. We delve into the gaps that impede the +integration and explore innovative strategies to close them. We present a +pathway to establish a novel RDBMS for enabling a broad class of data-intensive +DL inference applications. + +
+
+ comment: * Authors are ordered alphabetically; Jia Zou is the corresponding + author +
+
+
+
+
+ + ♻ ☆ RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective + Weight-Activation Quantization EMNLP 2024 + + +
+ Low-Rank Adaptation (LoRA), as a representative Parameter-Efficient +Fine-Tuning (PEFT)method, significantly enhances the training efficiency by +updating only a small portion of the weights in Large Language Models (LLMs). +Recently, weight-only quantization techniques have also been applied to LoRA +methods to reduce the memory footprint of fine-tuning. However, applying +weight-activation quantization to the LoRA pipeline is under-explored, and we +observe substantial performance degradation primarily due to the presence of +activation outliers. In this work, we propose RoLoRA, the first LoRA-based +scheme for effective weight-activation quantization. RoLoRA utilizes rotation +for outlier elimination and proposes rotation-aware fine-tuning to preserve the +outlier-free characteristics in rotated LLMs. Experimental results show RoLoRA +consistently improves low-bit LoRA convergence and post-training quantization +robustness in weight-activation settings. We evaluate RoLoRA across +LLaMA2-7B/13B, LLaMA3-8B models, achieving up to 29.5% absolute accuracy gain +of 4-bit weight-activation quantized LLaMA2- 13B on commonsense reasoning tasks +compared to LoRA baseline. We further demonstrate its effectiveness on Large +Multimodal Models (LLaVA-1.5-7B). Codes are available at +https://github.com/HuangOwen/RoLoRA + +
+
+ comment: EMNLP 2024 Findings, Codes: https://github.com/HuangOwen/RoLoRA, + Models: + https://huggingface.co/collections/ScarletAce/rolora-66f5f228a90681c7c4512b28 +
+
+
+
+
+ + ♻ ☆ Fast ODE-based Sampling for Diffusion Models in Around 5 Steps CVPR 2024 + + +
+ Sampling from diffusion models can be treated as solving the corresponding +ordinary differential equations (ODEs), with the aim of obtaining an accurate +solution with as few number of function evaluations (NFE) as possible. +Recently, various fast samplers utilizing higher-order ODE solvers have emerged +and achieved better performance than the initial first-order one. However, +these numerical methods inherently result in certain approximation errors, +which significantly degrades sample quality with extremely small NFE (e.g., +around 5). In contrast, based on the geometric observation that each sampling +trajectory almost lies in a two-dimensional subspace embedded in the ambient +space, we propose Approximate MEan-Direction Solver (AMED-Solver) that +eliminates truncation errors by directly learning the mean direction for fast +diffusion sampling. Besides, our method can be easily used as a plugin to +further improve existing ODE-based samplers. Extensive experiments on image +synthesis with the resolution ranging from 32 to 512 demonstrate the +effectiveness of our method. With only 5 NFE, we achieve 6.61 FID on CIFAR-10, +10.74 FID on ImageNet 64$\times$64, and 13.20 FID on LSUN Bedroom. Our code is +available at https://github.com/zju-pi/diff-sampler. + +
+
+ comment: Accepted by CVPR 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ LEMMA-RCA: A Large Multi-modal Multi-domain Dataset for Root Cause + Analysis + + +
+ Root cause analysis (RCA) is crucial for enhancing the reliability and +performance of complex systems. However, progress in this field has been +hindered by the lack of large-scale, open-source datasets tailored for RCA. To +bridge this gap, we introduce LEMMA-RCA, a large dataset designed for diverse +RCA tasks across multiple domains and modalities. LEMMA-RCA features various +real-world fault scenarios from IT and OT operation systems, encompassing +microservices, water distribution, and water treatment systems, with hundreds +of system entities involved. We evaluate the quality of LEMMA-RCA by testing +the performance of eight baseline methods on this dataset under various +settings, including offline and online modes as well as single and multiple +modalities. Our experimental results demonstrate the high quality of LEMMA-RCA. +The dataset is publicly available at https://lemma-rca.github.io/. + +
+
+
+
+
+ + ♻ Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence + + +
+ We present Eagle (RWKV-5) and Finch (RWKV-6), sequence models improving upon +the RWKV (RWKV-4) architecture. Our architectural design advancements include +multi-headed matrix-valued states and a dynamic recurrence mechanism that +improve expressivity while maintaining the inference efficiency characteristics +of RNNs. We introduce a new multilingual corpus with 1.12 trillion tokens and a +fast tokenizer based on greedy matching for enhanced multilinguality. We +trained four Eagle models, ranging from 0.46 to 7.5 billion parameters, and two +Finch models with 1.6 and 3.1 billion parameters and find that they achieve +competitive performance across a wide variety of benchmarks. We release all our +models on HuggingFace under the Apache 2.0 license. Models at: +https://huggingface.co/RWKV Training code at: https://github.com/RWKV/RWKV-LM +Inference code at: https://github.com/RWKV/ChatRWKV Time-parallel training code +at: https://github.com/RWKV/RWKV-infctx-trainer + +
+
+
+
+
+ + ♻ ☆ Contrastive Learning for Knowledge-Based Question Generation in Large + Language Models + + +
+ With the rapid development of artificial intelligence technology, especially +the increasingly widespread application of question-and-answer systems, +high-quality question generation has become a key component in supporting the +development of these systems. This article focuses on knowledge-based question +generation technology, which aims to enable computers to simulate the human +questioning process based on understanding specific texts or knowledge bases. +In light of the issues of hallucination and knowledge gaps present in +large-scale language models when applied to knowledge-intensive tasks, this +paper proposes an enhanced question generation method that incorporates +contrastive learning. This method utilizes multiple models to jointly mine +domain knowledge and uses contrastive learning to guide the model in reducing +noise and hallucinations in generation. Experimental results show that by +designing prompts containing contrasting examples, the model's performance in +question generation improves considerably, particularly when contrasting +instructions and examples are used simultaneously, leading to the highest +quality of generated questions and improved accuracy. These results demonstrate +that the method proposed in this study, which combines contrasting context and +chain-of-thought prompts, can effectively improve both the quality and the +practicality of question generation. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Improvements to SDXL in NovelAI Diffusion V3 + + +
+ In this technical report, we document the changes we made to SDXL in the +process of training NovelAI Diffusion V3, our state of the art anime image +generation model. + +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Harnessing Multimodal Large Language Models for Multimodal Sequential + Recommendation + + +
+ Recent advances in Large Language Models (LLMs) have demonstrated significant +potential in the field of Recommendation Systems (RSs). Most existing studies +have focused on converting user behavior logs into textual prompts and +leveraging techniques such as prompt tuning to enable LLMs for recommendation +tasks. Meanwhile, research interest has recently grown in multimodal +recommendation systems that integrate data from images, text, and other sources +using modality fusion techniques. This introduces new challenges to the +existing LLM-based recommendation paradigm which relies solely on text modality +information. Moreover, although Multimodal Large Language Models (MLLMs) +capable of processing multi-modal inputs have emerged, how to equip MLLMs with +multi-modal recommendation capabilities remains largely unexplored. To this +end, in this paper, we propose the Multimodal Large Language Model-enhanced +Multimodaln Sequential Recommendation (MLLM-MSR) model. To capture the dynamic +user preference, we design a two-stage user preference summarization method. +Specifically, we first utilize an MLLM-based item-summarizer to extract image +feature given an item and convert the image into text. Then, we employ a +recurrent user preference summarization generation paradigm to capture the +dynamic changes in user preferences based on an LLM-based user-summarizer. +Finally, to enable the MLLM for multi-modal recommendation task, we propose to +fine-tune a MLLM-based recommender using Supervised Fine-Tuning (SFT) +techniques. Extensive evaluations across various datasets validate the +effectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt +to the evolving dynamics of user preferences. + +
+
+
+
+
+ + ♻ ☆ A Distributed Privacy Preserving Model for the Detection of Alzheimer's + Disease + + +
+ In the era of rapidly advancing medical technologies, the segmentation of +medical data has become inevitable, necessitating the development of privacy +preserving machine learning algorithms that can train on distributed data. +Consolidating sensitive medical data is not always an option particularly due +to the stringent privacy regulations imposed by the Health Insurance +Portability and Accountability Act (HIPAA). In this paper, I introduce a HIPAA +compliant framework that can train from distributed data. I then propose a +multimodal vertical federated model for Alzheimer's Disease (AD) detection, a +serious neurodegenerative condition that can cause dementia, severely impairing +brain function and hindering simple tasks, especially without preventative +care. This vertical federated learning (VFL) model offers a distributed +architecture that enables collaborative learning across diverse sources of +medical data while respecting privacy constraints imposed by HIPAA. The VFL +architecture proposed herein offers a novel distributed architecture, enabling +collaborative learning across diverse sources of medical data while respecting +statutory privacy constraints. By leveraging multiple modalities of data, the +robustness and accuracy of AD detection can be enhanced. This model not only +contributes to the advancement of federated learning techniques but also holds +promise for overcoming the hurdles posed by data segmentation in medical +research. + +
+
+ comment: 15 pages, 7 figures, 2 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 68 + +
+
+
+ + ☆ Real-World Data Inspired Interactive Connected Traffic Scenario + Generation + + +
+ Simulation is a crucial step in ensuring accurate, efficient, and realistic +Connected and Autonomous Vehicles (CAVs) testing and validation. As the +adoption of CAV accelerates, the integration of real-world data into simulation +environments becomes increasingly critical. Among various technologies utilized +by CAVs, Vehicle-to-Everything (V2X) communication plays a crucial role in +ensuring a seamless transmission of information between CAVs, infrastructure, +and other road users. However, most existing studies have focused on developing +and testing communication protocols, resource allocation strategies, and data +dissemination techniques in V2X. There is a gap where real-world V2X data is +integrated into simulations to generate diverse and high-fidelity traffic +scenarios. To fulfill this research gap, we leverage real-world Signal Phase +and Timing (SPaT) data from Roadside Units (RSUs) to enhance the fidelity of +CAV simulations. Moreover, we developed an algorithm that enables Autonomous +Vehicles (AVs) to respond dynamically to real-time traffic signal data, +simulating realistic V2X communication scenarios. Such high-fidelity simulation +environments can generate multimodal data, including trajectory, semantic +camera, depth camera, and bird's eye view data for various traffic scenarios. +The generated scenarios and data provide invaluable insights into AVs' +interactions with traffic infrastructure and other road users. This work aims +to bridge the gap between theoretical research and practical deployment of +CAVs, facilitating the development of smarter and safer transportation systems. + +
+
+
+
+
+ + ☆ An Anatomy-Aware Shared Control Approach for Assisted Teleoperation of + Lung Ultrasound Examinations + + +
+ The introduction of artificial intelligence and robotics in telehealth is +enabling personalised treatment and supporting teleoperated procedures such as +lung ultrasound, which has gained attention during the COVID-19 pandemic. +Although fully autonomous systems face challenges due to anatomical +variability, teleoperated systems appear to be more practical in current +healthcare settings. This paper presents an anatomy-aware control framework for +teleoperated lung ultrasound. Using biomechanically accurate 3D models such as +SMPL and SKEL, the system provides a real-time visual feedback and applies +virtual constraints to assist in precise probe placement tasks. Evaluations on +five subjects show the accuracy of the biomechanical models and the efficiency +of the system in improving probe placement and reducing procedure time compared +to traditional teleoperation. The results demonstrate that the proposed +framework enhances the physician's capabilities in executing remote lung +ultrasound examinations, towards more objective and repeatable acquisitions. + +
+
+
+
+
+ + ☆ Safe Leaf Manipulation for Accurate Shape and Pose Estimation of + Occluded Fruits ICRA 2025 + + +
+ Fruit monitoring plays an important role in crop management, and rising +global fruit consumption combined with labor shortages necessitates automated +monitoring with robots. However, occlusions from plant foliage often hinder +accurate shape and pose estimation. Therefore, we propose an active fruit shape +and pose estimation method that physically manipulates occluding leaves to +reveal hidden fruits. This paper introduces a framework that plans robot +actions to maximize visibility and minimize leaf damage. We developed a novel +scene-consistent shape completion technique to improve fruit estimation under +heavy occlusion and utilize a perception-driven deformation graph model to +predict leaf deformation during planning. Experiments on artificial and real +sweet pepper plants demonstrate that our method enables robots to safely move +leaves aside, exposing fruits for accurate shape and pose estimation, +outperforming baseline methods. Project page: +https://shaoxiongyao.github.io/lmap-ssc/. + +
+
+ comment: Shaoxiong Yao and Sicong Pan have equal contributions. Submitted to + ICRA 2025 +
+
+
+
+
+ + ☆ Decentralized Nonlinear Model Predictive Control for Safe Collision + Avoidance in Quadrotor Teams with Limited Detection Range ICRA + + +
+ Multi-quadrotor systems face significant challenges in decentralized control, +particularly with safety and coordination under sensing and communication +limitations. State-of-the-art methods leverage Control Barrier Functions (CBFs) +to provide safety guarantees but often neglect actuation constraints and +limited detection range. To address these gaps, we propose a novel +decentralized Nonlinear Model Predictive Control (NMPC) that integrates +Exponential CBFs (ECBFs) to enhance safety and optimality in multi-quadrotor +systems. We provide both conservative and practical minimum bounds of the range +that preserve the safety guarantees of the ECBFs. We validate our approach +through extensive simulations with up to 10 quadrotors and 20 obstacles, as +well as real-world experiments with 3 quadrotors. Results demonstrate the +effectiveness of the proposed framework in realistic settings, highlighting its +potential for reliable quadrotor teams operations. + +
+
+ comment: 7 pages, 5 figures, Submitted to the IEEE International Conference on + Robotics and Automation (ICRA) 2025 +
+
+
+
+
+ + ☆ Data-driven Probabilistic Trajectory Learning with High Temporal + Resolution in Terminal Airspace + + +
+ Predicting flight trajectories is a research area that holds significant +merit. In this paper, we propose a data-driven learning framework, that +leverages the predictive and feature extraction capabilities of the mixture +models and seq2seq-based neural networks while addressing prevalent challenges +caused by error propagation and dimensionality reduction. After training with +this framework, the learned model can improve long-step prediction accuracy +significantly given the past trajectories and the context information. The +accuracy and effectiveness of the approach are evaluated by comparing the +predicted trajectories with the ground truth. The results indicate that the +proposed method has outperformed the state-of-the-art predicting methods on a +terminal airspace flight trajectory dataset. The trajectories generated by the +proposed method have a higher temporal resolution(1 timestep per second vs 0.1 +timestep per second) and are closer to the ground truth. + +
+
+ comment: Submitted to AIAA-JAIS +
+
+
+
+
+ + ☆ SeaSplat: Representing Underwater Scenes with 3D Gaussian Splatting and + a Physically Grounded Image Formation Model + + +
+ We introduce SeaSplat, a method to enable real-time rendering of underwater +scenes leveraging recent advances in 3D radiance fields. Underwater scenes are +challenging visual environments, as rendering through a medium such as water +introduces both range and color dependent effects on image capture. We +constrain 3D Gaussian Splatting (3DGS), a recent advance in radiance fields +enabling rapid training and real-time rendering of full 3D scenes, with a +physically grounded underwater image formation model. Applying SeaSplat to the +real-world scenes from SeaThru-NeRF dataset, a scene collected by an underwater +vehicle in the US Virgin Islands, and simulation-degraded real-world scenes, +not only do we see increased quantitative performance on rendering novel +viewpoints from the scene with the medium present, but are also able to recover +the underlying true color of the scene and restore renders to be without the +presence of the intervening medium. We show that the underwater image formation +helps learn scene structure, with better depth maps, as well as show that our +improvements maintain the significant computational improvements afforded by +leveraging a 3D Gaussian representation. + +
+
+ comment: Project page here: https://seasplat.github.io +
+
+
+
+
+ + ☆ Koopman-driven grip force prediction through EMG sensing + + +
+ Loss of hand function due to conditions like stroke or multiple sclerosis +significantly impacts daily activities. Robotic rehabilitation provides tools +to restore hand function, while novel methods based on surface electromyography +(sEMG) enable the adaptation of the device's force output according to the +user's condition, thereby improving rehabilitation outcomes. This study aims to +achieve accurate force estimations during medium wrap grasps using a single +sEMG sensor pair, thereby addressing the challenge of escalating sensor +requirements for precise predictions. We conducted sEMG measurements on 13 +subjects at two forearm positions, validating results with a hand dynamometer. +We established flexible signal-processing steps, yielding high peak +cross-correlations between the processed sEMG signal (representing meaningful +muscle activity) and grip force. Influential parameters were subsequently +identified through sensitivity analysis. Leveraging a novel data-driven Koopman +operator theory-based approach and problem-specific data lifting techniques, we +devised a methodology for the estimation and short-term prediction of grip +force from processed sEMG signals. A weighted mean absolute percentage error +(wMAPE) of approx. 5.5% was achieved for the estimated grip force, whereas +predictions with a 0.5-second prediction horizon resulted in a wMAPE of approx. +17.9%. The methodology proved robust regarding precise electrode positioning, +as the effect of sensing position on error metrics was non-significant. The +algorithm executes exceptionally fast, processing, estimating, and predicting a +0.5-second sEMG signal batch in just approx. 30 ms, facilitating real-time +implementation. + +
+
+ comment: 11 pages, 8 figures, journal +
+
+
+
+
+ + ☆ Building Real-time Awareness of Out-of-distribution in Trajectory + Prediction for Autonomous Vehicles + + +
+ Trajectory prediction describes the motions of surrounding moving obstacles +for an autonomous vehicle; it plays a crucial role in enabling timely +decision-making, such as collision avoidance and trajectory replanning. +Accurate trajectory planning is the key to reliable vehicle deployments in +open-world environment, where unstructured obstacles bring in uncertainties +that are impossible to fully capture by training data. For traditional machine +learning tasks, such uncertainties are often addressed reasonably well via +methods such as continual learning. On the one hand, naively applying those +methods to trajectory prediction can result in continuous data collection and +frequent model updates, which can be resource-intensive. On the other hand, the +predicted trajectories can be far away from the true trajectories, leading to +unsafe decision-making. In this paper, we aim to establish real-time awareness +of out-of-distribution in trajectory prediction for autonomous vehicles. We +focus on the challenging and practically relevant setting where the +out-of-distribution is deceptive, that is, the one not easily detectable by +human intuition. Drawing on the well-established techniques of sequential +analysis, we build real-time awareness of out-of-distribution by monitoring +prediction errors using the quickest change point detection (QCD). Our +solutions are lightweight and can handle the occurrence of out-of-distribution +at any time during trajectory prediction inference. Experimental results on +multiple real-world datasets using a benchmark trajectory prediction model +demonstrate the effectiveness of our methods. + +
+
+
+
+
+ + ☆ CROSS-GAiT: Cross-Attention-Based Multimodal Representation Fusion for + Parametric Gait Adaptation in Complex Terrains + + +
+ We present CROSS-GAiT, a novel algorithm for quadruped robots that uses Cross +Attention to fuse terrain representations derived from visual and time-series +inputs, including linear accelerations, angular velocities, and joint efforts. +These fused representations are used to adjust the robot's step height and hip +splay, enabling adaptive gaits that respond dynamically to varying terrain +conditions. We generate these terrain representations by processing visual +inputs through a masked Vision Transformer (ViT) encoder and time-series data +through a dilated causal convolutional encoder. The cross-attention mechanism +then selects and integrates the most relevant features from each modality, +combining terrain characteristics with robot dynamics for better-informed gait +adjustments. CROSS-GAiT uses the combined representation to dynamically adjust +gait parameters in response to varying and unpredictable terrains. We train +CROSS-GAiT on data from diverse terrains, including asphalt, concrete, brick +pavements, grass, dense vegetation, pebbles, gravel, and sand. Our algorithm +generalizes well and adapts to unseen environmental conditions, enhancing +real-time navigation performance. CROSS-GAiT was implemented on a Ghost +Robotics Vision 60 robot and extensively tested in complex terrains with high +vegetation density, uneven/unstable surfaces, sand banks, deformable +substrates, etc. We observe at least a 7.04% reduction in IMU energy density +and a 27.3% reduction in total joint effort, which directly correlates with +increased stability and reduced energy usage when compared to state-of-the-art +methods. Furthermore, CROSS-GAiT demonstrates at least a 64.5% increase in +success rate and a 4.91% reduction in time to reach the goal in four complex +scenarios. Additionally, the learned representations perform 4.48% better than +the state-of-the-art on a terrain classification task. + +
+
+
+
+
+ + ☆ Enhancing robot reliability for health-care facilities by means of + Human-Aware Navigation Planning + + +
+ With the aim of enabling robots to cooperate with humans, carry out +human-like tasks, or navigate among humans, we need to ensure that they are +equipped with the ability to comprehend human behaviors and use the extracted +knowledge for intelligent decision-making. This ability is particularly +important in the safety-critical and human-centred environment of health-care +institutions. In the field of robotic navigation, the most cutting-edge +approaches to enhancing robot reliability in the application domain of +healthcare facilities and in general pertain to augmenting navigation systems +with human-aware properties. To implement this in our work, the Co-operative +Human-Aware Navigation planner has been integrated into the ROS-based +differential-drive robot MARRtina and exhaustively challenged within various +simulated contexts and scenarios (mainly modelling the situations relevant in +the medical domain) to draw attention to the integrated system's benefits and +identify its drawbacks or instances of poor performance while exploring the +scope of system capabilities and creating a full characterization of its +applicability. The simulation results are then presented to medical experts, +and the enhanced robot acceptability within the domain is validated with them +as the robot is further planned for deployment. + +
+
+
+
+
+ + ☆ Blox-Net: Generative Design-for-Robot-Assembly Using VLM Supervision, + Physics Simulation, and a Robot with Reset + + +
+ Generative AI systems have shown impressive capabilities in creating text, +code, and images. Inspired by the rich history of research in industrial +''Design for Assembly'', we introduce a novel problem: Generative +Design-for-Robot-Assembly (GDfRA). The task is to generate an assembly based on +a natural language prompt (e.g., ''giraffe'') and an image of available +physical components, such as 3D-printed blocks. The output is an assembly, a +spatial arrangement of these components, and instructions for a robot to build +this assembly. The output must 1) resemble the requested object and 2) be +reliably assembled by a 6 DoF robot arm with a suction gripper. We then present +Blox-Net, a GDfRA system that combines generative vision language models with +well-established methods in computer vision, simulation, perturbation analysis, +motion planning, and physical robot experimentation to solve a class of GDfRA +problems with minimal human supervision. Blox-Net achieved a Top-1 accuracy of +63.5% in the ''recognizability'' of its designed assemblies (eg, resembling +giraffe as judged by a VLM). These designs, after automated perturbation +redesign, were reliably assembled by a robot, achieving near-perfect success +across 10 consecutive assembly iterations with human intervention only during +reset prior to assembly. Surprisingly, this entire design process from textual +word (''giraffe'') to reliable physical assembly is performed with zero human +intervention. + +
+
+ comment: 8 pages, 7 Figures +
+
+
+
+
+ + ☆ PokeFlex: Towards a Real-World Dataset of Deformable Objects for Robotic + Manipulation ICRA + + +
+ Advancing robotic manipulation of deformable objects can enable automation of +repetitive tasks across multiple industries, from food processing to textiles +and healthcare. Yet robots struggle with the high dimensionality of deformable +objects and their complex dynamics. While data-driven methods have shown +potential for solving manipulation tasks, their application in the domain of +deformable objects has been constrained by the lack of data. To address this, +we propose PokeFlex, a pilot dataset featuring real-world 3D mesh data of +actively deformed objects, together with the corresponding forces and torques +applied by a robotic arm, using a simple poking strategy. Deformations are +captured with a professional volumetric capture system that allows for complete +360-degree reconstruction. The PokeFlex dataset consists of five deformable +objects with varying stiffness and shapes. Additionally, we leverage the +PokeFlex dataset to train a vision model for online 3D mesh reconstruction from +a single image and a template mesh. We refer readers to the supplementary +material and to our website ( https://pokeflex-dataset.github.io/ ) for demos +and examples of our dataset. + +
+
+ comment: Extended Abstract, 40th Anniversary of the IEEE International + Conference on Robotics and Automation. (ICRA@40 Rotterdam 2024) +
+
+
+
+
+ + ☆ Hierarchical Tri-manual Planning for Vision-assisted Fruit Harvesting + with Quadrupedal Robots + + +
+ This paper addresses the challenge of developing a multi-arm quadrupedal +robot capable of efficiently harvesting fruit in complex, natural environments. +To overcome the inherent limitations of traditional bimanual manipulation, we +introduce the first three-arm quadrupedal robot LocoHarv-3 and propose a novel +hierarchical tri-manual planning approach, enabling automated fruit harvesting +with collision-free trajectories. Our comprehensive semi-autonomous framework +integrates teleoperation, supported by LiDAR-based odometry and mapping, with +learning-based visual perception for accurate fruit detection and pose +estimation. Validation is conducted through a series of controlled indoor +experiments using motion capture and extensive field tests in natural settings. +Results demonstrate a 90\% success rate in in-lab settings with a single +attempt, and field trials further verify the system's robustness and efficiency +in more challenging real-world environments. + +
+
+ comment: 7 pages, 8 figures +
+
+
+
+
+ + ☆ Towards human-like kinematics in industrial robotic arms: a case study + on a UR3 robot + + +
+ Safety in industrial robotic environments is a hot research topic in the area +of human-robot interaction (HRI). Up to now, a robotic arm on an assembly line +interacts with other machines away from human workers. Nowadays, robotic arm +manufactures are aimed to their robots could increasingly perform tasks +collaborating with humans. One of the ways to improve this collaboration is by +making the movement of robots more humanlike. This way, it would be easier for +a human to foresee the movement of the robot and approach it without fear of +contact. The main difference between the movement of a human and of a robotic +arm is that the former has a bell-shaped speed profile while the latter has a +uniform speed one. To generate this speed profile, the kinematic theory of +rapid human movements and its Sigma-Lognormal model has been used. This model +is widely used to explain most of the basic phenomena related to the control of +human movements. Both human-like and robotic-like movements are transferred to +the UR3 robot. In this paper we detail the how the UR3 robot was programmed to +produce both kinds of movement. The dissimilarities result between the input +motion and output motion to the robot confirm the possibility to develop +human-like velocities in the UR3 robot. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Self-Sensing for Proprioception and Contact Detection in Soft Robots + Using Shape Memory Alloy Artificial Muscles + + +
+ Estimating a soft robot's pose and applied forces, also called +proprioception, is crucial for safe interaction of the robot with its +environment. However, most solutions for soft robot proprioception use +dedicated sensors, particularly for external forces, which introduce design +trade-offs, rigidity, and risk of failure. This work presents an approach for +pose estimation and contact detection for soft robots actuated by shape memory +alloy (SMA) artificial muscles, using no dedicated force sensors. Our framework +uses the unique material properties of SMAs to self-sense their internal +stress, via offboard measurements of their electrical resistance and in-situ +temperature readings, in an existing fully-soft limb design. We demonstrate +that a simple polynomial regression model on these measurements is sufficient +to predict the robot's pose, under no-contact conditions. Then, we show that if +an additional measurement of the true pose is available (e.g. from an +already-in-place bending sensor), it is possible to predict a binary +contact/no-contact using multiple combinations of self-sensing signals. Our +hardware tests verify our hypothesis via a contact detection test with a human +operator. This proof-of-concept validates that self-sensing signals in soft +SMA-actuated soft robots can be used for proprioception and contact detection, +and suggests a direction for integrating proprioception into soft robots +without design compromises. Future work could employ machine learning for +enhanced accuracy. + +
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + Collision-free time-optimal path parameterization for multi-robot teams + + +
+ Coordinating the motion of multiple robots in cluttered environments remains +a computationally challenging task. We study the problem of minimizing the +execution time of a set of geometric paths by a team of robots with +state-dependent actuation constraints. We propose a Time-Optimal Path +Parameterization (TOPP) algorithm for multiple car-like agents, where the +modulation of the timing of every robot along its assigned path is employed to +ensure collision avoidance and dynamic feasibility. This is achieved through +the use of a priority queue to determine the order of trajectory execution for +each robot while taking into account all possible collisions with higher +priority robots in a spatiotemporal graph. We show a 10-20% reduction in +makespan against existing state-of-the-art methods and validate our approach +through simulations and hardware experiments. + +
+
+
+
+
+ + ☆ 2024 BRAVO Challenge Track 1 1st Place Report: Evaluating Robustness of + Vision Foundation Models for Semantic Segmentation + + +
+ In this report, we present our solution for Track 1 of the 2024 BRAVO +Challenge, where a model is trained on Cityscapes and its robustness is +evaluated on several out-of-distribution datasets. Our solution leverages the +powerful representations learned by vision foundation models, by attaching a +simple segmentation decoder to DINOv2 and fine-tuning the entire model. This +approach outperforms more complex existing approaches, and achieves 1st place +in the challenge. Our code is publicly available at +https://github.com/tue-mps/benchmark-vfm-ss. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2409.15107 +
+
+
+
+
+ + ☆ Semantically-Driven Disambiguation for Human-Robot Interaction + + +
+ Ambiguities are common in human-robot interaction, especially when a robot +follows user instructions in a large collocated space. For instance, when the +user asks the robot to find an object in a home environment, the object might +be in several places depending on its varying semantic properties (e.g., a bowl +can be in the kitchen cabinet or on the dining room table, depending on whether +it is clean/dirty, full/empty and the other objects around it). Previous works +on object semantics have predicted such relationships using one shot-inferences +which are likely to fail for ambiguous or partially understood instructions. +This paper focuses on this gap and suggests a semantically-driven +disambiguation approach by utilizing follow-up clarifications to handle such +uncertainties. To achieve this, we first obtain semantic knowledge embeddings, +and then these embeddings are used to generate clarifying questions by +following an iterative process. The evaluation of our method shows that our +approach is model agnostic, i.e., applicable to different semantic embedding +models, and follow-up clarifications improve the performance regardless of the +embedding model. Additionally, our ablation studies show the significance of +informative clarifications and iterative predictions to enhance system +accuracies. + +
+
+
+
+
+ + ☆ WasteGAN: Data Augmentation for Robotic Waste Sorting through Generative + Adversarial Networks IROS 2024 + + +
+ Robotic waste sorting poses significant challenges in both perception and +manipulation, given the extreme variability of objects that should be +recognized on a cluttered conveyor belt. While deep learning has proven +effective in solving complex tasks, the necessity for extensive data collection +and labeling limits its applicability in real-world scenarios like waste +sorting. To tackle this issue, we introduce a data augmentation method based on +a novel GAN architecture called wasteGAN. The proposed method allows to +increase the performance of semantic segmentation models, starting from a very +limited bunch of labeled examples, such as few as 100. The key innovations of +wasteGAN include a novel loss function, a novel activation function, and a +larger generator block. Overall, such innovations helps the network to learn +from limited number of examples and synthesize data that better mirrors +real-world distributions. We then leverage the higher-quality segmentation +masks predicted from models trained on the wasteGAN synthetic data to compute +semantic-aware grasp poses, enabling a robotic arm to effectively recognizing +contaminants and separating waste in a real-world scenario. Through +comprehensive evaluation encompassing dataset-based assessments and real-world +experiments, our methodology demonstrated promising potential for robotic waste +sorting, yielding performance gains of up to 5.8\% in picking contaminants. The +project page is available at https://github.com/bach05/wasteGAN.git + +
+
+ comment: Accepted at 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ Hydraulic Volumetric Soft Everting Vine Robot Steering Mechanism for + Underwater Exploration + + +
+ Despite a significant proportion of the Earth being covered in water, +exploration of what lies below has been limited due to the challenges and +difficulties inherent in the process. Current state of the art robots such as +Remotely Operated Vehicles (ROVs) and Autonomous Underwater Vehicles (AUVs) are +bulky, rigid and unable to conform to their environment. Soft robotics offers +solutions to this issue. Fluid-actuated eversion or growing robots, in +particular, are a good example. While current eversion robots have found many +applications on land, their inherent properties make them particularly well +suited to underwater environments. An important factor when considering +underwater eversion robots is the establishment of a suitable steering +mechanism that can enable the robot to change direction as required. This +project proposes a design for an eversion robot that is capable of steering +while underwater, through the use of bending pouches, a design commonly seen in +the literature on land-based eversion robots. These bending pouches contract to +enable directional change. Similar to their land-based counterparts, the +underwater eversion robot uses the same fluid in the medium it operates in to +achieve extension and bending but also to additionally aid in neutral buoyancy. +The actuation method of bending pouches meant that robots needed to fully +extend before steering was possible. Three robots, with the same design and +dimensions were constructed from polyethylene tubes and tested. Our research +shows that although the soft eversion robot design in this paper was not +capable of consistently generating the same amounts of bending for the +inflation volume, it still achieved suitable bending at a range of inflation +volumes and was observed to bend to a maximum angle of 68 degrees at 2000 ml, +which is in line with the bending angles reported for land-based eversion +robots in the literature. + +
+
+
+
+
+ + ☆ Efficient Submap-based Autonomous MAV Exploration using Visual-Inertial + SLAM Configurable for LiDARs or Depth Cameras + + +
+ Autonomous exploration of unknown space is an essential component for the +deployment of mobile robots in the real world. Safe navigation is crucial for +all robotics applications and requires accurate and consistent maps of the +robot's surroundings. To achieve full autonomy and allow deployment in a wide +variety of environments, the robot must rely on on-board state estimation which +is prone to drift over time. We propose a Micro Aerial Vehicle (MAV) +exploration framework based on local submaps to allow retaining global +consistency by applying loop-closure corrections to the relative submap poses. +To enable large-scale exploration we efficiently compute global, +environment-wide frontiers from the local submap frontiers and use a +sampling-based next-best-view exploration planner. Our method seamlessly +supports using either a LiDAR sensor or a depth camera, making it suitable for +different kinds of MAV platforms. We perform comparative evaluations in +simulation against a state-of-the-art submap-based exploration framework to +showcase the efficiency and reconstruction quality of our approach. Finally, we +demonstrate the applicability of our method to real-world MAVs, one equipped +with a LiDAR and the other with a depth camera. Video available at +https://youtu.be/Uf5fwmYcuq4 . + +
+
+ comment: 7 pages, 8 figures, for the accompanying video see + https://youtu.be/Uf5fwmYcuq4 +
+
+
+
+
+ + ☆ Multi-Robot Informative Path Planning for Efficient Target Mapping using + Deep Reinforcement Learning + + +
+ Autonomous robots are being employed in several mapping and data collection +tasks due to their efficiency and low labor costs. In these tasks, the robots +are required to map targets-of-interest in an unknown environment while +constrained to a given resource budget such as path length or mission time. +This is a challenging problem as each robot has to not only detect and avoid +collisions from static obstacles in the environment but also has to model other +robots' trajectories to avoid inter-robot collisions. We propose a novel deep +reinforcement learning approach for multi-robot informative path planning to +map targets-of-interest in an unknown 3D environment. A key aspect of our +approach is an augmented graph that models other robots' trajectories to enable +planning for communication and inter-robot collision avoidance. We train our +decentralized reinforcement learning policy via the centralized training and +decentralized execution paradigm. Once trained, our policy is also scalable to +varying number of robots and does not require re-training. Our approach +outperforms other state-of-the-art multi-robot target mapping approaches by +33.75% in terms of the number of discovered targets-of-interest. We open-source +our code and model at: https://github.com/AccGen99/marl_ipp + +
+
+ comment: arXiv admin note: text overlap with arXiv:2402.04894 +
+
+
+
+
+ + ☆ DualLQR: Efficient Grasping of Oscillating Apples using Task + Parameterized Learning from Demonstration ICRA2025 + + +
+ Learning from Demonstration offers great potential for robots to learn to +perform agricultural tasks, specifically selective harvesting. One of the +challenges is that the target fruit can be oscillating while approaching. +Grasping oscillating targets has two requirements: 1) close tracking of the +target during the final approach for damage-free grasping, and 2) the complete +path should be as short as possible for improved efficiency. We propose a new +method called DualLQR. In this method, we use a finite horizon Linear Quadratic +Regulator (LQR) on a moving target, without the need of refitting the LQR. To +make this possible, we use a dual LQR setup, with an LQR running in two +seperate reference frames. Through extensive simulation testing, it was found +that the state-of-art method barely meets the required final accuracy without +oscillations and drops below the required accuracy with an oscillating target. +DualLQR was found to be able to meet the required final accuracy even with high +oscillations, with an accuracy increase of 60% for high orientation +oscillations. Further testing on a real-world apple grasping task showed that +DualLQR was able to successfully grasp oscillating apples, with a success rate +of 99%. + +
+
+ comment: Submitted to ICRA2025 +
+
+
+
+
+ + ☆ Dynamic Obstacle Avoidance through Uncertainty-Based Adaptive Planning + with Diffusion + + +
+ By framing reinforcement learning as a sequence modeling problem, recent work +has enabled the use of generative models, such as diffusion models, for +planning. While these models are effective in predicting long-horizon state +trajectories in deterministic environments, they face challenges in dynamic +settings with moving obstacles. Effective collision avoidance demands +continuous monitoring and adaptive decision-making. While replanning at every +timestep could ensure safety, it introduces substantial computational overhead +due to the repetitive prediction of overlapping state sequences -- a process +that is particularly costly with diffusion models, known for their intensive +iterative sampling procedure. We propose an adaptive generative planning +approach that dynamically adjusts replanning frequency based on the uncertainty +of action predictions. Our method minimizes the need for frequent, +computationally expensive, and redundant replanning while maintaining robust +collision avoidance performance. In experiments, we obtain a 13.5% increase in +the mean trajectory length and a 12.7% increase in mean reward over +long-horizon planning, indicating a reduction in collision rates and an +improved ability to navigate the environment safely. + +
+
+
+
+
+ + ☆ Go-SLAM: Grounded Object Segmentation and Localization with Gaussian + Splatting SLAM + + +
+ We introduce Go-SLAM, a novel framework that utilizes 3D Gaussian Splatting +SLAM to reconstruct dynamic environments while embedding object-level +information within the scene representations. This framework employs advanced +object segmentation techniques, assigning a unique identifier to each Gaussian +splat that corresponds to the object it represents. Consequently, our system +facilitates open-vocabulary querying, allowing users to locate objects using +natural language descriptions. Furthermore, the framework features an optimal +path generation module that calculates efficient navigation paths for robots +toward queried objects, considering obstacles and environmental uncertainties. +Comprehensive evaluations in various scene settings demonstrate the +effectiveness of our approach in delivering high-fidelity scene +reconstructions, precise object segmentation, flexible object querying, and +efficient robot path planning. This work represents an additional step forward +in bridging the gap between 3D scene reconstruction, semantic object +understanding, and real-time environment interactions. + +
+
+
+
+
+ + ☆ Performance assessment of ADAS in a representative subset of critical + traffic situations + + +
+ As a variety of automated collision prevention systems gain presence within +personal vehicles, rating and differentiating the automated safety performance +of car models has become increasingly important for consumers, manufacturers, +and insurers. In 2023, Swiss Re and partners initiated an eight-month long +vehicle testing campaign conducted on a recognized UNECE type approval +authority and Euro NCAP accredited proving ground in Germany. The campaign +exposed twelve mass-produced vehicle models and one prototype vehicle fitted +with collision prevention systems to a selection of safety-critical traffic +scenarios representative of United States and European Union accident +landscape. In this paper, we compare and evaluate the relative safety +performance of these thirteen collision prevention systems (hardware and +software stack) as demonstrated by this testing campaign. We first introduce a +new scoring system which represents a test system's predicted impact on overall +real-world collision frequency and reduction of collision impact energy, +weighted based on the real-world relevance of the test scenario. Next, we +introduce a novel metric that quantifies the realism of the protocol and +confirm that our test protocol is a plausible representation of real-world +driving. Finally, we find that the prototype system in its pre-release state +outperforms the mass-produced (post-consumer-release) vehicles in the majority +of the tested scenarios on the test track. + +
+
+
+
+
+ + ☆ Let's Make a Splan: Risk-Aware Trajectory Optimization in a Normalized + Gaussian Splat + + +
+ Neural Radiance Fields and Gaussian Splatting have transformed the field of +computer vision by enabling photo-realistic representation of complex scenes. +Despite this success, they have seen only limited use in real-world robotics +tasks such as trajectory optimization. Two key factors have contributed to this +limited success. First, it is challenging to reason about collisions in +radiance models. Second, it is difficult to perform inference of radiance +models fast enough for real-time trajectory synthesis. This paper addresses +these challenges by proposing SPLANNING, a risk-aware trajectory optimizer that +operates in a Gaussian Splatting model. This paper first derives a method for +rigorously upper-bounding the probability of collision between a robot and a +radiance field. Second, this paper introduces a normalized reformulation of +Gaussian Splatting that enables the efficient computation of the collision +bound in a Gaussian Splat. Third, a method is presented to optimize +trajectories while avoiding collisions with a scene represented by a Gaussian +Splat. Experiments demonstrate that SPLANNING outperforms state-of-the-art +methods in generating collision-free trajectories in highly cluttered +environments. The proposed system is also tested on a real-world robot +manipulator. A project page is available at +https://roahmlab.github.io/splanning. + +
+
+ comment: First two authors contributed equally. Project Page: + https://roahmlab.github.io/splanning +
+
+
+
+
+ + ☆ A Roadmap for Embodied and Social Grounding in LLMs + + +
+ The fusion of Large Language Models (LLMs) and robotic systems has led to a +transformative paradigm in the robotic field, offering unparalleled +capabilities not only in the communication domain but also in skills like +multimodal input handling, high-level reasoning, and plan generation. The +grounding of LLMs knowledge into the empirical world has been considered a +crucial pathway to exploit the efficiency of LLMs in robotics. Nevertheless, +connecting LLMs' representations to the external world with multimodal +approaches or with robots' bodies is not enough to let them understand the +meaning of the language they are manipulating. Taking inspiration from humans, +this work draws attention to three necessary elements for an agent to grasp and +experience the world. The roadmap for LLMs grounding is envisaged in an active +bodily system as the reference point for experiencing the environment, a +temporally structured experience for a coherent, self-related interaction with +the external world, and social skills to acquire a common-grounded shared +experience. + +
+
+ comment: Accepted Version of a conference paper presented at Robophilosophy + Conference 2024 +
+
+
+
+
+ + ☆ Robotic Backchanneling in Online Conversation Facilitation: A + Cross-Generational Study + + +
+ Japan faces many challenges related to its aging society, including +increasing rates of cognitive decline in the population and a shortage of +caregivers. Efforts have begun to explore solutions using artificial +intelligence (AI), especially socially embodied intelligent agents and robots +that can communicate with people. Yet, there has been little research on the +compatibility of these agents with older adults in various everyday situations. +To this end, we conducted a user study to evaluate a robot that functions as a +facilitator for a group conversation protocol designed to prevent cognitive +decline. We modified the robot to use backchannelling, a natural human way of +speaking, to increase receptiveness of the robot and enjoyment of the group +conversation experience. We conducted a cross-generational study with young +adults and older adults. Qualitative analyses indicated that younger adults +perceived the backchannelling version of the robot as kinder, more trustworthy, +and more acceptable than the non-backchannelling robot. Finally, we found that +the robot's backchannelling elicited nonverbal backchanneling in older +participants. + +
+
+ comment: Published at Proceedings of the 2023 32nd IEEE International + Conference on Robot and Human Interactive Communication (RO-MAN 2023) +
+
+
+
+
+ + ☆ Revisiting Space Mission Planning: A Reinforcement Learning-Guided + Approach for Multi-Debris Rendezvous + + +
+ This research introduces a novel application of a masked Proximal Policy +Optimization (PPO) algorithm from the field of deep reinforcement learning +(RL), for determining the most efficient sequence of space debris visitation, +utilizing the Lambert solver as per Izzo's adaptation for individual +rendezvous. The aim is to optimize the sequence in which all the given debris +should be visited to get the least total time for rendezvous for the entire +mission. A neural network (NN) policy is developed, trained on simulated space +missions with varying debris fields. After training, the neural network +calculates approximately optimal paths using Izzo's adaptation of Lambert +maneuvers. Performance is evaluated against standard heuristics in mission +planning. The reinforcement learning approach demonstrates a significant +improvement in planning efficiency by optimizing the sequence for debris +rendezvous, reducing the total mission time by an average of approximately +{10.96\%} and {13.66\%} compared to the Genetic and Greedy algorithms, +respectively. The model on average identifies the most time-efficient sequence +for debris visitation across various simulated scenarios with the fastest +computational speed. This approach signifies a step forward in enhancing +mission planning strategies for space debris clearance. + +
+
+ comment: Accepted for publication at the 2024 International Conference on + Space Robotics (iSpaRo) +
+
+
+
+
+ + ☆ GRACE: Generating Socially Appropriate Robot Actions Leveraging LLMs and + Human Explanations ICRA + + +
+ When operating in human environments, robots need to handle complex tasks +while both adhering to social norms and accommodating individual preferences. +For instance, based on common sense knowledge, a household robot can predict +that it should avoid vacuuming during a social gathering, but it may still be +uncertain whether it should vacuum before or after having guests. In such +cases, integrating common-sense knowledge with human preferences, often +conveyed through human explanations, is fundamental yet a challenge for +existing systems. In this paper, we introduce GRACE, a novel approach +addressing this while generating socially appropriate robot actions. GRACE +leverages common sense knowledge from Large Language Models (LLMs), and it +integrates this knowledge with human explanations through a generative network +architecture. The bidirectional structure of GRACE enables robots to refine and +enhance LLM predictions by utilizing human explanations and makes robots +capable of generating such explanations for human-specified actions. Our +experimental evaluations show that integrating human explanations boosts +GRACE's performance, where it outperforms several baselines and provides +sensible explanations. + +
+
+ comment: Under review for 2025 IEEE International Conference on Robotics & + Automation (ICRA), Supplementary video: https://youtu.be/3gP3euwNBjQ +
+
+
+
+
+ + ☆ Behavior evolution-inspired approach to walking gait reinforcement + training for quadruped robots + + +
+ Reinforcement learning method is extremely competitive in gait generation +techniques for quadrupedal robot, which is mainly due to the fact that +stochastic exploration in reinforcement training is beneficial to achieve an +autonomous gait. Nevertheless, although incremental reinforcement learning is +employed to improve training success and movement smoothness by relying on the +continuity inherent during limb movements, challenges remain in adapting gait +policy to diverse terrain and external disturbance. Inspired by the association +between reinforcement learning and the evolution of animal motion behavior, a +self-improvement mechanism for reference gait is introduced in this paper to +enable incremental learning of action and self-improvement of reference action +together to imitate the evolution of animal motion behavior. Further, a new +framework for reinforcement training of quadruped gait is proposed. In this +framework, genetic algorithm is specifically adopted to perform global +probabilistic search for the initial value of the arbitrary foot trajectory to +update the reference trajectory with better fitness. Subsequently, the improved +reference gait is used for incremental reinforcement learning of gait. The +above process is repeatedly and alternatively executed to finally train the +gait policy. The analysis considering terrain, model dimensions, and locomotion +condition is presented in detail based on simulation, and the results show that +the framework is significantly more adaptive to terrain compared to regular +incremental reinforcement learning. + +
+
+
+
+
+ + ☆ Communication Backbone Reconfiguration with Connectivity Maintenance + + +
+ The exchange of information is key in applications that involve multiple +agents, such as search and rescue, military operations, and disaster response. +In this work, we propose a simple and effective trajectory planning framework +that tackles the design, deployment, and reconfiguration of a communication +backbone by reframing the problem of networked multi-agent motion planning as a +manipulator motion planning problem. Our approach works for backbones of +variable configurations both in terms of the number of robots utilized and the +distance limit between each robot. While research has been conducted on +connection-restricted navigation for multi-robot systems in the last years, the +field of manipulators is arguably more developed both in theory and practice. +Hence, our methodology facilitates practical applications built on top of +widely available motion planning algorithms and frameworks for manipulators. + +
+
+ comment: Submitted to IEEE Latin America Transactions +
+
+
+
+
+ + ☆ CREVE: An Acceleration-based Constraint Approach for Robust Radar + Ego-Velocity Estimation + + +
+ Ego-velocity estimation from point cloud measurements of a millimeter-wave +frequency-modulated continuous wave (mmWave FMCW) radar has become a crucial +component of radar-inertial odometry (RIO) systems. Conventional approaches +often perform poorly when the number of point cloud outliers exceeds that of +inliers. In this paper, we propose CREVE, an acceleration-based inequality +constraints filter that leverages additional measurements from an inertial +measurement unit (IMU) to achieve robust ego-velocity estimations. To further +enhance accuracy and robustness against sensor errors, we introduce a practical +accelerometer bias estimation method and a parameter adaptation rule. The +effectiveness of the proposed method is evaluated using five open-source drone +datasets. Experimental results demonstrate that our algorithm significantly +outperforms three existing state-of-the-art methods, achieving reductions in +absolute trajectory error of approximately 53%, 84%, and 35% compared to them. + +
+
+ comment: 7 pages, conference +
+
+
+
+
+ + ☆ Conditional Generative Denoiser for Nighttime UAV Tracking + + +
+ State-of-the-art (SOTA) visual object tracking methods have significantly +enhanced the autonomy of unmanned aerial vehicles (UAVs). However, in low-light +conditions, the presence of irregular real noise from the environments severely +degrades the performance of these SOTA methods. Moreover, existing SOTA +denoising techniques often fail to meet the real-time processing requirements +when deployed as plug-and-play denoisers for UAV tracking. To address this +challenge, this work proposes a novel conditional generative denoiser +(CGDenoiser), which breaks free from the limitations of traditional +deterministic paradigms and generates the noise conditioning on the input, +subsequently removing it. To better align the input dimensions and accelerate +inference, a novel nested residual Transformer conditionalizer is developed. +Furthermore, an innovative multi-kernel conditional refiner is designed to +pertinently refine the denoised output. Extensive experiments show that +CGDenoiser promotes the tracking precision of the SOTA tracker by 18.18\% on +DarkTrack2021 whereas working 5.8 times faster than the second well-performed +denoiser. Real-world tests with complex challenges also prove the effectiveness +and practicality of CGDenoiser. Code, video demo and supplementary proof for +CGDenoier are now available at: +\url{https://github.com/vision4robotics/CGDenoiser}. + +
+
+
+
+
+ + ☆ OffRIPP: Offline RL-based Informative Path Planning ICRA 2025 + + +
+ Informative path planning (IPP) is a crucial task in robotics, where agents +must design paths to gather valuable information about a target environment +while adhering to resource constraints. Reinforcement learning (RL) has been +shown to be effective for IPP, however, it requires environment interactions, +which are risky and expensive in practice. To address this problem, we propose +an offline RL-based IPP framework that optimizes information gain without +requiring real-time interaction during training, offering safety and +cost-efficiency by avoiding interaction, as well as superior performance and +fast computation during execution -- key advantages of RL. Our framework +leverages batch-constrained reinforcement learning to mitigate extrapolation +errors, enabling the agent to learn from pre-collected datasets generated by +arbitrary algorithms. We validate the framework through extensive simulations +and real-world experiments. The numerical results show that our framework +outperforms the baselines, demonstrating the effectiveness of the proposed +approach. + +
+
+ comment: 7 pages, 6 figures, submitted to ICRA 2025 +
+
+
+
+
+ + ☆ On the role of Artificial Intelligence methods in modern + force-controlled manufacturing robotic tasks + + +
+ This position paper explores the integration of Artificial Intelligence (AI) +into force-controlled robotic tasks within the scope of advanced manufacturing, +a cornerstone of Industry 4.0. AI's role in enhancing robotic manipulators - +key drivers in the Fourth Industrial Revolution - is rapidly leading to +significant innovations in smart manufacturing. The objective of this article +is to frame these innovations in practical force-controlled applications - e.g. +deburring, polishing, and assembly tasks like peg-in-hole (PiH) - highlighting +their necessity for maintaining high-quality production standards. By reporting +on recent AI-based methodologies, this article contrasts them and identifies +current challenges to be addressed in future research. The analysis concludes +with a perspective on future research directions, emphasizing the need for +common performance metrics to validate AI techniques, integration of various +enhancements for performance optimization, and the importance of validating +them in relevant scenarios. These future directions aim to provide consistency +with already adopted approaches, so as to be compatible with manufacturing +standards, increasing the relevance of AI-driven methods in both academic and +industrial contexts. + +
+
+ comment: To be published in Proceedings of the 20th International Conference + on Informatics in Control, Automation and Robotics (ICINCO) +
+
+
+
+
+ + ☆ Inline Photometrically Calibrated Hybrid Visual SLAM + + +
+ This paper presents an integrated approach to Visual SLAM, merging online +sequential photometric calibration within a Hybrid direct-indirect visual SLAM +(H-SLAM). Photometric calibration helps normalize pixel intensity values under +different lighting conditions, and thereby improves the direct component of our +H-SLAM. A tangential benefit also results to the indirect component of H-SLAM +given that the detected features are more stable across variable lighting +conditions. Our proposed photometrically calibrated H-SLAM is tested on several +datasets, including the TUM monoVO as well as on a dataset we created. +Calibrated H-SLAM outperforms other state of the art direct, indirect, and +hybrid Visual SLAM systems in all the experiments. Furthermore, in online SLAM +tested at our site, it also significantly outperformed the other SLAM Systems. + +
+
+
+
+
+ + ☆ Do We Need iPhone Moment or Xiaomi Moment for Robots? Design of + Affordable Home Robots for Health Monitoring + + +
+ In this paper, we study cost-effective home robot solutions which are +designed for home health monitoring. The recent advancements in Artificial +Intelligence (AI) have significantly advanced the capabilities of the robots, +enabling them to better and efficiently understand and interact with their +surroundings. The most common robots currently used in homes are toy robots and +cleaning robots. While these are relatively affordable, their functionalities +are very limited. On the other hand, humanoid and quadruped robots offer more +sophisticated features and capabilities, albeit at a much higher cost. Another +category is educational robots, which provide educators with the flexibility to +attach various sensors and integrate different design methods with the +integrated operating systems. However, the challenge still exists in bridging +the gap between affordability and functionality. Our research aims to address +this by exploring the potential of developing advanced yet affordable and +accessible robots for home robots, aiming for health monitoring, by using edge +computing techniques and taking advantage of existing computing resources for +home robots, such as mobile phones. + +
+
+
+
+
+ + ☆ Programming of Skill-based Robots + + +
+ Manufacturing is facing ever changing market demands, with faster innovation +cycles resulting to growing agility and flexibility requirements. Industry 4.0 +has been transforming the manufacturing world towards digital automation and +the importance of software has increased drastically. Easy and fast task +programming and execution in robot - sensor systems become a prerequisite for +agile and flexible automation and in this paper, we propose such a system. Our +solution relies on a robot skill library, which provides the user with high +level and parametrized operations, i.e., robot skills, for task programming and +execution. Programming actions results to a control recipe in a neutral product +context and is based on use of product CAD models or alternatively +collaborative use of pointers and tracking sensor with real parts. Practical +tests are also reported to show the feasibility of our approach. + +
+
+ comment: IEEE ICIEA 2024 +
+
+
+
+
+ + ☆ World Model-based Perception for Visual Legged Locomotion + + +
+ Legged locomotion over various terrains is challenging and requires precise +perception of the robot and its surroundings from both proprioception and +vision. However, learning directly from high-dimensional visual input is often +data-inefficient and intricate. To address this issue, traditional methods +attempt to learn a teacher policy with access to privileged information first +and then learn a student policy to imitate the teacher's behavior with visual +input. Despite some progress, this imitation framework prevents the student +policy from achieving optimal performance due to the information gap between +inputs. Furthermore, the learning process is unnatural since animals +intuitively learn to traverse different terrains based on their understanding +of the world without privileged knowledge. Inspired by this natural ability, we +propose a simple yet effective method, World Model-based Perception (WMP), +which builds a world model of the environment and learns a policy based on the +world model. We illustrate that though completely trained in simulation, the +world model can make accurate predictions of real-world trajectories, thus +providing informative signals for the policy controller. Extensive simulated +and real-world experiments demonstrate that WMP outperforms state-of-the-art +baselines in traversability and robustness. Videos and Code are available at: +https://wmp-loco.github.io/. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Dashing for the Golden Snitch: Multi-Drone Time-Optimal Motion Planning + with Multi-Agent Reinforcement Learning + + +
+ Recent innovations in autonomous drones have facilitated time-optimal flight +in single-drone configurations and enhanced maneuverability in multi-drone +systems through the application of optimal control and learning-based methods. +However, few studies have achieved time-optimal motion planning for multi-drone +systems, particularly during highly agile maneuvers or in dynamic scenarios. +This paper presents a decentralized policy network for time-optimal multi-drone +flight using multi-agent reinforcement learning. To strike a balance between +flight efficiency and collision avoidance, we introduce a soft collision +penalty inspired by optimization-based methods. By customizing PPO in a +centralized training, decentralized execution (CTDE) fashion, we unlock higher +efficiency and stability in training, while ensuring lightweight +implementation. Extensive simulations show that, despite slight performance +trade-offs compared to single-drone systems, our multi-drone approach maintains +near-time-optimal performance with low collision rates. Real-world experiments +validate our method, with two quadrotors using the same network as simulation +achieving a maximum speed of 13.65 m/s and a maximum body rate of 13.4 rad/s in +a 5.5 m * 5.5 m * 2.0 m space across various tracks, relying entirely on +onboard computation. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Vision-Language Model Fine-Tuning via Simple Parameter-Efficient + Modification EMNLP 2024 + + +
+ Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed +the success of prompt tuning and adapter tuning, while the classic model +fine-tuning on inherent parameters seems to be overlooked. It is believed that +fine-tuning the parameters of VLMs with few-shot samples corrupts the +pre-trained knowledge since fine-tuning the CLIP model even degrades +performance. In this paper, we revisit this viewpoint, and propose a new +perspective: fine-tuning the specific parameters instead of all will uncover +the power of classic model fine-tuning on VLMs. Through our meticulous study, +we propose ClipFit, a simple yet effective method to fine-tune CLIP without +introducing any overhead of extra parameters. We demonstrate that by only +fine-tuning the specific bias terms and normalization layers, ClipFit can +improve the performance of zero-shot CLIP by 7.27\% average harmonic mean +accuracy. Lastly, to understand how fine-tuning in CLIPFit affects the +pre-trained models, we conducted extensive experimental analyses w.r.t. changes +in internal parameters and representations. We found that low-level text bias +layers and the first layer normalization layer change much more than other +layers. The code is available at \url{https://github.com/minglllli/CLIPFit}. + +
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ☆ Online 6DoF Pose Estimation in Forests using Cross-View Factor Graph + Optimisation and Deep Learned Re-localisation ICRA2025 + + +
+ This paper presents a novel approach for robust global localisation and 6DoF +pose estimation of ground robots in forest environments by leveraging +cross-view factor graph optimisation and deep-learned re-localisation. The +proposed method addresses the challenges of aligning aerial and ground data for +pose estimation, which is crucial for accurate point-to-point navigation in +GPS-denied environments. By integrating information from both perspectives into +a factor graph framework, our approach effectively estimates the robot's global +position and orientation. We validate the performance of our method through +extensive experiments in diverse forest scenarios, demonstrating its +superiority over existing baselines in terms of accuracy and robustness in +these challenging environments. Experimental results show that our proposed +localisation system can achieve drift-free localisation with bounded +positioning errors, ensuring reliable and safe robot navigation under canopies. + +
+
+ comment: 7 pages, 4 figures, Submitted to ICRA2025 +
+
+
+
+
+ + ☆ Multirotor Nonlinear Model Predictive Control based on Visual Servoing + of Evolving Features + + +
+ This article presents a Visual Servoing Nonlinear Model Predictive Control +(NMPC) scheme for autonomously tracking a moving target using multirotor +Unmanned Aerial Vehicles (UAVs). The scheme is developed for surveillance and +tracking of contour-based areas with evolving features. NMPC is used to manage +input and state constraints, while additional barrier functions are +incorporated in order to ensure system safety and optimal performance. The +proposed control scheme is designed based on the extraction and implementation +of the full dynamic model of the features describing the target and the state +variables. Real-time simulations and experiments using a quadrotor UAV equipped +with a camera demonstrate the effectiveness of the proposed strategy. + +
+
+
+
+
+ + ☆ Achieving Stable High-Speed Locomotion for Humanoid Robots with Deep + Reinforcement Learning + + +
+ Humanoid robots offer significant versatility for performing a wide range of +tasks, yet their basic ability to walk and run, especially at high velocities, +remains a challenge. This letter presents a novel method that combines deep +reinforcement learning with kinodynamic priors to achieve stable locomotion +control (KSLC). KSLC promotes coordinated arm movements to counteract +destabilizing forces, enhancing overall stability. Compared to the baseline +method, KSLC provides more accurate tracking of commanded velocities and better +generalization in velocity control. In simulation tests, the KSLC-enabled +humanoid robot successfully tracked a target velocity of 3.5 m/s with reduced +fluctuations. Sim-to-sim validation in a high-fidelity environment further +confirmed its robust performance, highlighting its potential for real-world +applications. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Robo-Platform: A Robotic System for Recording Sensors and Controlling + Robots + + +
+ Mobile smartphones compactly provide sensors such as cameras, IMUs, GNSS +measurement units, and wireless and wired communication channels required for +robotics projects. They are affordable, portable, and programmable, which makes +them ideal for testing, data acquisition, controlling mobile robots, and many +other robotic applications. A robotic system is proposed in this paper, +consisting of an Android phone, a microcontroller board attached to the phone +via USB, and a remote wireless controller station. In the data acquisition +mode, the Android device can record a dataset of a diverse configuration of +multiple cameras, IMUs, GNSS units, and external USB ADC channels in the rawest +format used for, but not limited to, pose estimation and scene reconstruction +applications. In robot control mode, the Android phone, a microcontroller +board, and other peripherals constitute the mobile or stationary robotic +system. This system is controlled using a remote server connected over Wi-Fi or +Bluetooth. Experiments show that although the SLAM and AR applications can +utilize the acquired data, the proposed system can pave the way for more +advanced algorithms for processing these noisy and sporadic measurements. +Moreover, the characteristics of the communication media are studied, and two +example robotic projects, which involve controlling a toy car and a quadcopter, +are included. + +
+
+ comment: Project repository: https://github.com/m-dayani/robo-platform Youtube + Video: https://youtu.be/BTQ4yLB1bak Dataset: + https://drive.google.com/drive/folders/1OZqdA1xa-SyJ64qL_TibqhtwhR1fWWrx?usp=sharing +
+
+
+
+
+ + ☆ FLaRe: Achieving Masterful and Adaptive Robot Policies with Large-Scale + Reinforcement Learning Fine-Tuning + + +
+ In recent years, the Robotics field has initiated several efforts toward +building generalist robot policies through large-scale multi-task Behavior +Cloning. However, direct deployments of these policies have led to +unsatisfactory performance, where the policy struggles with unseen states and +tasks. How can we break through the performance plateau of these models and +elevate their capabilities to new heights? In this paper, we propose FLaRe, a +large-scale Reinforcement Learning fine-tuning framework that integrates robust +pre-trained representations, large-scale training, and gradient stabilization +techniques. Our method aligns pre-trained policies towards task completion, +achieving state-of-the-art (SoTA) performance both on previously demonstrated +and on entirely novel tasks and embodiments. Specifically, on a set of +long-horizon mobile manipulation tasks, FLaRe achieves an average success rate +of 79.5% in unseen environments, with absolute improvements of +23.6% in +simulation and +30.7% on real robots over prior SoTA methods. By utilizing only +sparse rewards, our approach can enable generalizing to new capabilities beyond +the pretraining data with minimal human effort. Moreover, we demonstrate rapid +adaptation to new embodiments and behaviors with less than a day of +fine-tuning. Videos can be found on the project website at +https://robot-flare.github.io/ + +
+
+
+
+
+ + ☆ Reactive Multi-Robot Navigation in Outdoor Environments Through + Uncertainty-Aware Active Learning of Human Preference Landscape + + +
+ Compared with single robots, Multi-Robot Systems (MRS) can perform missions +more efficiently due to the presence of multiple members with diverse +capabilities. However, deploying an MRS in wide real-world environments is +still challenging due to uncertain and various obstacles (e.g., building +clusters and trees). With a limited understanding of environmental uncertainty +on performance, an MRS cannot flexibly adjust its behaviors (e.g., teaming, +load sharing, trajectory planning) to ensure both environment adaptation and +task accomplishments. In this work, a novel joint preference landscape learning +and behavior adjusting framework (PLBA) is designed. PLBA efficiently +integrates real-time human guidance to MRS coordination and utilizes Sparse +Variational Gaussian Processes with Varying Output Noise to quickly assess +human preferences by leveraging spatial correlations between environment +characteristics. An optimization-based behavior-adjusting method then safely +adapts MRS behaviors to environments. To validate PLBA's effectiveness in MRS +behavior adaption, a flood disaster search and rescue task was designed. 20 +human users provided 1764 feedback based on human preferences obtained from MRS +behaviors related to "task quality", "task progress", "robot safety". The +prediction accuracy and adaptation speed results show the effectiveness of PLBA +in preference learning and MRS behavior adaption. + +
+
+
+
+
+ + ☆ Task-driven SLAM Benchmarking ICRA2025 + + +
+ For assistive robots, one critical use case of SLAM is to support +localization as they navigate through an environment completing tasks. Current +SLAM benchmarks do not consider task-based deployments where repeatability +(precision) is more critical than accuracy. To address this gap, we propose a +task-driven benchmarking framework for evaluating SLAM methods. The framework +accounts for SLAM's mapping capabilities, employs precision as a key metric, +and has low resource requirements to implement. Testing of state-of-the-art +SLAM methods in both simulated and real-world scenarios provides insights into +the performance properties of modern SLAM solutions. In particular, it shows +that passive stereo SLAM operates at a level of precision comparable to +LiDAR-based SLAM in typical indoor environments. The benchmarking approach +offers a more relevant and accurate assessment of SLAM performance in +task-driven applications. + +
+
+ comment: 7 pages, 7 figures, 1 table. Submitted to ICRA2025 +
+
+
+
+
+ + ☆ PANOS: Payload-Aware Navigation in Offroad Scenarios + + +
+ Nature has evolved humans to walk on different terrains by developing a +detailed understanding of their physical characteristics. Similarly, legged +robots need to develop their capability to walk on complex terrains with a +variety of task-dependent payloads to achieve their goals. However, +conventional terrain adaptation methods are susceptible to failure with varying +payloads. In this work, we introduce PANOS, a weakly supervised approach that +integrates proprioception and exteroception from onboard sensing to achieve a +stable gait while walking by a legged robot over various terrains. Our work +also provides evidence of its adaptability over varying payloads. We evaluate +our method on multiple terrains and payloads using a legged robot. PANOS +improves the stability up to 44% without any payload and 53% with 15 lbs +payload. We also notice a reduction in the vibration cost of 20% with the +payload for various terrain types when compared to state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Design, Integration, and Field Evaluation of a Robotic Blossom Thinning + System for Tree Fruit Crops + + +
+ The US apple industry relies heavily on semi-skilled manual labor force for +essential field operations such as training, pruning, blossom and green fruit +thinning, and harvesting. Blossom thinning is one of the crucial crop load +management practices to achieve desired crop load, fruit quality, and return +bloom. While several techniques such as chemical, and mechanical thinning are +available for large-scale blossom thinning such approaches often yield +unpredictable thinning results and may cause damage the canopy, spurs, and leaf +tissue. Hence, growers still depend on laborious, labor intensive and expensive +manual hand blossom thinning for desired thinning outcomes. This research +presents a robotic solution for blossom thinning in apple orchards using a +computer vision system with artificial intelligence, a six degrees of freedom +robotic manipulator, and an electrically actuated miniature end-effector for +robotic blossom thinning. The integrated robotic system was evaluated in a +commercial apple orchard which showed promising results for targeted and +selective blossom thinning. Two thinning approaches, center and boundary +thinning, were investigated to evaluate the system ability to remove varying +proportion of flowers from apple flower clusters. During boundary thinning the +end effector was actuated around the cluster boundary while center thinning +involved end-effector actuation only at the cluster centroid for a fixed +duration of 2 seconds. The boundary thinning approach thinned 67.2% of flowers +from the targeted clusters with a cycle time of 9.0 seconds per cluster, +whereas center thinning approach thinned 59.4% of flowers with a cycle time of +7.2 seconds per cluster. When commercially adopted, the proposed system could +help address problems faced by apple growers with current hand, chemical, and +mechanical blossom thinning approaches. + +
+
+ comment: Accepted for publication in the Journal of Field Robotics +
+
+
+
+
+ + ♻ ☆ MHRC: Closed-loop Decentralized Multi-Heterogeneous Robot Collaboration + with Large Language Models + + +
+ The integration of large language models (LLMs) with robotics has +significantly advanced robots' abilities in perception, cognition, and task +planning. The use of natural language interfaces offers a unified approach for +expressing the capability differences of heterogeneous robots, facilitating +communication between them, and enabling seamless task allocation and +collaboration. Currently, the utilization of LLMs to achieve decentralized +multi-heterogeneous robot collaborative tasks remains an under-explored area of +research. In this paper, we introduce a novel framework that utilizes LLMs to +achieve decentralized collaboration among multiple heterogeneous robots. Our +framework supports three robot categories, mobile robots, manipulation robots, +and mobile manipulation robots, working together to complete tasks such as +exploration, transportation, and organization. We developed a rich set of +textual feedback mechanisms and chain-of-thought (CoT) prompts to enhance task +planning efficiency and overall system performance. The mobile manipulation +robot can adjust its base position flexibly, ensuring optimal conditions for +grasping tasks. The manipulation robot can comprehend task requirements, seek +assistance when necessary, and handle objects appropriately. Meanwhile, the +mobile robot can explore the environment extensively, map object locations, and +communicate this information to the mobile manipulation robot, thus improving +task execution efficiency. We evaluated the framework using PyBullet, creating +scenarios with three different room layouts and three distinct operational +tasks. We tested various LLM models and conducted ablation studies to assess +the contributions of different modules. The experimental results confirm the +effectiveness and necessity of our proposed framework. + +
+
+
+
+
+ + ♻ Learning to Walk and Fly with Adversarial Motion Priors IROS + + +
+ Robot multimodal locomotion encompasses the ability to transition between +walking and flying, representing a significant challenge in robotics. This work +presents an approach that enables automatic smooth transitions between legged +and aerial locomotion. Leveraging the concept of Adversarial Motion Priors, our +method allows the robot to imitate motion datasets and accomplish the desired +task without the need for complex reward functions. The robot learns walking +patterns from human-like gaits and aerial locomotion patterns from motions +obtained using trajectory optimization. Through this process, the robot adapts +the locomotion scheme based on environmental feedback using reinforcement +learning, with the spontaneous emergence of mode-switching behavior. The +results highlight the potential for achieving multimodal locomotion in aerial +humanoid robotics through automatic control of walking and flying modes, paving +the way for applications in diverse domains such as search and rescue, +surveillance, and exploration missions. This research contributes to advancing +the capabilities of aerial humanoid robots in terms of versatile locomotion in +various environments. + +
+
+ comment: This paper has been accepted for publication at the IEEE/RSJ + International Conference on Intelligent Robots and Systems (IROS), Abu Dhabi, + 2024 +
+
+
+
+
+ + ♻ ☆ DroneWiS: Automated Simulation Testing of small Unmanned Aerial Systems + in Realistic Windy Conditions + + +
+ The continuous evolution of small Unmanned Aerial Systems (sUAS) demands +advanced testing methodologies to ensure their safe and reliable operations in +the real-world. To push the boundaries of sUAS simulation testing in realistic +environments, we previously developed the DroneReqValidator (DRV) platform, +allowing developers to automatically conduct simulation testing in digital twin +of earth. In this paper, we present DRV 2.0, which introduces a novel component +called DroneWiS (Drone Wind Simulation). DroneWiS allows sUAS developers to +automatically simulate realistic windy conditions and test the resilience of +sUAS against wind. Unlike current state-of-the-art simulation tools such as +Gazebo and AirSim that only simulate basic wind conditions, DroneWiS leverages +Computational Fluid Dynamics (CFD) to compute the unique wind flows caused by +the interaction of wind with the objects in the environment such as buildings +and uneven terrains. This simulation capability provides deeper insights to +developers about the navigation capability of sUAS in challenging and realistic +windy conditions. DroneWiS equips sUAS developers with a powerful tool to test, +debug, and improve the reliability and safety of sUAS in real-world. A working +demonstration is available at https://youtu.be/khBHEBST8Wc + +
+
+
+
+
+ + ♻ ☆ RAP: Retrieval-Augmented Planner for Adaptive Procedure Planning in + Instructional Videos ECCV 2024 + + +
+ Procedure Planning in instructional videos entails generating a sequence of +action steps based on visual observations of the initial and target states. +Despite the rapid progress in this task, there remain several critical +challenges to be solved: (1) Adaptive procedures: Prior works hold an +unrealistic assumption that the number of action steps is known and fixed, +leading to non-generalizable models in real-world scenarios where the sequence +length varies. (2) Temporal relation: Understanding the step temporal relation +knowledge is essential in producing reasonable and executable plans. (3) +Annotation cost: Annotating instructional videos with step-level labels (i.e., +timestamp) or sequence-level labels (i.e., action category) is demanding and +labor-intensive, limiting its generalizability to large-scale datasets. In this +work, we propose a new and practical setting, called adaptive procedure +planning in instructional videos, where the procedure length is not fixed or +pre-determined. To address these challenges, we introduce Retrieval-Augmented +Planner (RAP) model. Specifically, for adaptive procedures, RAP adaptively +determines the conclusion of actions using an auto-regressive model +architecture. For temporal relation, RAP establishes an external memory module +to explicitly retrieve the most relevant state-action pairs from the training +videos and revises the generated procedures. To tackle high annotation cost, +RAP utilizes a weakly-supervised learning manner to expand the training dataset +to other task-relevant, unannotated videos by generating pseudo labels for +action steps. Experiments on CrossTask and COIN benchmarks show the superiority +of RAP over traditional fixed-length models, establishing it as a strong +baseline solution for adaptive procedure planning. + +
+
+ comment: Accepted in ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Multi-UAV Pursuit-Evasion with Online Planning in Unknown Environments + by Deep Reinforcement Learning + + +
+ Multi-UAV pursuit-evasion, where pursuers aim to capture evaders, poses a key +challenge for UAV swarm intelligence. Multi-agent reinforcement learning (MARL) +has demonstrated potential in modeling cooperative behaviors, but most RL-based +approaches remain constrained to simplified simulations with limited dynamics +or fixed scenarios. Previous attempts to deploy RL policy to real-world +pursuit-evasion are largely restricted to two-dimensional scenarios, such as +ground vehicles or UAVs at fixed altitudes. In this paper, we address multi-UAV +pursuit-evasion by considering UAV dynamics and physical constraints. We +introduce an evader prediction-enhanced network to tackle partial observability +in cooperative strategy learning. Additionally, we propose an adaptive +environment generator within MARL training, enabling higher exploration +efficiency and better policy generalization across diverse scenarios. +Simulations show our method significantly outperforms all baselines in +challenging scenarios, generalizing to unseen scenarios with a 100% capture +rate. Finally, we derive a feasible policy via a two-stage reward refinement +and deploy the policy on real quadrotors in a zero-shot manner. To our +knowledge, this is the first work to derive and deploy an RL-based policy using +collective thrust and body rates control commands for multi-UAV pursuit-evasion +in unknown environments. The open-source code and videos are available at +https://sites.google.com/view/pursuit-evasion-rl. + +
+
+
+
+
+ + ♻ ☆ Event-Free Moving Object Segmentation from Moving Ego Vehicle + + +
+ Moving object segmentation (MOS) in dynamic scenes is an important, +challenging, but under-explored research topic for autonomous driving, +especially for sequences obtained from moving ego vehicles. Most segmentation +methods leverage motion cues obtained from optical flow maps. However, since +these methods are often based on optical flows that are pre-computed from +successive RGB frames, this neglects the temporal consideration of events +occurring within the inter-frame, consequently constraining its ability to +discern objects exhibiting relative staticity but genuinely in motion. To +address these limitations, we propose to exploit event cameras for better video +understanding, which provide rich motion cues without relying on optical flow. +To foster research in this area, we first introduce a novel large-scale dataset +called DSEC-MOS for moving object segmentation from moving ego vehicles, which +is the first of its kind. For benchmarking, we select various mainstream +methods and rigorously evaluate them on our dataset. Subsequently, we devise +EmoFormer, a novel network able to exploit the event data. For this purpose, we +fuse the event temporal prior with spatial semantic maps to distinguish +genuinely moving objects from the static background, adding another level of +dense supervision around our object of interest. Our proposed network relies +only on event data for training but does not require event input during +inference, making it directly comparable to frame-only methods in terms of +efficiency and more widely usable in many application cases. The exhaustive +comparison highlights a significant performance improvement of our method over +all other methods. The source code and dataset are publicly available at: +https://github.com/ZZY-Zhou/DSEC-MOS. + +
+
+
+
+
+ + ♻ ☆ Mamba as a motion encoder for robotic imitation learning + + +
+ Recent advancements in imitation learning, particularly with the integration +of LLM techniques, are set to significantly improve robots' dexterity and +adaptability. This paper proposes using Mamba, a state-of-the-art architecture +with potential applications in LLMs, for robotic imitation learning, +highlighting its ability to function as an encoder that effectively captures +contextual information. By reducing the dimensionality of the state space, +Mamba operates similarly to an autoencoder. It effectively compresses the +sequential information into state variables while preserving the essential +temporal dynamics necessary for accurate motion prediction. Experimental +results in tasks such as cup placing and case loading demonstrate that despite +exhibiting higher estimation errors, Mamba achieves superior success rates +compared to Transformers in practical task execution. This performance is +attributed to Mamba's structure, which encompasses the state space model. +Additionally, the study investigates Mamba's capacity to serve as a real-time +motion generator with a limited amount of training data. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ An explicit construction of Kaleidocycles by elliptic theta functions + + +
+ We consider the configuration space of points on the two-dimensional sphere +that satisfy a specific system of quadratic equations. We construct periodic +orbits in this configuration space using elliptic theta functions and show that +they satisfy semi-discrete analogues of mKdV and sine-Gordon equations. The +configuration space we investigate corresponds to the state space of a linkage +mechanism known as the Kaleidocycle, and the constructed orbits describe the +characteristic motion of the Kaleidocycle. Our approach is founded on the +relationship between the deformation of spatial curves and integrable systems, +offering an intriguing example where an integrable system generates an orbit in +the space of real solutions to polynomial equations defined by geometric +constraints. + +
+
+
+
+
+ + ♻ ☆ Efficient Motion Prediction: A Lightweight & Accurate Trajectory + Prediction Model With Fast Training and Inference Speed IROS 2024 + + +
+ For efficient and safe autonomous driving, it is essential that autonomous +vehicles can predict the motion of other traffic agents. While highly accurate, +current motion prediction models often impose significant challenges in terms +of training resource requirements and deployment on embedded hardware. We +propose a new efficient motion prediction model, which achieves highly +competitive benchmark results while training only a few hours on a single GPU. +Due to our lightweight architectural choices and the focus on reducing the +required training resources, our model can easily be applied to custom +datasets. Furthermore, its low inference latency makes it particularly suitable +for deployment in autonomous applications with limited computing resources. + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ Cosserat Rods for Modeling Tendon-Driven Robotic Catheter Systems + + +
+ Tendon-driven robotic catheters are capable of precise execution of minimally +invasive cardiac procedures including ablations and imaging. These procedures +require accurate mathematical models of not only the catheter and tendons but +also their interactions with surrounding tissue and vasculature in order to +control the robot path and interaction. This paper presents a mechanical model +of a tendon-driven robotic catheter system based on Cosserat rods and +integrated with a stable, implicit Euler scheme. We implement the Cosserat rod +as a model for a simple catheter centerline and validate its physical accuracy +against a large deformation analytical model and experimental data. The +catheter model is then supplemented by adding a second Cosserat rod to model a +single tendon, using penalty forces to define the constraints of the +tendon-catheter system. All the model parameters are defined by the catheter +properties established by the design. The combined model is validated against +experimental data to confirm its physical accuracy. This model represents a new +contribution to the field of robotic catheter modeling in which both the +tendons and catheter are modeled by mechanical Cosserat rods and +fully-validated against experimental data in the case of the single rod system. + +
+
+ comment: 24 pages, 23 figures +
+
+
+
+
+ + ♻ ☆ TempFuser: Learning Agile, Tactical, and Acrobatic Flight Maneuvers + Using a Long Short-Term Temporal Fusion Transformer + + +
+ Dogfighting is a challenging scenario in aerial applications that requires a +comprehensive understanding of both strategic maneuvers and the aerodynamics of +agile aircraft. The aerial agent needs to not only understand tactically +evolving maneuvers of fighter jets from a long-term perspective but also react +to rapidly changing aerodynamics of aircraft from a short-term viewpoint. In +this paper, we introduce TempFuser, a novel long short-term temporal fusion +transformer architecture that can learn agile, tactical, and acrobatic flight +maneuvers in complex dogfight problems. Our approach integrates two distinct +temporal transition embeddings into a transformer-based network to +comprehensively capture both the long-term tactics and short-term agility of +aerial agents. By incorporating these perspectives, our policy network +generates end-to-end flight commands that secure dominant positions over the +long term and effectively outmaneuver agile opponents. After training in a +high-fidelity flight simulator, our model successfully learns to execute +strategic maneuvers, outperforming baseline policy models against various types +of opponent aircraft. Notably, our model exhibits human-like acrobatic +maneuvers even when facing adversaries with superior specifications, all +without relying on prior knowledge. Moreover, it demonstrates robust pursuit +performance in challenging supersonic and low-altitude situations. Demo videos +are available at https://sites.google.com/view/tempfuser. + +
+
+ comment: 8 pages, 7 figures. Accepted for publication in IEEE Robotics and + Automation Letters (RA-L). Copyright 2024 IEEE. Personal use is permitted. + For other uses, permission from IEEE is required +
+
+
+
+
+ + ♻ ☆ COHERENT: Collaboration of Heterogeneous Multi-Robot System with Large + Language Models ICRA + + +
+ Leveraging the powerful reasoning capabilities of large language models +(LLMs), recent LLM-based robot task planning methods yield promising results. +However, they mainly focus on single or multiple homogeneous robots on simple +tasks. Practically, complex long-horizon tasks always require collaborations +among multiple heterogeneous robots especially with more complex action spaces, +which makes these tasks more challenging. To this end, we propose COHERENT, a +novel LLM-based task planning framework for collaboration of heterogeneous +multi-robot systems including quadrotors, robotic dogs, and robotic arms. +Specifically, a Proposal-Execution-Feedback-Adjustment (PEFA) mechanism is +designed to decompose and assign actions for individual robots, where a +centralized task assigner makes a task planning proposal to decompose the +complex task into subtasks, and then assigns subtasks to robot executors. Each +robot executor selects a feasible action to implement the assigned subtask and +reports self-reflection feedback to the task assigner for plan adjustment. The +PEFA loops until the task is completed. Moreover, we create a challenging +heterogeneous multi-robot task planning benchmark encompassing 100 complex +long-horizon tasks. The experimental results show that our work surpasses the +previous methods by a large margin in terms of success rate and execution +efficiency. The experimental videos, code, and benchmark are released at +https://github.com/MrKeee/COHERENT. + +
+
+ comment: 7 pages, 5 figures. Submitted to IEEE International Conference on + Robotics and Automation (ICRA), 2025 +
+
+
+
+
+ + ♻ ☆ ManiFoundation Model for General-Purpose Robotic Manipulation of Contact + Synthesis with Arbitrary Objects and Robots + + +
+ To substantially enhance robot intelligence, there is a pressing need to +develop a large model that enables general-purpose robots to proficiently +undertake a broad spectrum of manipulation tasks, akin to the versatile +task-planning ability exhibited by LLMs. The vast diversity in objects, robots, +and manipulation tasks presents huge challenges. Our work introduces a +comprehensive framework to develop a foundation model for general robotic +manipulation that formalizes a manipulation task as contact synthesis. +Specifically, our model takes as input object and robot manipulator point +clouds, object physical attributes, target motions, and manipulation region +masks. It outputs contact points on the object and associated contact forces or +post-contact motions for robots to achieve the desired manipulation task. We +perform extensive experiments both in the simulation and real-world settings, +manipulating articulated rigid objects, rigid objects, and deformable objects +that vary in dimensionality, ranging from one-dimensional objects like ropes to +two-dimensional objects like cloth and extending to three-dimensional objects +such as plasticine. Our model achieves average success rates of around 90\%. +Supplementary materials and videos are available on our project website at +https://manifoundationmodel.github.io/. + +
+
+
+
+
+ + ♻ ☆ EF-Calib: Spatiotemporal Calibration of Event- and Frame-Based Cameras + Using Continuous-Time Trajectories + + +
+ Event camera, a bio-inspired asynchronous triggered camera, offers promising +prospects for fusion with frame-based cameras owing to its low latency and high +dynamic range. However, calibrating stereo vision systems that incorporate both +event and frame-based cameras remains a significant challenge. In this letter, +we present EF-Calib, a spatiotemporal calibration framework for event- and +frame-based cameras using continuous-time trajectories. A novel calibration +pattern applicable to both camera types and the corresponding event recognition +algorithm is proposed. Leveraging the asynchronous nature of events, a +derivable piece-wise B-spline to represent camera pose continuously is +introduced, enabling calibration for intrinsic parameters, extrinsic +parameters, and time offset, with analytical Jacobians provided. Various +experiments are carried out to evaluate the calibration performance of +EF-Calib, including calibration experiments for intrinsic parameters, extrinsic +parameters, and time offset. Experimental results show that EF-Calib achieves +the most accurate intrinsic parameters compared to current SOTA, the close +accuracy of the extrinsic parameters compared to the frame-based results, and +accurate time offset estimation. EF-Calib provides a convenient and accurate +toolbox for calibrating the system that fuses events and frames. The code of +this paper will also be open-sourced at: https://github.com/wsakobe/EF-Calib. + +
+
+ comment: Accepted by IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ Precision Aquaculture: An Integrated Computer Vision and IoT Approach + for Optimized Tilapia Feeding + + +
+ Traditional fish farming practices often lead to inefficient feeding, +resulting in environmental issues and reduced productivity. We developed an +innovative system combining computer vision and IoT technologies for precise +Tilapia feeding. Our solution uses real-time IoT sensors to monitor water +quality parameters and computer vision algorithms to analyze fish size and +count, determining optimal feed amounts. A mobile app enables remote monitoring +and control. We utilized YOLOv8 for keypoint detection to measure Tilapia +weight from length, achieving \textbf{94\%} precision on 3,500 annotated +images. Pixel-based measurements were converted to centimeters using depth +estimation for accurate feeding calculations. Our method, with data collection +mirroring inference conditions, significantly improved results. Preliminary +estimates suggest this approach could increase production up to 58 times +compared to traditional farms. Our models, code, and dataset are +open-source~\footnote{The code, dataset, and models are available upon +reasonable request. + +
+
+ comment: 8 pages, 6 figures, 3 tables, 21th International Conference on + Informatics in Control, Automation, and Robotics +
+
+
+
+
+ + ♻ ☆ D3RoMa: Disparity Diffusion-based Depth Sensing for Material-Agnostic + Robotic Manipulation + + +
+ Depth sensing is an important problem for 3D vision-based robotics. Yet, a +real-world active stereo or ToF depth camera often produces noisy and +incomplete depth which bottlenecks robot performances. In this work, we propose +D3RoMa, a learning-based depth estimation framework on stereo image pairs that +predicts clean and accurate depth in diverse indoor scenes, even in the most +challenging scenarios with translucent or specular surfaces where classical +depth sensing completely fails. Key to our method is that we unify depth +estimation and restoration into an image-to-image translation problem by +predicting the disparity map with a denoising diffusion probabilistic model. At +inference time, we further incorporated a left-right consistency constraint as +classifier guidance to the diffusion process. Our framework combines recently +advanced learning-based approaches and geometric constraints from traditional +stereo vision. For model training, we create a large scene-level synthetic +dataset with diverse transparent and specular objects to compensate for +existing tabletop datasets. The trained model can be directly applied to +real-world in-the-wild scenes and achieve state-of-the-art performance in +multiple public depth estimation benchmarks. Further experiments in real +environments show that accurate depth prediction significantly improves robotic +manipulation in various scenarios. + +
+
+
+
+
+
+
+
+ + Systems and Control 37 + +
+
+
+ + ☆ On the Interplay of Clustering and Evolution in the Emergence of + Epidemic Outbreaks + + +
+ In an increasingly interconnected world, a key scientific challenge is to +examine mechanisms that lead to the widespread propagation of contagions, such +as misinformation and pathogens, and identify risk factors that can trigger +large-scale outbreaks. Underlying both the spread of disease and misinformation +epidemics is the evolution of the contagion as it propagates, leading to the +emergence of different strains, e.g., through genetic mutations in pathogens +and alterations in the information content. Recent studies have revealed that +models that do not account for heterogeneity in transmission risks associated +with different strains of the circulating contagion can lead to inaccurate +predictions. However, existing results on multi-strain spreading assume that +the network has a vanishingly small clustering coefficient, whereas clustering +is widely known to be a fundamental property of real-world social networks. In +this work, we investigate spreading processes that entail evolutionary +adaptations on random graphs with tunable clustering and arbitrary degree +distributions. We derive a mathematical framework to quantify the epidemic +characteristics of a contagion that evolves as it spreads, with the structure +of the underlying network as given via arbitrary {\em joint} degree +distributions of single-edges and triangles. To the best of our knowledge, our +work is the first to jointly analyze the impact of clustering and evolution on +the emergence of epidemic outbreaks. We supplement our theoretical finding with +numerical simulations and case studies, shedding light on the impact of +clustering on contagion spread. + +
+
+
+
+
+ + ☆ Learning with Dynamics: Autonomous Regulation of UAV Based Communication + Networks with Dynamic UAV Crew + + +
+ Unmanned Aerial Vehicle (UAV) based communication networks (UCNs) are a key +component in future mobile networking. To handle the dynamic environments in +UCNs, reinforcement learning (RL) has been a promising solution attributed to +its strong capability of adaptive decision-making free of the environment +models. However, most existing RL-based research focus on control strategy +design assuming a fixed set of UAVs. Few works have investigated how UCNs +should be adaptively regulated when the serving UAVs change dynamically. This +article discusses RL-based strategy design for adaptive UCN regulation given a +dynamic UAV set, addressing both reactive strategies in general UCNs and +proactive strategies in solar-powered UCNs. An overview of the UCN and the RL +framework is first provided. Potential research directions with key challenges +and possible solutions are then elaborated. Some of our recent works are +presented as case studies to inspire innovative ways to handle dynamic UAV crew +with different RL algorithms. + +
+
+ comment: 7 pages, 6 figures, magazine paper +
+
+
+
+
+ + ☆ Complex-Phase, Data-Driven Identification of Grid-Forming Inverter + Dynamics + + +
+ The increasing integration of renewable energy sources (RESs) into power +systems requires the deployment of grid-forming inverters to ensure a stable +operation. Accurate modeling of these devices is necessary. In this paper, a +system identification approach to obtain low-dimensional models of grid-forming +inverters is presented. The proposed approach is based on a Hammerstein-Wiener +parametrization of the normal-form model. The normal-form is a gray-box model +that utilizes complex frequency and phase to capture non-linear inverter +dynamics. The model is validated on two well-known control strategies: +droop-control and dispatchable virtual oscillators. Simulations and +hardware-in-the-loop experiments demonstrate that the normal-form accurately +models inverter dynamics across various operating conditions. The approach +shows great potential for enhancing the modeling of RES-dominated power +systems, especially when component models are unavailable or computationally +expensive. + +
+
+
+
+
+ + ☆ Towards human-like kinematics in industrial robotic arms: a case study + on a UR3 robot + + +
+ Safety in industrial robotic environments is a hot research topic in the area +of human-robot interaction (HRI). Up to now, a robotic arm on an assembly line +interacts with other machines away from human workers. Nowadays, robotic arm +manufactures are aimed to their robots could increasingly perform tasks +collaborating with humans. One of the ways to improve this collaboration is by +making the movement of robots more humanlike. This way, it would be easier for +a human to foresee the movement of the robot and approach it without fear of +contact. The main difference between the movement of a human and of a robotic +arm is that the former has a bell-shaped speed profile while the latter has a +uniform speed one. To generate this speed profile, the kinematic theory of +rapid human movements and its Sigma-Lognormal model has been used. This model +is widely used to explain most of the basic phenomena related to the control of +human movements. Both human-like and robotic-like movements are transferred to +the UR3 robot. In this paper we detail the how the UR3 robot was programmed to +produce both kinds of movement. The dissimilarities result between the input +motion and output motion to the robot confirm the possibility to develop +human-like velocities in the UR3 robot. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Generic Diagonalizability, Structural Functional Observability and + Output Controllability + + +
+ This paper investigates the structural functional observability (SFO) and +structural output controllability (SOC) of a class of systems with generically +diagonalizable state matrices and explores the associated minimal sensor and +actuator placement problems. The verification of SOC and the corresponding +sensor and actuator placement problems, i.e., the problems of determining the +minimum number of outputs and inputs required to achieve SFO and SOC, +respectively, are yet open for general systems, which motivates our focus on a +class of systems enabling polynomial-time solutions. In this line, we first +define and characterize generically diagonalizable systems, referring to +structured systems for which almost all realizations of the state matrices are +diagonalizable. We then develop computationally efficient criteria for SFO and +SOC within the context of generically diagonalizable systems. Our work expands +the class of systems amenable to polynomial-time SOC verification. Thanks to +the simplicity of the obtained criteria, we derive closed-form solutions for +determining the minimal sensor placement to achieve SFO and the minimal +actuator deployment to achieve SOC in such systems, along with efficient +weighted maximum matching based and weighted maximum flow based algorithms. For +more general systems to achieve SFO, an upper bound is given by identifying a +non-decreasing property of SFO with respect to a specific class of edge +additions, which is shown to be optimal under certain circumstances. + +
+
+ comment: Under review in a Journal +
+
+
+
+
+ + ☆ Energy efficiency analysis as a function of the working voltages in + supercapacitors + + +
+ Supercapacitors are increasingly used as energy storage elements. Unlike +batteries, their state of charge has a considerable influence on their voltage +in normal operation, allowing them to work from zero to their maximum voltage. +In this work, a theoretical and practical analysis is proposed of the energy +efficiency of these devices according to their working voltages. To this end, +several supercapacitors were subjected to charge and discharge cycles until the +measurements of current and voltage stabilized. At this point their energy +efficiency was calculated. These charge-discharge cycles were carried out: i) +without rest between charging and discharging; and ii) with a rest of several +minutes between the two stages. Using the information obtained from the tests, +the energy efficiency is shown plotted against the minimum and maximum working +voltages. By consulting the data and the graphs, the ideal working voltages to +optimize the energy efficiency of these devices can be obtained. + +
+
+ comment: 18 pages, 10 figures +
+
+
+
+
+ + ☆ A Novel MOSFET based Single Event Latchup Detection, Current Limiting & + Self Power Cycling circuit for Spacecraft systems + + +
+ Single Event Latch-up (SEL) is one of the prime concerns for CMOS ICs used in +space systems. Galactic Cosmic Rays or Solar Energetic Particles (SEP) may +trigger the parasitic latch up circuit in CMOS ICs and cause increase in +current beyond the safe limits thereby presenting a threat of permanent failure +of the IC. Mitigation of the SEL is always a challenging task. The conventional +mitigation approaches inherently introduce some response time which presents an +uncertainty because during this response time the current may exceed the safe +current limits. This paper presents a novel circuit based on MOSFETs which +provides end-to-end complete solution of detecting SEL, limiting the current +below the set threshold and executing power cycling to restore the normal +functioning of the CMOS IC. The proposed circuit has been simulated in MULTISIM +and the simulation results match very well with the expected behavior of +(i)current limiting and (ii) the total time duration taken in power cycling to +bring the SEL sensitive device back to its normal operational state. This +circuit can be harnessed by spacecraft system designers to overcome the +catastrophic threat of SEL posed by space radiation environment. + +
+
+
+
+
+ + ☆ The Power-Oriented Graphs Modeling Technique: From the Fundamental + Principles to the Systematic, Step-by-Step Modeling of Complex Physical + Systems + + +
+ Modeling physical systems is an essential skill for a control engineer, since +it enables to achieve a deep understanding of their dynamic behavior and, +consequently, the development of effective control strategies. The first part +of this article provides a tutorial description of the fundamental principles +and properties of the Power-Oriented Graphs (POG) modeling technique. Various +case studies in different energetic domains are then presented to consolidate +the fundamental principles, each highlighting different features of the POG +modeling technique. The latter is then compared with the other two main +graphical modeling techniques available in the literature, namely Bond Graph +(BG) and Energetic Macroscopic Representation (EMR). The second part of this +article assumes once again a tutorial nature, in order to introduce the new +Fast Modeling POG (FMPOG) procedure. The FMPOG, which operates in the POG +framework, is a methodical step-by-step procedure that enables the readers to +quickly derive the power-oriented graphical model of physical systems starting +from their schematics. From the power-oriented graphical model, the state-space +model can then be directly determined. To ensure the FMPOG procedure is easily +usable by the entire community, we apply it to three examples in different +energetic domains in this article, guiding the reader step-by-step through the +derivation of the physical systems models. + +
+
+
+
+
+ + ☆ Feedforward Controllers from Learned Dynamic Local Model Networks with + Application to Excavator Assistance Functions + + +
+ Complicated first principles modelling and controller synthesis can be +prohibitively slow and expensive for high-mix, low-volume products such as +hydraulic excavators. Instead, in a data-driven approach, recorded trajectories +from the real system can be used to train local model networks (LMNs), for +which feedforward controllers are derived via feedback linearization. However, +previous works required LMNs without zero dynamics for feedback linearization, +which restricts the model structure and thus modelling capacity of LMNs. In +this paper, we overcome this restriction by providing a criterion for when +feedback linearization of LMNs with zero dynamics yields a valid controller. As +a criterion we propose the bounded-input bounded-output stability of the +resulting controller. In two additional contributions, we extend this approach +to consider measured disturbance signals and multiple inputs and outputs. We +illustrate the effectiveness of our contributions in a hydraulic excavator +control application with hardware experiments. To this end, we train LMNs from +recorded, noisy data and derive feedforward controllers used as part of a +leveling assistance system on the excavator. In our experiments, incorporating +disturbance signals and multiple inputs and outputs enhances tracking +performance of the learned controller. A video of our experiments is available +at https://youtu.be/lrrWBx2ASaE. + +
+
+
+
+
+ + ☆ Measurements and System Identification for the Characterization of + Smooth Muscle Cell Dynamics + + +
+ Biological tissue integrity is actively maintained by cells. It is essential +to comprehend how cells accomplish this in order to stage tissue diseases. +However, addressing the complexity of a cell's system of interrelated +mechanisms poses a challenge. This necessitates a well-structured +identification framework and an effective integration of measurements. Here we +introduce the use of state-of-the-art frequency-domain system identification +techniques combined with an indentation measurement platform to analyze the +underlying mechanisms from the perspective of control system theory. The +ultimate goal is to explore how mechanical and biological factors are related +in induced Pluripotent Stem Cell-derived vascular smooth muscle cells. We study +on the frequency-domain analysis for the investigation and characterization of +cellular dynamics of smooth muscle cells from the measured data. The +measurement model in this study exploits the availability of human tissue and +samples, enabling fundamental investigations of vascular tissue disease. This +approach using human cell lines holds significant potential to decrease the +necessity for animal-based safety and efficacy studies. The focus of this +review is to investigate the cellular dynamics underlying the myogenic response +and to demonstrate the practicability of employing a nano-indentation +measurement setup for the broadband frequency-domain characterization of +induced Pluripotent Stem Cell-derived vascular smooth muscle cells. + +
+
+ comment: 6 pages, 9 figures, presented in the Medical Measurements and + Applications - MeMeA2024 conference +
+
+
+
+
+ + ☆ Performance Boundary Analyses for Statistical Multi-QoS Framework Over + 6G SAGINs + + +
+ To enable the cost-effective universal access and the enhancement of current +communication services, the space-air-ground integrated networks (SAGINs) have +recently been developed due to its exceptional 3D coverage and the ability to +guarantee rigorous and multidimensional demands for quality-of-service (QoS) +provisioning, including delay and reliability across vast distances. In +response to the complex, heterogeneous, and dynamic serving scenarios and +stringent performance expectations for 6G SAGINs, it is crucial to undertake +modeling, assurance, and analysis of the key technologies, aligned with the +diverse demands for QoS provisioning in the non-asymptotic regime, i.e., when +implementing finite blocklength coding (FBC) as a new dimension for error-rate +bounded QoS metric. However, how to design new statistical QoS-driven +performance modeling approaches that accurately delineate the complex and +dynamic behaviors of networks, particularly in terms of constraining both delay +and error rate, persists as a significant challenge for implementing mURLLC +within 6G SAGINs in the finite blocklength regime. To overcome these +difficulties, in this paper we propose to develop a set of analytical modeling +frameworks for 6G SAGIN in supporting statistical delay and error-rate bounded +QoS in the finite blocklength regime. First we establish the SAGIN system +architecture model. Second, the aggregate interference and decoding error +probability functions are modeled and examined through using Laplace transform. +Third, we introduce modeling techniques aimed at defining +the$\epsilon$-effective capacity function as a crucial metric for facilitating +statistical QoS standards with respect to delay and error-rate. To validate the +effectiveness of the developed performance modeling schemes, we have executed a +series of simulations over SAGINs. + +
+
+
+
+
+ + ☆ Inline Photometrically Calibrated Hybrid Visual SLAM + + +
+ This paper presents an integrated approach to Visual SLAM, merging online +sequential photometric calibration within a Hybrid direct-indirect visual SLAM +(H-SLAM). Photometric calibration helps normalize pixel intensity values under +different lighting conditions, and thereby improves the direct component of our +H-SLAM. A tangential benefit also results to the indirect component of H-SLAM +given that the detected features are more stable across variable lighting +conditions. Our proposed photometrically calibrated H-SLAM is tested on several +datasets, including the TUM monoVO as well as on a dataset we created. +Calibrated H-SLAM outperforms other state of the art direct, indirect, and +hybrid Visual SLAM systems in all the experiments. Furthermore, in online SLAM +tested at our site, it also significantly outperformed the other SLAM Systems. + +
+
+
+
+
+ + ☆ Distributed Robust Optimization Method for AC/MTDC Hybrid Power Systems + with DC Network Cognizance + + +
+ AC/multi-terminal DC (MTDC) hybrid power systems have emerged as a solution +for the large-scale and longdistance accommodation of power produced by +renewable energy systems (RESs). To ensure the optimal operation of such hybrid +power systems, this paper addresses three key issues: system operational +flexibility, centralized communication limitations, and RES uncertainties. +Accordingly, a specific AC/DC optimal power flow (OPF) model and a distributed +robust optimization method are proposed. Firstly, we apply a set of linear +approximation and convex relaxation techniques to formulate the mixed-integer +convex AC/DC OPF model. This model incorporates the DC network-cognizant +constraint and enables DC topology reconfiguration. Next, generalized Benders +decomposition (GBD) is employed to provide distributed optimization. Enhanced +approaches are incorporated into GBD to achieve parallel computation and +asynchronous updating. Additionally, the extreme scenario method (ESM) is +embedded into the AC/DC OPF model to provide robust decisions to hedge against +RES uncertainties. ESM is further extended to align the GBD procedure. +Numerical results are finally presented to validate the effectiveness of our +proposed method. + +
+
+
+
+
+ + ☆ Adaptive Single-Terminal Fault Location for DC Microgrids + + +
+ Identifying faulty lines and their accurate location is key for rapidly +restoring distribution systems. This will become a greater challenge as the +penetration of power electronics increases, and contingencies are seen across +larger areas. This paper proposes a single terminal methodology (i.e., no +communication involved) that is robust to variations of key parameters (e.g., +sampling frequency, system parameters, etc.) and performs particularly well for +low resistance faults that constitute the majority of faults in low voltage DC +systems. The proposed method uses local measurements to estimate the current +caused by the other terminals affected by the contingency. This mimics the +strategy followed by double terminal methods that require communications and +decouples the accuracy of the methodology from the fault resistance. The +algorithm takes consecutive voltage and current samples, including the +estimated current of the other terminal, into the analysis. This mathematical +methodology results in a better accuracy than other single-terminal approaches +found in the literature. The robustness of the proposed strategy against +different fault resistances and locations is demonstrated using MATLAB +simulations. + +
+
+ comment: SEST 2024 +
+
+
+
+
+ + ☆ Event-Triggered Non-Linear Control of Offshore MMC Grids for + Asymmetrical AC Faults + + +
+ Fault ride-through capability studies of MMC-HVDC connected wind power plants +have focused primarily on the DC link and onshore AC grid faults. Offshore AC +faults, mainly asymmetrical faults have not gained much attention in the +literature despite being included in the future development at national levels +in the ENTSO-E HVDC code. The proposed work gives an event-triggered control to +stabilize the system once the offshore AC fault has occurred, identified, and +isolated. Different types of control actions such as proportional-integral (PI) +controller and super-twisted sliding mode control (STSMC) are used to smoothly +transition the post-fault system to a new steady state operating point by +suppressing the negative sequence control. Initially, the effect of a negative +sequence current control scheme on the transient behavior of the power system +with a PI controller is discussed in this paper. Further, a non-linear control +strategy (STSMC) is proposed which gives quicker convergence of the system +post-fault in comparison to PI control action. These post-fault control +operations are only triggered in the presence of a fault in the system, i.e., +they are event-triggered. The validity of the proposed strategy is demonstrated +by simulation on a $\pm$525 kV, three-terminal meshed MMC-HVDC system model in +Real Time Digital Simulator (RTDS). + +
+
+
+
+
+ + ☆ The Bayesian Separation Principle for Data-driven Control + + +
+ This paper investigates the existence of a separation principle between model +identification and control design in the context of model predictive control. +First, we elucidate that the separation principle holds asymptotically in the +number of data in a Fisherian setting, and universally in a Bayesian setting. +Then, by formulating model predictive control within a Gaussian regression +framework, we describe how the Bayesian separation principle can be used to +derive explicit, uncertainty-aware expressions for the control cost and optimal +input sequence, thereby bridging direct and indirect data-driven approaches. + +
+
+ comment: 13 pages, 1 figure +
+
+
+
+
+ + ☆ Stochastic Shortest Path Problem with Failure Probability + + +
+ We solve a sequential decision-making problem under uncertainty that takes +into account the failure probability of a task. This problem cannot be handled +by the stochastic shortest path problem, which is the standard model for +sequential decision-making. This problem is addressed by introducing dead-ends. +Conventionally, we only consider policies that minimize the probability of task +failure, so the optimal policy constructed could be overly conservative. In +this paper, we address this issue by expanding the search range to a class of +policies whose failure probability is less than a desired threshold. This +problem can be solved by treating it as a framework of a Bayesian Markov +decision process and a two-person zero-sum game. Also, it can be seen that the +optimal policy is expressed in the form of a probability distribution on a set +of deterministic policies. We also demonstrate the effectiveness of the +proposed methods by applying them to a motion planning problem with obstacle +avoidance for a moving robot. + +
+
+ comment: 22 pages, 5 figure +
+
+
+
+
+ + ☆ Multirotor Nonlinear Model Predictive Control based on Visual Servoing + of Evolving Features + + +
+ This article presents a Visual Servoing Nonlinear Model Predictive Control +(NMPC) scheme for autonomously tracking a moving target using multirotor +Unmanned Aerial Vehicles (UAVs). The scheme is developed for surveillance and +tracking of contour-based areas with evolving features. NMPC is used to manage +input and state constraints, while additional barrier functions are +incorporated in order to ensure system safety and optimal performance. The +proposed control scheme is designed based on the extraction and implementation +of the full dynamic model of the features describing the target and the state +variables. Real-time simulations and experiments using a quadrotor UAV equipped +with a camera demonstrate the effectiveness of the proposed strategy. + +
+
+
+
+
+ + ☆ A Fast Dynamic Internal Predictive Power Scheduling Approach for Power + Management in Microgrids + + +
+ This paper presents a Dynamic Internal Predictive Power Scheduling (DIPPS) +approach for optimizing power management in microgrids, particularly focusingon +external power exchanges among diverse prosumers. DIPPS utilizes a dynamic +objective function with a time-varying binary parameter to control the timing +of power transfers to the external grid, facilitated by efficient usage of +energy storage for surplus renewable power. The microgrid power scheduling +problem is modeled as a mixed-integer nonlinear programmig (MINLP-PS) and +subsequently transformed into a mixed-integer linear programming (MILP-PS) +optimization through McCormick's relaxation to reduce the computational +complexity. A predictive window with 6 data points is solved at an average of +0.92s, a 97.6% improvement over the 38.27s required for the MINLP-PS +formulation, implying the numerical feasibility of the DIPPS approach for +real-time implementation. Finally, the approach is validated against a static +objective using real-world load data across three case studies with different +time-varying parameters, demonstrationg the ability of DIPPS to optimize power +exchanges and efficiently utilize distributed resources whie shifting the +eexternal power transfers to specified time durations. + +
+
+
+
+
+ + ☆ Robo-Platform: A Robotic System for Recording Sensors and Controlling + Robots + + +
+ Mobile smartphones compactly provide sensors such as cameras, IMUs, GNSS +measurement units, and wireless and wired communication channels required for +robotics projects. They are affordable, portable, and programmable, which makes +them ideal for testing, data acquisition, controlling mobile robots, and many +other robotic applications. A robotic system is proposed in this paper, +consisting of an Android phone, a microcontroller board attached to the phone +via USB, and a remote wireless controller station. In the data acquisition +mode, the Android device can record a dataset of a diverse configuration of +multiple cameras, IMUs, GNSS units, and external USB ADC channels in the rawest +format used for, but not limited to, pose estimation and scene reconstruction +applications. In robot control mode, the Android phone, a microcontroller +board, and other peripherals constitute the mobile or stationary robotic +system. This system is controlled using a remote server connected over Wi-Fi or +Bluetooth. Experiments show that although the SLAM and AR applications can +utilize the acquired data, the proposed system can pave the way for more +advanced algorithms for processing these noisy and sporadic measurements. +Moreover, the characteristics of the communication media are studied, and two +example robotic projects, which involve controlling a toy car and a quadcopter, +are included. + +
+
+ comment: Project repository: https://github.com/m-dayani/robo-platform Youtube + Video: https://youtu.be/BTQ4yLB1bak Dataset: + https://drive.google.com/drive/folders/1OZqdA1xa-SyJ64qL_TibqhtwhR1fWWrx?usp=sharing +
+
+
+
+
+ + ☆ $\mathcal{L}_{1}$ Adaptive Optimizer for Uncertain Time-Varying Convex + Optimization + + +
+ We propose an adaptive method for uncertain time-varying (TV) convex +optimization, termed as $\mathcal{L}_{1}$ adaptive optimization +($\mathcal{L}_{1}$-AO). The proposed method uses a baseline TV optimizer with a +prediction model, designed for the gradient dynamics to exploit the underlying +structure of the temporal correlation. Inspired by $\mathcal{L}_{1}$ adaptive +control, the proposed method augments an adaptive update law to estimate and +compensate for the uncertainty from the inaccurate prediction in the online +implementation. The proposed method provides the performance bounds of the +error in the optimization variables and cost function, allowing efficient and +reliable optimization for uncertain TV problems. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Device for detection of activity-dependent changes in neural spheroids + at MHz and GHz frequencies + + +
+ Intracellular processes triggered by neural activity include changes in ionic +concentrations, protein release, and synaptic vesicle cycling. These processes +play significant roles in neurological disorders. The beneficial effects of +brain stimulation may also be mediated through intracellular changes. There is +a lack of label-free techniques for monitoring activity-dependent intracellular +changes. Electromagnetic (EM) waves at frequencies larger than 1x10^6 Hz (1 +MHz) were previously used to probe intracellular contents of cells, as cell +membrane becomes transparent at this frequency range. EM waves interact with +membranes of intracellular organelles, proteins, and water in the MHz-GHz +range. In this work, we developed a device for probing the interaction between +intracellular contents of active neurons and EM waves. The device used an array +of grounded coplanar waveguides (GCPWs) to deliver EM waves to a +three-dimensional (3D) spheroid of rat cortical neurons. Neural activity was +evoked using optogenetics, with synchronous detection of propagation of EM +waves. Broadband measurements were conducted in the MHz-GHz range to track +changes in transmission coefficients. Neuronal activity was found to reversibly +alter EM wave transmission. Pharmacological suppression of neuronal activity +abolished changes in transmission. Time constants of changes in transmission +were in the range of seconds to tens of seconds, suggesting the presence of +relatively slow, activity-dependent intracellular processes. This study +provides the first evidence that EM transmission through neuronal tissue is +activity-dependent in MHz-GHz range. Device developed in this work may find +future applications in studies of the mechanisms of neurological disorders and +the development of new therapies. + +
+
+
+
+
+ + ♻ ☆ Robust Adaptive MPC Using Uncertainty Compensation + + +
+ This paper presents an uncertainty compensation-based robust adaptive model +predictive control (MPC) framework for linear systems with both matched and +unmatched nonlinear uncertainties subject to both state and input constraints. +In particular, the proposed control framework leverages an L1 adaptive +controller (L1AC) to compensate for the matched uncertainties and to provide +guaranteed uniform bounds on the error between the states and control inputs of +the actual system and those of a nominal i.e., uncertainty-free, system. The +performance bounds provided by the L1AC are then used to tighten the state and +control constraints of the actual system, and a model predictive controller is +designed for the nominal system with the tightened constraints. The proposed +control framework, which we denote as uncertainty compensation-based MPC +(UC-MPC), guarantees constraint satisfaction and achieves improved performance +compared with existing methods. Simulation results on a flight control example +demonstrate the benefits of the proposed framework. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2208.02985 +
+
+
+
+
+ + ♻ ☆ Applications of Lifted Nonlinear Cuts to Convex Relaxations of the AC + Power Flow Equations + + +
+ We demonstrate that valid inequalities, or lifted nonlinear cuts (LNC), can +be projected to tighten the Second Order Cone (SOC), Convex DistFlow (CDF), and +Network Flow (NF) relaxations of the AC Optimal Power Flow (AC-OPF) problem. We +conduct experiments on 36 cases from the PGLib-OPF library for two objective +functions, (1) power generation maximization and (2) generation cost +minimization. Significant optimality gap improvements are shown for the +maximization problem, where the LNC strengthen the SOC and CDF relaxations in +100% of the test cases, with average and maximum differences in the optimality +gaps of 23.1% and 93.5% respectively. The NF relaxation is strengthened in +79.2% of test cases, with average and maximum differences in the optimality +gaps of 3.45% and 21.2% respectively. We also study the trade-off between +relaxation quality and solve time, demonstrating that the strengthened CDF +relaxation outperforms the strengthened SOC formulation in terms of runtime and +number of iterations needed, while the strengthened NF formulation is the most +scalable with the lowest relaxation quality provided by these LNC. + +
+
+
+
+
+ + ♻ ☆ Probabilistic Metaplasticity for Continual Learning with Memristors + + +
+ Edge devices operating in dynamic environments critically need the ability to +continually learn without catastrophic forgetting. The strict resource +constraints in these devices pose a major challenge to achieve this, as +continual learning entails memory and computational overhead. Crossbar +architectures using memristor devices offer energy efficiency through +compute-in-memory and hold promise to address this issue. However, memristors +often exhibit low precision and high variability in conductance modulation, +rendering them unsuitable for continual learning solutions that require precise +modulation of weight magnitude for consolidation. Current approaches fall short +to address this challenge directly and rely on auxiliary high-precision memory, +leading to frequent memory access, high memory overhead, and energy +dissipation. In this research, we propose probabilistic metaplasticity, which +consolidates weights by modulating their update probability rather than +magnitude. The proposed mechanism eliminates high-precision modification to +weight magnitudes and, consequently, the need for auxiliary high-precision +memory. We demonstrate the efficacy of the proposed mechanism by integrating +probabilistic metaplasticity into a spiking network trained on an error +threshold with low-precision memristor weights. Evaluations of continual +learning benchmarks show that probabilistic metaplasticity achieves performance +equivalent to state-of-the-art continual learning models with high-precision +weights while consuming ~ 67% lower memory for additional parameters and up to +~ 60x lower energy during parameter updates compared to an auxiliary +memory-based solution. The proposed model shows potential for energy-efficient +continual learning with low-precision emerging devices. + +
+
+
+
+
+ + ♻ ☆ Sampling-based Stochastic Data-driven Predictive Control under Data + Uncertainty + + +
+ We present a stochastic constrained output-feedback data-driven predictive +control scheme for linear time-invariant systems subject to bounded additive +disturbances. The approach uses data-driven predictors based on an extension of +Willems' fundamental lemma and requires only a single persistently exciting +input-output data trajectory. Compared to current state-of-the-art approaches, +we do not rely on availability of exact disturbance data. Instead, we leverage +a novel parameterization of the unknown disturbance data considering +consistency with the measured data and the system class. This allows for +deterministic approximation of the chance constraints in a sampling-based +fashion. A robust constraint on the first predicted step enables recursive +feasibility, closed-loop constraint satisfaction, and robust asymptotic +stability in expectation under standard assumptions. A numerical example +demonstrates the efficiency of the proposed control scheme. + +
+
+
+
+
+ + ♻ ☆ GPU-Accelerated DCOPF using Gradient-Based Optimization + + +
+ DC Optimal Power Flow (DCOPF) is a key operational tool for power system +operators, and it is embedded as a subproblem in many challenging optimization +problems (e.g., line switching). However, traditional CPU-based solve routines +(e.g., simplex) have saturated in speed and are hard to parallelize. This paper +focuses on solving DCOPF problems using gradient-based routines on Graphics +Processing Units (GPUs), which have massive parallelization capability. To +formulate these problems, we pose a Lagrange dual associated with DCOPF (linear +and quadratic cost curves), and then we explicitly solve the inner (primal) +minimization problem with a dual norm. The resulting dual problem can be +efficiently iterated using projected gradient ascent. After solving the dual +problem on both CPUs and GPUs to find tight lower bounds, we benchmark against +Gurobi and MOSEK, comparing convergence speed and tightness on the IEEE 2000, +4601, and 10000 bus systems. We provide reliable and tight lower bounds for +these problems with, at best, 5.4x speedup over a conventional solver. + +
+
+
+
+
+ + ♻ ☆ Identification of Additive Continuous-time Systems in Open and Closed + loop + + +
+ When identifying electrical, mechanical, or biological systems, parametric +continuous-time identification methods can lead to interpretable and +parsimonious models when the model structure aligns with the physical +properties of the system. Traditional linear system identification may not +consider the most parsimonious model when relying solely on unfactored transfer +functions, which typically result from standard direct approaches. This paper +presents a novel identification method that delivers additive models for both +open and closed-loop setups. The estimators that are derived are shown to be +generically consistent, and can admit the identification of marginally stable +additive systems. Numerical simulations show the efficacy of the proposed +approach, and its performance in identifying a modal representation of a +flexible beam is verified using experimental data. + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Instantaneous Frequency Estimation in Unbalanced Systems Using Affine + Differential Geometry + + +
+ The paper discusses the relationships between electrical and affine +differential geometry quantities, establishing a link between frequency and +time derivatives of voltage, through the utilization of affine geometric +invariants. Based on this link, a new instantaneous frequency estimation +formula is proposed, which is particularly suited for unbalanced and +single-phase systems. Several examples as well as measurements based on two +real-world events illustrate the findings of the paper. + +
+
+
+
+
+ + ♻ ☆ Proactive Emergency Collision Avoidance for Automated Driving in Highway + Scenarios + + +
+ Uncertainty in the behavior of other traffic participants is a crucial factor +in collision avoidance for automated driving; here, stochastic metrics could +avoid overly conservative decisions. This paper introduces a Stochastic Model +Predictive Control (SMPC) planner for emergency collision avoidance in highway +scenarios to proactively minimize collision risk while ensuring safety through +chance constraints. To guarantee that the emergency trajectory can be attained, +we incorporate nonlinear tire dynamics in the prediction model of the ego +vehicle. Further, we exploit Max-Min-Plus-Scaling (MMPS) approximations of the +nonlinearities to avoid conservatism, enforce proactive collision avoidance, +and improve computational efficiency in terms of performance and speed. +Consequently, our contributions include integrating a dynamic ego vehicle model +into the SMPC planner, introducing the MMPS approximation for real-time +implementation in emergency scenarios, and integrating SMPC with hybridized +chance constraints and risk minimization. We evaluate our SMPC formulation in +terms of proactivity and efficiency in various hazardous scenarios. Moreover, +we demonstrate the effectiveness of our proposed approach by comparing it with +a state-of-the-art SMPC planner and we validate that the generated trajectories +can be attained using a high-fidelity vehicle model in IPG CarMaker. + +
+
+ comment: 14 pages, 11 figures, submitted to IEEE Transactions on Control + Systems Technology +
+
+
+
+
+ + ♻ ☆ Mamba as a motion encoder for robotic imitation learning + + +
+ Recent advancements in imitation learning, particularly with the integration +of LLM techniques, are set to significantly improve robots' dexterity and +adaptability. This paper proposes using Mamba, a state-of-the-art architecture +with potential applications in LLMs, for robotic imitation learning, +highlighting its ability to function as an encoder that effectively captures +contextual information. By reducing the dimensionality of the state space, +Mamba operates similarly to an autoencoder. It effectively compresses the +sequential information into state variables while preserving the essential +temporal dynamics necessary for accurate motion prediction. Experimental +results in tasks such as cup placing and case loading demonstrate that despite +exhibiting higher estimation errors, Mamba achieves superior success rates +compared to Transformers in practical task execution. This performance is +attributed to Mamba's structure, which encompasses the state space model. +Additionally, the study investigates Mamba's capacity to serve as a real-time +motion generator with a limited amount of training data. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Model-Free Generic Robust Control for Servo-Driven Actuation Mechanisms + with Layered Insight into Energy Conversions + + +
+ To advance theoretical solutions and address limitations in modeling complex +servo-driven actuation systems experiencing high non-linearity and load +disturbances, this paper aims to design a practical model-free generic robust +control (GRC) framework for these mechanisms. This framework is intended to be +applicable across all actuator systems encompassing electrical, hydraulic, or +pneumatic servomechanisms, while also functioning within complex interactions +among dynamic components and adhering to control input constraints. In this +respect, the state-space model of actuator systems is decomposed into smaller +subsystems that incorporate the first principle equation of actuator motion +dynamics and interactive energy conversion equations. This decomposition +operates under the assumption that the comprehensive model of the servo-driven +actuator system and energy conversion, uncertainties, load disturbances, and +their bounds are unknown. Then, the GRC employs subsystem-based adaptive +control strategies for each state-variant subsystem separately. Despite control +input constraints and the unknown interactive system model, the GRC-applied +actuator mechanism ensures uniform exponential stability and robustness in +tracking desired motions. It features straightforward implementation, +experimentally evaluated by applying it to two industrial applications. + +
+
+ comment: This work has been submitted for possible publication in the IEEE +
+
+
+
+
+ + ♻ ☆ SIMBa: System Identification Methods leveraging Backpropagation + + +
+ This manuscript details and extends the SIMBa toolbox (System Identification +Methods leveraging Backpropagation) presented in previous work, which uses +well-established Machine Learning tools for discrete-time linear +multi-step-ahead state-space System Identification (SI). SIMBa leverages +linear-matrix-inequality-based free parametrizations of Schur matrices to +guarantee the stability of the identified model by design. In this paper, +backed up by novel free parametrizations of Schur matrices, we extend the +toolbox to show how SIMBa can incorporate known sparsity patterns or true +values of the state-space matrices to identify without jeopardizing stability. + We extensively investigate SIMBa's behavior when identifying diverse systems +with various properties from both simulated and real-world data. Overall, we +find it consistently outperforms traditional stable subspace identification +methods, and sometimes significantly, especially when enforcing desired model +properties. These results hint at the potential of SIMBa to pave the way for +generic structured nonlinear SI. The toolbox is open-sourced on +https://github.com/Cemempamoi/simba. + +
+
+ comment: First two authors contributed equally. Submitted to IEEE TCST +
+
+
+
+
+ + ♻ ☆ An Alternative to Multi-Factor Authentication with a Triple-Identity + Authentication Scheme + + +
+ The existing authentication system has two entry points (i.e., username and +password fields) to interact with the outside, but neither of them has a +gatekeeper, making the system vulnerable to cyberattacks. In order to ensure +the authentication security, the system sets a third entry point and use an +external MFA service to guard it. The crux of the problem is that the system +has no internal mechanism to guard its own entry points as no identifiers can +be defined for the username and password without using any personal +information. To solve this problem, we open the hash algorithm of a +dual-password login-authentication system to three login credentials. +Therefore, the intermediate elements of the algorithm can be used to define an +identifier to verify the user identity at each entry point of the system. As a +result of the above setup, a triple-identity authentication is established, the +key of which is that the readily available user's login name and password are +randomly converted into a matrix of meaningless hash elements which are +concealed, incommunicable, inaccessible, and independent of personal +information. So the identifiers defined using such elements can be used by the +system to verify the identities of the user at all the entry points of the +system, thereby ensuring the authentication security without relying on MFA +services. + +
+
+ comment: 5 pages, 2 figures, 11 conferences +
+
+
+
+
+ + ♻ ☆ Towards Autonomous Supply Chains: Definition, Characteristics, + Conceptual Framework, and Autonomy Levels + + +
+ Recent global disruptions, such as the pandemic and geopolitical conflicts, +have profoundly exposed vulnerabilities in traditional supply chains, requiring +exploration of more resilient alternatives. Autonomous supply chains (ASCs) +have emerged as a potential solution, offering increased visibility, +flexibility, and resilience in turbulent trade environments. Despite +discussions in industry and academia over several years, ASCs lack +well-established theoretical foundations. This paper addresses this research +gap by presenting a formal definition of ASC along with its defining +characteristics and auxiliary concepts. We propose a layered conceptual +framework called the MIISI model. An illustrative case study focusing on the +meat supply chain demonstrates an initial ASC implementation based on this +conceptual model. Additionally, we introduce a seven-level supply chain +autonomy reference model, delineating a trajectory towards achieving a full +supply chain autonomy. Recognising that this work represents an initial +endeavour, we emphasise the need for continued exploration in this emerging +domain. We anticipate that this work will stimulate further research, both +theoretical and technical, and contribute to the continual evolution of ASCs. + +
+
+ comment: This paper includes 19 pages and 8 figures and has been accepted for + publication in the Journal of Industrial Information Integration +
+
+
+
+
+ + ♻ ☆ Precision Aquaculture: An Integrated Computer Vision and IoT Approach + for Optimized Tilapia Feeding + + +
+ Traditional fish farming practices often lead to inefficient feeding, +resulting in environmental issues and reduced productivity. We developed an +innovative system combining computer vision and IoT technologies for precise +Tilapia feeding. Our solution uses real-time IoT sensors to monitor water +quality parameters and computer vision algorithms to analyze fish size and +count, determining optimal feed amounts. A mobile app enables remote monitoring +and control. We utilized YOLOv8 for keypoint detection to measure Tilapia +weight from length, achieving \textbf{94\%} precision on 3,500 annotated +images. Pixel-based measurements were converted to centimeters using depth +estimation for accurate feeding calculations. Our method, with data collection +mirroring inference conditions, significantly improved results. Preliminary +estimates suggest this approach could increase production up to 58 times +compared to traditional farms. Our models, code, and dataset are +open-source~\footnote{The code, dataset, and models are available upon +reasonable request. + +
+
+ comment: 8 pages, 6 figures, 3 tables, 21th International Conference on + Informatics in Control, Automation, and Robotics +
+
+
+
+
+ + ♻ ☆ Stochastic Data-Driven Predictive Control with Equivalence to Stochastic + MPC + + +
+ We propose a data-driven receding-horizon control method dealing with the +chance-constrained output-tracking problem of unknown stochastic linear +time-invariant (LTI) systems with partial state observation. The proposed +method takes into account the statistics of the process noise, the measurement +noise and the uncertain initial condition, following an analogous framework to +Stochastic Model Predictive Control (SMPC), but does not rely on the use of a +parametric system model. As such, our receding-horizon algorithm produces a +sequence of closed-loop control policies for predicted time steps, as opposed +to a sequence of open-loop control actions. Under certain conditions, we +establish that our proposed data-driven control method produces identical +control inputs as that produced by the associated model-based SMPC. Simulation +results on a grid-connected power converter are provided to illustrate the +performance benefits of our methodology. + +
+
+ comment: 20 pages, 4 figures. The extended version of a submission to IEEE + Transactions on Automatic Control +
+
+
+
+
+
+
+
+ + Computation and Language 50 + +
+
+
+ + ☆ HDFlow: Enhancing LLM Complex Problem-Solving with Hybrid Thinking and + Dynamic Workflows + + +
+ Despite recent advancements in large language models (LLMs), their +performance on complex reasoning problems requiring multi-step thinking and +combining various skills is still limited. To address this, we propose a novel +framework HDFlow for complex reasoning with LLMs that combines fast and slow +thinking modes in an adaptive manner. Our approach consists of two key +components: 1) a new approach for slow, deliberate reasoning called Dynamic +Workflow, which automatically decomposes complex problems into more manageable +sub-tasks and dynamically designs a workflow to assemble specialized LLM or +symbolic reasoning tools to solve sub-tasks; 2) Hybrid Thinking, a general +framework that dynamically combines fast and slow thinking based on problem +complexity. Finally, we propose an easy-to-scale method for automatically +synthesizing a large-scale dataset of 27K challenging reasoning problems for +complex reasoning and a hybrid thinking tuning method that trains smaller LLMs +on this dataset to internalize the fast/slow hybrid reasoning strategies. +Experiments on four reasoning benchmark datasets demonstrate that our slow +thinking with dynamic workflows significantly outperforms Chain-of-Thought, and +hybrid thinking achieves the highest accuracy while providing an effective +balance between computational efficiency and performance. Fine-tuning using our +hybrid thinking approach also significantly boosts the complex reasoning +capabilities of open-source language models. The results showcase the promise +of slow thinking, dynamic workflows, and hybrid thinking in expanding the +frontier of complex problem-solving with LLMs\footnote{Code and data will be +released at \url{https://github.com/wenlinyao/HDFlow}.}. + +
+
+ comment: 27 pages, 5 figures +
+
+
+
+
+ + ☆ On Extending Direct Preference Optimization to Accommodate Ties + + +
+ We derive and investigate two DPO variants that explicitly model the +possibility of declaring a tie in pair-wise comparisons. We replace the +Bradley-Terry model in DPO with two well-known modeling extensions, by Rao and +Kupper and by Davidson, that assign probability to ties as alternatives to +clear preferences. Our experiments in neural machine translation and +summarization show that explicitly labeled ties can be added to the datasets +for these DPO variants without the degradation in task performance that is +observed when the same tied pairs are presented to DPO. We find empirically +that the inclusion of ties leads to stronger regularization with respect to the +reference policy as measured by KL divergence, and we see this even for DPO in +its original form. These findings motivate and enable the inclusion of tied +pairs in preference optimization as opposed to simply discarding them. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Discovering the Gems in Early Layers: Accelerating Long-Context LLMs + with 1000x Input Token Reduction + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +handling long context inputs, but this comes at the cost of increased +computational resources and latency. Our research introduces a novel approach +for the long context bottleneck to accelerate LLM inference and reduce GPU +memory consumption. Our research demonstrates that LLMs can identify relevant +tokens in the early layers before generating answers to a query. Leveraging +this insight, we propose an algorithm that uses early layers of an LLM as +filters to select and compress input tokens, significantly reducing the context +length for subsequent processing. Our method, GemFilter, demonstrates +substantial improvements in both speed and memory efficiency compared to +existing techniques, such as standard attention and SnapKV/H2O. Notably, it +achieves a 2.4$\times$ speedup and 30\% reduction in GPU memory usage compared +to SOTA methods. Evaluation on the Needle in a Haystack task shows that +GemFilter significantly outperforms standard attention, SnapKV and demonstrates +comparable performance on the LongBench challenge. GemFilter is simple, +training-free, and broadly applicable across different LLMs. Crucially, it +provides interpretability by allowing humans to inspect the selected input +sequence. These findings not only offer practical benefits for LLM deployment, +but also enhance our understanding of LLM internal mechanisms, paving the way +for further optimizations in LLM design and inference. Our code is available at +\url{https://github.com/SalesforceAIResearch/GemFilter}. + +
+
+
+
+
+ + ☆ Pre-Finetuning with Impact Duration Awareness for Stock Movement + Prediction + + +
+ Understanding the duration of news events' impact on the stock market is +crucial for effective time-series forecasting, yet this facet is largely +overlooked in current research. This paper addresses this research gap by +introducing a novel dataset, the Impact Duration Estimation Dataset (IDED), +specifically designed to estimate impact duration based on investor opinions. +Our research establishes that pre-finetuning language models with IDED can +enhance performance in text-based stock movement predictions. In addition, we +juxtapose our proposed pre-finetuning task with sentiment analysis +pre-finetuning, further affirming the significance of learning impact duration. +Our findings highlight the promise of this novel research direction in stock +movement prediction, offering a new avenue for financial forecasting. We also +provide the IDED and pre-finetuned language models under the CC BY-NC-SA 4.0 +license for academic use, fostering further exploration in this field. + +
+
+ comment: NTCIR-18 FinArg-2 Dataset +
+
+
+
+
+ + ☆ Enhancing Investment Opinion Ranking through Argument-Based Sentiment + Analysis + + +
+ In the era of rapid Internet and social media platform development, +individuals readily share their viewpoints online. The overwhelming quantity of +these posts renders comprehensive analysis impractical. This necessitates an +efficient recommendation system to filter and present significant, relevant +opinions. Our research introduces a dual-pronged argument mining technique to +improve recommendation system effectiveness, considering both professional and +amateur investor perspectives. Our first strategy involves using the +discrepancy between target and closing prices as an opinion indicator. The +second strategy applies argument mining principles to score investors' +opinions, subsequently ranking them by these scores. Experimental results +confirm the effectiveness of our approach, demonstrating its ability to +identify opinions with higher profit potential. Beyond profitability, our +research extends to risk analysis, examining the relationship between +recommended opinions and investor behaviors. This offers a holistic view of +potential outcomes following the adoption of these recommended opinions. + +
+
+
+
+
+ + ☆ From Deception to Detection: The Dual Roles of Large Language Models in + Fake News + + +
+ Fake news poses a significant threat to the integrity of information +ecosystems and public trust. The advent of Large Language Models (LLMs) holds +considerable promise for transforming the battle against fake news. Generally, +LLMs represent a double-edged sword in this struggle. One major concern is that +LLMs can be readily used to craft and disseminate misleading information on a +large scale. This raises the pressing questions: Can LLMs easily generate +biased fake news? Do all LLMs have this capability? Conversely, LLMs offer +valuable prospects for countering fake news, thanks to their extensive +knowledge of the world and robust reasoning capabilities. This leads to other +critical inquiries: Can we use LLMs to detect fake news, and do they outperform +typical detection models? In this paper, we aim to address these pivotal +questions by exploring the performance of various LLMs. Our objective is to +explore the capability of various LLMs in effectively combating fake news, +marking this as the first investigation to analyze seven such models. Our +results reveal that while some models adhere strictly to safety protocols, +refusing to generate biased or misleading content, other models can readily +produce fake news across a spectrum of biases. Additionally, our results show +that larger models generally exhibit superior detection abilities and that +LLM-generated fake news are less likely to be detected than human-written ones. +Finally, our findings demonstrate that users can benefit from LLM-generated +explanations in identifying fake news. + +
+
+
+
+
+ + ☆ Post-hoc Reward Calibration: A Case Study on Length Bias + + +
+ Reinforcement Learning from Human Feedback aligns the outputs of Large +Language Models with human values and preferences. Central to this process is +the reward model (RM), which translates human feedback into training signals +for optimising LLM behaviour. However, RMs can develop biases by exploiting +spurious correlations in their training data, such as favouring outputs based +on length or style rather than true quality. These biases can lead to incorrect +output rankings, sub-optimal model evaluations, and the amplification of +undesirable behaviours in LLMs alignment. This paper addresses the challenge of +correcting such biases without additional data and training, introducing the +concept of Post-hoc Reward Calibration. We first propose an intuitive approach +to estimate the bias term and, thus, remove it to approximate the underlying +true reward. We then extend the approach to a more general and robust form with +the Locally Weighted Regression. Focusing on the prevalent length bias, we +validate our proposed approaches across three experimental settings, +demonstrating consistent improvements: (1) a 3.11 average performance gain +across 33 reward models on the RewardBench dataset; (2) enhanced alignment of +RM rankings with GPT-4 evaluations and human preferences based on the +AlpacaEval benchmark; and (3) improved Length-Controlled win rate of the RLHF +process in multiple LLM--RM combinations. Our method is computationally +efficient and generalisable to other types of bias and RMs, offering a scalable +and robust solution for mitigating biases in LLM alignment. Our code and +results are available at https://github.com/ZeroYuHuang/Reward-Calibration. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Severity Prediction in Mental Health: LLM-based Creation, Analysis, + Evaluation of a Novel Multilingual Dataset + + +
+ Large Language Models (LLMs) are increasingly integrated into various medical +fields, including mental health support systems. However, there is a gap in +research regarding the effectiveness of LLMs in non-English mental health +support applications. To address this problem, we present a novel multilingual +adaptation of widely-used mental health datasets, translated from English into +six languages (Greek, Turkish, French, Portuguese, German, and Finnish). This +dataset enables a comprehensive evaluation of LLM performance in detecting +mental health conditions and assessing their severity across multiple +languages. By experimenting with GPT and Llama, we observe considerable +variability in performance across languages, despite being evaluated on the +same translated dataset. This inconsistency underscores the complexities +inherent in multilingual mental health support, where language-specific nuances +and mental health data coverage can affect the accuracy of the models. Through +comprehensive error analysis, we emphasize the risks of relying exclusively on +large language models (LLMs) in medical settings (e.g., their potential to +contribute to misdiagnoses). Moreover, our proposed approach offers significant +cost savings for multilingual tasks, presenting a major advantage for +broad-scale implementation. + +
+
+
+
+
+ + ☆ data2lang2vec: Data Driven Typological Features Completion + + +
+ Language typology databases enhance multi-lingual Natural Language Processing +(NLP) by improving model adaptability to diverse linguistic structures. The +widely-used lang2vec toolkit integrates several such databases, but its +coverage remains limited at 28.9\%. Previous work on automatically increasing +coverage predicts missing values based on features from other languages or +focuses on single features, we propose to use textual data for better-informed +feature prediction. To this end, we introduce a multi-lingual Part-of-Speech +(POS) tagger, achieving over 70\% accuracy across 1,749 languages, and +experiment with external statistical features and a variety of machine learning +algorithms. We also introduce a more realistic evaluation setup, focusing on +likely to be missing typology features, and show that our approach outperforms +previous work in both setups. + +
+
+ comment: 9 pages, 11 figures +
+
+
+
+
+ + ☆ How Transliterations Improve Crosslingual Alignment + + +
+ Recent studies have shown that post-aligning multilingual pretrained language +models (mPLMs) using alignment objectives on both original and transliterated +data can improve crosslingual alignment. This improvement further leads to +better crosslingual transfer performance. However, it remains unclear how and +why a better crosslingual alignment is achieved, as this technique only +involves transliterations, and does not use any parallel data. This paper +attempts to explicitly evaluate the crosslingual alignment and identify the key +elements in transliteration-based approaches that contribute to better +performance. For this, we train multiple models under varying setups for two +pairs of related languages: (1) Polish and Ukrainian and (2) Hindi and Urdu. To +assess alignment, we define four types of similarities based on sentence +representations. Our experiments show that adding transliterations alone +improves the overall similarities, even for random sentence pairs. With the +help of auxiliary alignment objectives, especially the contrastive objective, +the model learns to distinguish matched from random pairs, leading to better +alignments. However, we also show that better alignment does not always yield +better downstream performance, suggesting that further research is needed to +clarify the connection between alignment and performance. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Navigating the Nuances: A Fine-grained Evaluation of Vision-Language + Navigation EMNLP 2024 + + +
+ This study presents a novel evaluation framework for the Vision-Language +Navigation (VLN) task. It aims to diagnose current models for various +instruction categories at a finer-grained level. The framework is structured +around the context-free grammar (CFG) of the task. The CFG serves as the basis +for the problem decomposition and the core premise of the instruction +categories design. We propose a semi-automatic method for CFG construction with +the help of Large-Language Models (LLMs). Then, we induct and generate data +spanning five principal instruction categories (i.e. direction change, landmark +recognition, region recognition, vertical movement, and numerical +comprehension). Our analysis of different models reveals notable performance +discrepancies and recurrent issues. The stagnation of numerical comprehension, +heavy selective biases over directional concepts, and other interesting +findings contribute to the development of future language-guided navigation +systems. + +
+
+ comment: EMNLP 2024 Findings; project page: + https://zehao-wang.github.io/navnuances +
+
+
+
+
+ + ☆ BabyLlama-2: Ensemble-Distilled Models Consistently Outperform Teachers + With Limited Data CoNLL 2024 + + +
+ We present BabyLlama-2, a 345 million parameter model distillation-pretrained +from two teachers on a 10 million word corpus for the BabyLM competition. On +BLiMP and SuperGLUE benchmarks, BabyLlama-2 outperforms baselines trained on +both 10 and 100 million word datasets with the same data mix, as well as its +teacher models. Through an extensive hyperparameter sweep, we demonstrate that +the advantages of distillation cannot be attributed to suboptimal +hyperparameter selection of the teachers. Our findings underscore the need for +further investigation into distillation techniques, particularly in +data-limited settings. + +
+
+ comment: 9 pages, 3 figures, 5 tables, submitted to the BabyLM Challenge + (CoNLL 2024 Shared Task) +
+
+
+
+
+ + ☆ Proof of Thought : Neurosymbolic Program Synthesis allows Robust and + Interpretable Reasoning + + +
+ Large Language Models (LLMs) have revolutionized natural language processing, +yet they struggle with inconsistent reasoning, particularly in novel domains +and complex logical sequences. This research introduces Proof of Thought, a +framework that enhances the reliability and transparency of LLM outputs. Our +approach bridges LLM-generated ideas with formal logic verification, employing +a custom interpreter to convert LLM outputs into First Order Logic constructs +for theorem prover scrutiny. Central to our method is an intermediary +JSON-based Domain-Specific Language, which by design balances precise logical +structures with intuitive human concepts. This hybrid representation enables +both rigorous validation and accessible human comprehension of LLM reasoning +processes. Key contributions include a robust type system with sort management +for enhanced logical integrity, explicit representation of rules for clear +distinction between factual and inferential knowledge, and a flexible +architecture that allows for easy extension to various domain-specific +applications. We demonstrate Proof of Thought's effectiveness through +benchmarking on StrategyQA and a novel multimodal reasoning task, showing +improved performance in open-ended scenarios. By providing verifiable and +interpretable results, our technique addresses critical needs for AI system +accountability and sets a foundation for human-in-the-loop oversight in +high-stakes domains. + +
+
+
+
+
+ + ☆ Molmo and PixMo: Open Weights and Open Data for State-of-the-Art + Multimodal Models + + +
+ Today's most advanced multimodal models remain proprietary. The strongest +open-weight models rely heavily on synthetic data from proprietary VLMs to +achieve good performance, effectively distilling these closed models into open +ones. As a result, the community is still missing foundational knowledge about +how to build performant VLMs from scratch. We present Molmo, a new family of +VLMs that are state-of-the-art in their class of openness. Our key innovation +is a novel, highly detailed image caption dataset collected entirely from human +annotators using speech-based descriptions. To enable a wide array of user +interactions, we also introduce a diverse dataset mixture for fine-tuning that +includes in-the-wild Q&A and innovative 2D pointing data. The success of our +approach relies on careful choices for the model architecture details, a +well-tuned training pipeline, and, most critically, the quality of our newly +collected datasets, all of which will be released. The best-in-class 72B model +within the Molmo family not only outperforms others in the class of open weight +and data models but also compares favorably against proprietary systems like +GPT-4o, Claude 3.5, and Gemini 1.5 on both academic benchmarks and human +evaluation. + We will be releasing all of our model weights, captioning and fine-tuning +data, and source code in the near future. Select model weights, inference code, +and demo are available at https://molmo.allenai.org. + +
+
+
+
+
+ + ☆ FineZip : Pushing the Limits of Large Language Models for Practical + Lossless Text Compression + + +
+ While the language modeling objective has been shown to be deeply connected +with compression, it is surprising that modern LLMs are not employed in +practical text compression systems. In this paper, we provide an in-depth +analysis of neural network and transformer-based compression techniques to +answer this question. We compare traditional text compression systems with +neural network and LLM-based text compression methods. Although LLM-based +systems significantly outperform conventional compression methods, they are +highly impractical. Specifically, LLMZip, a recent text compression system +using Llama3-8B requires 9.5 days to compress just 10 MB of text, although with +huge improvements in compression ratios. To overcome this, we present FineZip - +a novel LLM-based text compression system that combines ideas of online +memorization and dynamic context to reduce the compression time immensely. +FineZip can compress the above corpus in approximately 4 hours compared to 9.5 +days, a 54 times improvement over LLMZip and comparable performance. FineZip +outperforms traditional algorithmic compression methods with a large margin, +improving compression ratios by approximately 50\%. With this work, we take the +first step towards making lossless text compression with LLMs a reality. While +FineZip presents a significant step in that direction, LLMs are still not a +viable solution for large-scale text compression. We hope our work paves the +way for future research and innovation to solve this problem. + +
+
+
+
+
+ + ☆ Assessing the Level of Toxicity Against Distinct Groups in Bangla Social + Media Comments: A Comprehensive Investigation + + +
+ Social media platforms have a vital role in the modern world, serving as +conduits for communication, the exchange of ideas, and the establishment of +networks. However, the misuse of these platforms through toxic comments, which +can range from offensive remarks to hate speech, is a concerning issue. This +study focuses on identifying toxic comments in the Bengali language targeting +three specific groups: transgender people, indigenous people, and migrant +people, from multiple social media sources. The study delves into the intricate +process of identifying and categorizing toxic language while considering the +varying degrees of toxicity: high, medium, and low. The methodology involves +creating a dataset, manual annotation, and employing pre-trained transformer +models like Bangla-BERT, bangla-bert-base, distil-BERT, and +Bert-base-multilingual-cased for classification. Diverse assessment metrics +such as accuracy, recall, precision, and F1-score are employed to evaluate the +model's effectiveness. The experimental findings reveal that Bangla-BERT +surpasses alternative models, achieving an F1-score of 0.8903. This research +exposes the complexity of toxicity in Bangla social media dialogues, revealing +its differing impacts on diverse demographic groups. + +
+
+ comment: Accepted for publication in "18th International Conference on + Information Technology and Applications (ICITA 2024)" +
+
+
+
+
+ + ☆ Deep Learning and Machine Learning, Advancing Big Data Analytics and + Management: Handy Appetizer + + +
+ This book explores the role of Artificial Intelligence (AI), Machine Learning +(ML), and Deep Learning (DL) in driving the progress of big data analytics and +management. The book focuses on simplifying the complex mathematical concepts +behind deep learning, offering intuitive visualizations and practical case +studies to help readers understand how neural networks and technologies like +Convolutional Neural Networks (CNNs) work. It introduces several classic models +and technologies such as Transformers, GPT, ResNet, BERT, and YOLO, +highlighting their applications in fields like natural language processing, +image recognition, and autonomous driving. The book also emphasizes the +importance of pre-trained models and how they can enhance model performance and +accuracy, with instructions on how to apply these models in various real-world +scenarios. Additionally, it provides an overview of key big data management +technologies like SQL and NoSQL databases, as well as distributed computing +frameworks such as Apache Hadoop and Spark, explaining their importance in +managing and processing vast amounts of data. Ultimately, the book underscores +the value of mastering deep learning and big data management skills as critical +tools for the future workforce, making it an essential resource for both +beginners and experienced professionals. + +
+
+ comment: This book contains 93 pages and 60 figures +
+
+
+
+
+ + ☆ Programming Every Example: Lifting Pre-training Data Quality like + Experts at Scale + + +
+ Large language model pre-training has traditionally relied on human experts +to craft heuristics for improving the corpora quality, resulting in numerous +rules developed to date. However, these rules lack the flexibility to address +the unique characteristics of individual example effectively. Meanwhile, +applying tailored rules to every example is impractical for human experts. In +this paper, we demonstrate that even small language models, with as few as 0.3B +parameters, can exhibit substantial data refining capabilities comparable to +those of human experts. We introduce Programming Every Example (ProX), a novel +framework that treats data refinement as a programming task, enabling models to +refine corpora by generating and executing fine-grained operations, such as +string normalization, for each individual example at scale. Experimental +results show that models pre-trained on ProX-curated data outperform either +original data or data filtered by other selection methods by more than 2% +across various downstream benchmarks. Its effectiveness spans various model +sizes and pre-training corpora, including C4, RedPajama-V2, and FineWeb. +Furthermore, ProX exhibits significant potential in domain-specific continual +pre-training: without domain specific design, models trained on OpenWebMath +refined by ProX outperform human-crafted rule-based methods, improving average +accuracy by 7.6% over Mistral-7B, with 14.6% for Llama-2-7B and 20.3% for +CodeLlama-7B, all within 10B tokens to be comparable to models like Llemma-7B +trained on 200B tokens. Further analysis highlights that ProX significantly +saves training FLOPs, offering a promising path for efficient LLM +pre-training.We are open-sourcing ProX with >100B corpus, models, and sharing +all training and implementation details for reproducible research and future +innovation. Code: https://github.com/GAIR-NLP/ProX + +
+
+ comment: 45 pages, 13 figures, 34 tables +
+
+
+
+
+ + ☆ Can Vision Language Models Learn from Visual Demonstrations of Ambiguous + Spatial Reasoning? + + +
+ Large vision-language models (VLMs) have become state-of-the-art for many +computer vision tasks, with in-context learning (ICL) as a popular adaptation +strategy for new ones. But can VLMs learn novel concepts purely from visual +demonstrations, or are they limited to adapting to the output format of ICL +examples? We propose a new benchmark we call Spatial Visual Ambiguity Tasks +(SVAT) that challenges state-of-the-art VLMs to learn new visuospatial tasks +in-context. We find that VLMs fail to do this zero-shot, and sometimes continue +to fail after finetuning. However, adding simpler data to the training by +curriculum learning leads to improved ICL performance. + +
+
+ comment: 13 pages, 4 figures. Code released at + https://github.com/groundlight/vlm-visual-demonstrations +
+
+
+
+
+ + ☆ Using LLM for Real-Time Transcription and Summarization of + Doctor-Patient Interactions into ePuskesmas in Indonesia + + +
+ One of the key issues contributing to inefficiency in Puskesmas is the +time-consuming nature of doctor-patient interactions. Doctors need to conduct +thorough consultations, which include diagnosing the patient's condition, +providing treatment advice, and transcribing detailed notes into medical +records. In regions with diverse linguistic backgrounds, doctors often have to +ask clarifying questions, further prolonging the process. While diagnosing is +essential, transcription and summarization can often be automated using AI to +improve time efficiency and help doctors enhance care quality and enable early +diagnosis and intervention. This paper proposes a solution using a localized +large language model (LLM) to transcribe, translate, and summarize +doctor-patient conversations. We utilize the Whisper model for transcription +and GPT-3 to summarize them into the ePuskemas medical records format. This +system is implemented as an add-on to an existing web browser extension, +allowing doctors to fill out patient forms while talking. By leveraging this +solution for real-time transcription, translation, and summarization, doctors +can improve the turnaround time for patient care while enhancing the quality of +records, which become more detailed and insightful for future visits. This +innovation addresses challenges like overcrowded facilities and the +administrative burden on healthcare providers in Indonesia. We believe this +solution will help doctors save time, provide better care, and produce more +accurate medical records, representing a significant step toward modernizing +healthcare and ensuring patients receive timely, high-quality care, even in +resource-constrained settings. + +
+
+
+
+
+ + ☆ Detecting Temporal Ambiguity in Questions EMNLP 2024 + + +
+ Detecting and answering ambiguous questions has been a challenging task in +open-domain question answering. Ambiguous questions have different answers +depending on their interpretation and can take diverse forms. Temporally +ambiguous questions are one of the most common types of such questions. In this +paper, we introduce TEMPAMBIQA, a manually annotated temporally ambiguous QA +dataset consisting of 8,162 open-domain questions derived from existing +datasets. Our annotations focus on capturing temporal ambiguity to study the +task of detecting temporally ambiguous questions. We propose a novel approach +by using diverse search strategies based on disambiguated versions of the +questions. We also introduce and test non-search, competitive baselines for +detecting temporal ambiguity using zero-shot and few-shot approaches. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ How to Connect Speech Foundation Models and Large Language Models? What + Matters and What Does Not + + +
+ The remarkable performance achieved by Large Language Models (LLM) has driven +research efforts to leverage them for a wide range of tasks and input +modalities. In speech-to-text (S2T) tasks, the emerging solution consists of +projecting the output of the encoder of a Speech Foundational Model (SFM) into +the LLM embedding space through an adapter module. However, no work has yet +investigated how much the downstream-task performance depends on each component +(SFM, adapter, LLM) nor whether the best design of the adapter depends on the +chosen SFM and LLM. To fill this gap, we evaluate the combination of 5 adapter +modules, 2 LLMs (Mistral and Llama), and 2 SFMs (Whisper and SeamlessM4T) on +two widespread S2T tasks, namely Automatic Speech Recognition and Speech +Translation. Our results demonstrate that the SFM plays a pivotal role in +downstream performance, while the adapter choice has moderate impact and +depends on the SFM and LLM. + +
+
+
+
+
+ + ☆ Counterfactual Token Generation in Large Language Models + + +
+ "Sure, I am happy to generate a story for you: Captain Lyra stood at the helm +of her trusty ship, the Maelstrom's Fury, gazing out at the endless sea. [...] +Lyra's eyes welled up with tears as she realized the bitter truth - she had +sacrificed everything for fleeting riches, and lost the love of her crew, her +family, and herself." Although this story, generated by a large language model, +is captivating, one may wonder -- how would the story have unfolded if the +model had chosen "Captain Maeve" as the protagonist instead? We cannot know. +State-of-the-art large language models are stateless -- they maintain no +internal memory or state. Given a prompt, they generate a sequence of tokens as +an output using an autoregressive process. As a consequence, they cannot reason +about counterfactual alternatives to tokens they have generated in the past. In +this work, our goal is to enhance them with this functionality. To this end, we +develop a causal model of token generation that builds upon the Gumbel-Max +structural causal model. Our model allows any large language model to perform +counterfactual token generation at almost no cost in comparison with vanilla +token generation, it is embarrassingly simple to implement, and it does not +require any fine-tuning nor prompt engineering. We implement our model on Llama +3 8B-instruct and conduct both qualitative and quantitative analyses of +counterfactually generated text. We conclude with a demonstrative application +of counterfactual token generation for bias detection, unveiling interesting +insights about the model of the world constructed by large language models. + +
+
+
+
+
+ + ☆ LLM-CARD: Towards a Description and Landscape of Large Language Models + + +
+ With the rapid growth of the Natural Language Processing (NLP) field, a vast +variety of Large Language Models (LLMs) continue to emerge for diverse NLP +tasks. As an increasing number of papers are presented, researchers and +developers face the challenge of information overload. Thus, it is particularly +important to develop a system that can automatically extract and organise key +information about LLMs from academic papers (\textbf{LLM model card}). This +work is to develop such a pioneer system by using Named Entity Recognition +(\textbf{NER}) and Relation Extraction (\textbf{RE}) methods that automatically +extract key information about large language models from the papers, helping +researchers to efficiently access information about LLMs. These features +include model \textit{licence}, model \textit{name}, and model +\textit{application}. With these features, we can form a model card for each +paper. \textbf{Data-contribution} wise, 106 academic papers were processed by +defining three dictionaries - LLMs name, licence, and application. 11,051 +sentences were extracted through dictionary lookup, and the dataset was +constructed through manual review of the final selection of 129 sentences that +have a link between the name and the licence, and 106 sentences that have a +link between the model name and the application. + +
+
+ comment: ongoing work, 16 pages +
+
+
+
+
+ + ☆ Models Can and Should Embrace the Communicative Nature of + Human-Generated Math + + +
+ Math is constructed by people for people: just as natural language corpora +reflect not just propositions but the communicative goals of language users, +the math data that models are trained on reflects not just idealized +mathematical entities but rich communicative intentions. While there are +important advantages to treating math in a purely symbolic manner, we here +hypothesize that there are benefits to treating math as situated linguistic +communication and that language models are well suited for this goal, in ways +that are not fully appreciated. We illustrate these points with two case +studies. First, we ran an experiment in which we found that language models +interpret the equals sign in a humanlike way -- generating systematically +different word problems for the same underlying equation arranged in different +ways. Second, we found that language models prefer proofs to be ordered in +naturalistic ways, even though other orders would be logically equivalent. We +advocate for AI systems that learn from and represent the communicative +intentions latent in human-generated math. + +
+
+
+
+
+ + ☆ AXCEL: Automated eXplainable Consistency Evaluation using LLMs + + +
+ Large Language Models (LLMs) are widely used in both industry and academia +for various tasks, yet evaluating the consistency of generated text responses +continues to be a challenge. Traditional metrics like ROUGE and BLEU show a +weak correlation with human judgment. More sophisticated metrics using Natural +Language Inference (NLI) have shown improved correlations but are complex to +implement, require domain-specific training due to poor cross-domain +generalization, and lack explainability. More recently, prompt-based metrics +using LLMs as evaluators have emerged; while they are easier to implement, they +still lack explainability and depend on task-specific prompts, which limits +their generalizability. This work introduces Automated eXplainable Consistency +Evaluation using LLMs (AXCEL), a prompt-based consistency metric which offers +explanations for the consistency scores by providing detailed reasoning and +pinpointing inconsistent text spans. AXCEL is also a generalizable metric which +can be adopted to multiple tasks without changing the prompt. AXCEL outperforms +both non-prompt and prompt-based state-of-the-art (SOTA) metrics in detecting +inconsistencies across summarization by 8.7%, free text generation by 6.2%, and +data-to-text conversion tasks by 29.4%. We also evaluate the influence of +underlying LLMs on prompt based metric performance and recalibrate the SOTA +prompt-based metrics with the latest LLMs for fair comparison. Further, we show +that AXCEL demonstrates strong performance using open source LLMs. + +
+
+
+
+
+ + ☆ Decoding Large-Language Models: A Systematic Overview of Socio-Technical + Impacts, Constraints, and Emerging Questions + + +
+ There have been rapid advancements in the capabilities of large language +models (LLMs) in recent years, greatly revolutionizing the field of natural +language processing (NLP) and artificial intelligence (AI) to understand and +interact with human language. Therefore, in this work, we conduct a systematic +investigation of the literature to identify the prominent themes and directions +of LLM developments, impacts, and limitations. Our findings illustrate the +aims, methodologies, limitations, and future directions of LLM research. It +includes responsible development considerations, algorithmic improvements, +ethical challenges, and societal implications of LLM development. Overall, this +paper provides a rigorous and comprehensive overview of current research in LLM +and identifies potential directions for future development. The article +highlights the application areas that could have a positive impact on society +along with the ethical considerations. + +
+
+ comment: 28 pages, 5 figures, preprint submitted to journal +
+
+
+
+
+ + ☆ Adaptive Self-Supervised Learning Strategies for Dynamic On-Device LLM + Personalization + + +
+ Large language models (LLMs) have revolutionized how we interact with +technology, but their personalization to individual user preferences remains a +significant challenge, particularly in on-device applications. Traditional +methods often depend heavily on labeled datasets and can be resource-intensive. +To address these issues, we present Adaptive Self-Supervised Learning +Strategies (ASLS), which utilizes self-supervised learning techniques to +personalize LLMs dynamically. The framework comprises a user profiling layer +for collecting interaction data and a neural adaptation layer for real-time +model fine-tuning. This innovative approach enables continuous learning from +user feedback, allowing the model to generate responses that align closely with +user-specific contexts. The adaptive mechanisms of ASLS minimize computational +demands and enhance personalization efficiency. Experimental results across +various user scenarios illustrate the superior performance of ASLS in boosting +user engagement and satisfaction, highlighting its potential to redefine LLMs +as highly responsive and context-aware systems on-device. + +
+
+ comment: First ASLS +
+
+
+
+
+ + ☆ Weighted Cross-entropy for Low-Resource Languages in Multilingual Speech + Recognition + + +
+ This paper addresses the challenge of integrating low-resource languages into +multilingual automatic speech recognition (ASR) systems. We introduce a novel +application of weighted cross-entropy, typically used for unbalanced datasets, +to facilitate the integration of low-resource languages into pre-trained +multilingual ASR models within the context of continual multilingual learning. +We fine-tune the Whisper multilingual ASR model on five high-resource languages +and one low-resource language, employing language-weighted dynamic +cross-entropy and data augmentation. The results show a remarkable 6.69% word +error rate (WER) reduction for the low-resource language compared to the +fine-tuned model without applying our approach, and a 48.86% WER reduction +compared to the original Whisper model. In addition, our approach yields an +average WER reduction of 3.29% across the six languages, showing no degradation +for the high-resource languages. + +
+
+ comment: 5 pages, 1 figure. Presented at Interspeech 2024 +
+
+
+
+
+ + ☆ Cross-lingual Speech Emotion Recognition: Humans vs. Self-Supervised + Models + + +
+ Utilizing Self-Supervised Learning (SSL) models for Speech Emotion +Recognition (SER) has proven effective, yet limited research has explored +cross-lingual scenarios. This study presents a comparative analysis between +human performance and SSL models, beginning with a layer-wise analysis and an +exploration of parameter-efficient fine-tuning strategies in monolingual, +cross-lingual, and transfer learning contexts. We further compare the SER +ability of models and humans at both utterance- and segment-levels. +Additionally, we investigate the impact of dialect on cross-lingual SER through +human evaluation. Our findings reveal that models, with appropriate knowledge +transfer, can adapt to the target language and achieve performance comparable +to native speakers. We also demonstrate the significant effect of dialect on +SER for individuals without prior linguistic and paralinguistic background. +Moreover, both humans and models exhibit distinct behaviors across different +emotions. These results offer new insights into the cross-lingual SER +capabilities of SSL models, underscoring both their similarities to and +differences from human emotion perception. + +
+
+
+
+
+ + ☆ Zero-Shot Detection of LLM-Generated Text using Token Cohesiveness EMNLP 2024 + + +
+ The increasing capability and widespread usage of large language models +(LLMs) highlight the desirability of automatic detection of LLM-generated text. +Zero-shot detectors, due to their training-free nature, have received +considerable attention and notable success. In this paper, we identify a new +feature, token cohesiveness, that is useful for zero-shot detection, and we +demonstrate that LLM-generated text tends to exhibit higher token cohesiveness +than human-written text. Based on this observation, we devise TOCSIN, a generic +dual-channel detection paradigm that uses token cohesiveness as a plug-and-play +module to improve existing zero-shot detectors. To calculate token +cohesiveness, TOCSIN only requires a few rounds of random token deletion and +semantic difference measurement, making it particularly suitable for a +practical black-box setting where the source model used for generation is not +accessible. Extensive experiments with four state-of-the-art base detectors on +various datasets, source models, and evaluation settings demonstrate the +effectiveness and generality of the proposed approach. Code available at: +\url{https://github.com/Shixuan-Ma/TOCSIN}. + +
+
+ comment: To appear at the main conference of EMNLP 2024 +
+
+
+
+
+ + ☆ Pruning Multilingual Large Language Models for Multilingual Inference EMNLP 2024 + + +
+ Multilingual large language models (MLLMs), trained on multilingual balanced +data, demonstrate better zero-shot learning performance in non-English +languages compared to large language models trained on English-dominant data. +However, the disparity in performance between English and non-English languages +remains a challenge yet to be fully addressed. A distinctive characteristic of +MLLMs is their high-quality translation capabilities, indicating an acquired +proficiency in aligning between languages. This study explores how to enhance +the zero-shot performance of MLLMs in non-English languages by leveraging their +alignment capability between English and non-English languages. To achieve +this, we first analyze the behavior of MLLMs when performing translation and +reveal that there are large magnitude features that play a critical role in the +translation process. Inspired by these findings, we retain the weights +associated with operations involving the large magnitude features and prune +other weights to force MLLMs to rely on these features for tasks beyond +translation. We empirically demonstrate that this pruning strategy can enhance +the MLLMs' performance in non-English language. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Enhancing Temporal Sensitivity and Reasoning for Time-Sensitive Question + Answering EMNLP 2024 + + +
+ Time-Sensitive Question Answering (TSQA) demands the effective utilization of +specific temporal contexts, encompassing multiple time-evolving facts, to +address time-sensitive questions. This necessitates not only the parsing of +temporal information within questions but also the identification and +understanding of time-evolving facts to generate accurate answers. However, +current large language models still have limited sensitivity to temporal +information and their inadequate temporal reasoning capabilities.In this paper, +we propose a novel framework that enhances temporal awareness and reasoning +through Temporal Information-Aware Embedding and Granular Contrastive +Reinforcement Learning. Experimental results on four TSQA datasets demonstrate +that our framework significantly outperforms existing LLMs in TSQA tasks, +marking a step forward in bridging the performance gap between machine and +human temporal understanding and reasoning. + +
+
+ comment: Accepted by EMNLP 2024 Findings +
+
+
+
+
+ + ☆ A Roadmap for Embodied and Social Grounding in LLMs + + +
+ The fusion of Large Language Models (LLMs) and robotic systems has led to a +transformative paradigm in the robotic field, offering unparalleled +capabilities not only in the communication domain but also in skills like +multimodal input handling, high-level reasoning, and plan generation. The +grounding of LLMs knowledge into the empirical world has been considered a +crucial pathway to exploit the efficiency of LLMs in robotics. Nevertheless, +connecting LLMs' representations to the external world with multimodal +approaches or with robots' bodies is not enough to let them understand the +meaning of the language they are manipulating. Taking inspiration from humans, +this work draws attention to three necessary elements for an agent to grasp and +experience the world. The roadmap for LLMs grounding is envisaged in an active +bodily system as the reference point for experiencing the environment, a +temporally structured experience for a coherent, self-related interaction with +the external world, and social skills to acquire a common-grounded shared +experience. + +
+
+ comment: Accepted Version of a conference paper presented at Robophilosophy + Conference 2024 +
+
+
+
+
+ + ♻ ☆ RLHFuse: Efficient RLHF Training for Large Language Models with Inter- + and Intra-Stage Fusion + + +
+ Reinforcement Learning from Human Feedback (RLHF) enhances the alignment +between LLMs and human preference. The workflow of RLHF typically involves +several models and tasks in a series of distinct stages. Existing RLHF training +systems view each task as the smallest execution unit thus overlooking the +opportunities for subtask-level optimizations. Due to the intrinsic nature of +RLHF training, i.e., the data skewness in the generation stage, and the +pipeline bubbles in the training stage, existing RLHF systems suffer from low +GPU utilization in production deployments. + RLHFuse breaks the traditional view of RLHF workflow as a composition of +individual tasks, splitting each task into finer-grained subtasks, and +performing stage fusion to improve GPU utilization. RLHFuse contains two key +ideas. First, for generation and inference tasks, RLHFuse splits them into +sample-level subtasks, enabling efficient inter-stage fusion to mitigate the +original generation bottleneck dominated by long-tailed samples. Second, for +training tasks, RLHFuse breaks them into subtasks of micro-batches. By +leveraging the intuition that pipeline execution can be essentially +complemented by another pipeline, RLHFuse performs intra-stage fusion to +concurrently execute these subtasks in the training stage with a fused pipeline +schedule, resulting in fewer pipeline bubbles. In addition, RLHFuse +incorporates a series of system optimizations tailored for each stage of RLHF, +making it efficient and scalable for our internal product usage. We evaluate +RLHFuse on various popular LLMs and the results show that RLHFuse increases the +training throughput by up to 3.7x, compared to existing state-of-the-art +systems. + +
+
+
+
+
+ + ♻ ☆ Iterative Improvement of an Additively Regularized Topic Model + + +
+ Topic modelling is fundamentally a soft clustering problem (of known objects +-- documents, over unknown clusters -- topics). That is, the task is +incorrectly posed. In particular, the topic models are unstable and incomplete. +All this leads to the fact that the process of finding a good topic model +(repeated hyperparameter selection, model training, and topic quality +assessment) can be particularly long and labor-intensive. We aim to simplify +the process, to make it more deterministic and provable. To this end, we +present a method for iterative training of a topic model. The essence of the +method is that a series of related topic models are trained so that each +subsequent model is at least as good as the previous one, i.e., that it retains +all the good topics found earlier. The connection between the models is +achieved by additive regularization. The result of this iterative training is +the last topic model in the series, which we call the iteratively updated +additively regularized topic model (ITAR). Experiments conducted on several +collections of natural language texts show that the proposed ITAR model +performs better than other popular topic models (LDA, ARTM, BERTopic), its +topics are diverse, and its perplexity (ability to "explain" the underlying +data) is moderate. + +
+
+ comment: Make the last little additions to the draft +
+
+
+
+
+ + ♻ ☆ Holmes: A Benchmark to Assess the Linguistic Competence of Language + Models + + +
+ We introduce Holmes, a new benchmark designed to assess language models (LMs) +linguistic competence - their unconscious understanding of linguistic +phenomena. Specifically, we use classifier-based probing to examine LMs' +internal representations regarding distinct linguistic phenomena (e.g., +part-of-speech tagging). As a result, we meet recent calls to disentangle LMs' +linguistic competence from other cognitive abilities, such as following +instructions in prompting-based evaluations. Composing Holmes, we review over +270 probing studies and include more than 200 datasets to assess syntax, +morphology, semantics, reasoning, and discourse phenomena. Analyzing over 50 +LMs reveals that, aligned with known trends, their linguistic competence +correlates with model size. However, surprisingly, model architecture and +instruction tuning also significantly influence performance, particularly in +morphology and syntax. Finally, we propose FlashHolmes, a streamlined version +that reduces the computation load while maintaining high-ranking precision. + +
+
+
+
+
+ + ♻ ☆ In-Context Learning with Representations: Contextual Generalization of + Trained Transformers NeurIPS 2024 + + +
+ In-context learning (ICL) refers to a remarkable capability of pretrained +large language models, which can learn a new task given a few examples during +inference. However, theoretical understanding of ICL is largely under-explored, +particularly whether transformers can be trained to generalize to unseen +examples in a prompt, which will require the model to acquire contextual +knowledge of the prompt for generalization. This paper investigates the +training dynamics of transformers by gradient descent through the lens of +non-linear regression tasks. The contextual generalization here can be attained +via learning the template function for each task in-context, where all template +functions lie in a linear space with $m$ basis functions. We analyze the +training dynamics of one-layer multi-head transformers to in-contextly predict +unlabeled inputs given partially labeled prompts, where the labels contain +Gaussian noise and the number of examples in each prompt are not sufficient to +determine the template. Under mild assumptions, we show that the training loss +for a one-layer multi-head transformer converges linearly to a global minimum. +Moreover, the transformer effectively learns to perform ridge regression over +the basis functions. To our knowledge, this study is the first provable +demonstration that transformers can learn contextual (i.e., template) +information to generalize to both unseen examples and tasks when prompts +contain only a small number of query-answer pairs. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Spectra: A Comprehensive Study of Ternary, Quantized, and FP16 Language + Models + + +
+ Post-training quantization is the leading method for addressing +memory-related bottlenecks in LLM inference, but unfortunately, it suffers from +significant performance degradation below 4-bit precision. An alternative +approach involves training compressed models directly at a low bitwidth (e.g., +binary or ternary models). However, the performance, training dynamics, and +scaling trends of such models are not yet well understood. To address this +issue, we train and openly release the Spectra LLM suite consisting of 54 +language models ranging from 99M to 3.9B parameters, trained on 300B tokens. +Spectra includes FloatLMs, post-training quantized QuantLMs (3, 4, 6, and 8 +bits), and ternary LLMs (TriLMs) - our improved architecture for ternary +language modeling, which significantly outperforms previously proposed ternary +models of a given size (in bits), matching half-precision models at scale. For +example, TriLM 3.9B is (bit-wise) smaller than the half-precision FloatLM 830M, +but matches half-precision FloatLM 3.9B in commonsense reasoning and knowledge +benchmarks. However, TriLM 3.9B is also as toxic and stereotyping as FloatLM +3.9B, a model six times larger in size. Additionally, TriLM 3.9B lags behind +FloatLM in perplexity on validation splits and web-based corpora but performs +better on less noisy datasets like Lambada and PennTreeBank. + To enhance understanding of low-bitwidth models, we are releasing 500+ +intermediate checkpoints of the Spectra suite at +\href{https://github.com/NolanoOrg/SpectraSuite}{https://github.com/NolanoOrg/SpectraSuite}. + +
+
+ comment: 32 pages, 12 figures, and 10 tables +
+
+
+
+
+ + ♻ ☆ Asking an AI for salary negotiation advice is a matter of concern: + Controlled experimental perturbation of ChatGPT for protected and + non-protected group discrimination on a contextual task with no clear ground + truth answers + + +
+ We conducted controlled experimental bias audits for four versions of +ChatGPT, which we asked to recommend an opening offer in salary negotiations +for a new hire. We submitted 98,800 prompts to each version, systematically +varying the employee's gender, university, and major, and tested prompts in +voice of each side of the negotiation: the employee versus employer. We find +ChatGPT as a multi-model platform is not robust and consistent enough to be +trusted for such a task. We observed statistically significant salary offers +when varying gender for all four models, although with smaller gaps than for +other attributes tested. The largest gaps were different model versions and +between the employee- vs employer-voiced prompts. We also observed substantial +gaps when varying university and major, but many of the biases were not +consistent across model versions. We tested for fictional and fraudulent +universities and found wildly inconsistent results across cases and model +versions. We make broader contributions to the AI/ML fairness literature. Our +scenario and our experimental design differ from mainstream AI/ML auditing +efforts in key ways. Bias audits typically test discrimination for protected +classes like gender, which we contrast with testing non-protected classes of +university and major. Asking for negotiation advice includes how aggressive one +ought to be in a negotiation relative to known empirical salary distributions +and scales, which is a deeply contextual and personalized task that has no +objective ground truth to validate. These results raise concerns for the +specific model versions we tested and ChatGPT as a multi-model platform in +continuous development. Our epistemology does not permit us to definitively +certify these models as either generally biased or unbiased on the attributes +we test, but our study raises matters of concern for stakeholders to further +investigate. + +
+
+
+
+
+ + ♻ ☆ MMoE: Enhancing Multimodal Models with Mixtures of Multimodal + Interaction Experts + + +
+ Advances in multimodal models have greatly improved how interactions relevant +to various tasks are modeled. Today's multimodal models mainly focus on the +correspondence between images and text, using this for tasks like image-text +matching. However, this covers only a subset of real-world interactions. Novel +interactions, such as sarcasm expressed through opposing spoken words and +gestures or humor expressed through utterances and tone of voice, remain +challenging. In this paper, we introduce an approach to enhance multimodal +models, which we call Multimodal Mixtures of Experts (MMoE). The key idea in +MMoE is to train separate expert models for each type of multimodal +interaction, such as redundancy present in both modalities, uniqueness in one +modality, or synergy that emerges when both modalities are fused. On a sarcasm +detection task (MUStARD) and a humor detection task (URFUNNY), we obtain new +state-of-the-art results. MMoE is also able to be applied to various types of +models to gain improvement. + +
+
+
+
+
+ + ♻ ☆ Pre-trained Language Models Do Not Help Auto-regressive Text-to-Image + Generation EMNLP 2024 + + +
+ Recent advances in image tokenizers, such as VQ-VAE, have enabled +text-to-image generation using auto-regressive methods, similar to language +modeling. However, these methods have yet to leverage pre-trained language +models, despite their adaptability to various downstream tasks. In this work, +we explore this gap by adapting a pre-trained language model for +auto-regressive text-to-image generation, and find that pre-trained language +models offer limited help. We provide a two-fold explanation by analyzing +tokens from each modality. First, we demonstrate that image tokens possess +significantly different semantics compared to text tokens, rendering +pre-trained language models no more effective in modeling them than randomly +initialized ones. Second, the text tokens in the image-text datasets are too +simple compared to normal language model pre-training data, which causes the +catastrophic degradation of language models' capability. + +
+
+ comment: Published at EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Benchmarking Cognitive Biases in Large Language Models as Evaluators ACL 2024 + + +
+ Large Language Models are cognitively biased judges. Large Language Models +(LLMs) have recently been shown to be effective as automatic evaluators with +simple prompting and in-context learning. In this work, we assemble 15 LLMs of +four different size ranges and evaluate their output responses by preference +ranking from the other LLMs as evaluators, such as System Star is better than +System Square. We then evaluate the quality of ranking outputs introducing the +Cognitive Bias Benchmark for LLMs as Evaluators (CoBBLEr), a benchmark to +measure six different cognitive biases in LLM evaluation outputs, such as the +Egocentric bias where a model prefers to rank its own outputs highly in +evaluation. We find that LLMs are biased text quality evaluators, exhibiting +strong indications on our bias benchmark (average of 40% of comparisons across +all models) within each of their evaluations that question their robustness as +evaluators. Furthermore, we examine the correlation between human and machine +preferences and calculate the average Rank-Biased Overlap (RBO) score to be +49.6%, indicating that machine preferences are misaligned with humans. +According to our findings, LLMs may still be unable to be utilized for +automatic annotation aligned with human preferences. Our project page is at: +https://minnesotanlp.github.io/cobbler. + +
+
+ comment: Publishsed at ACL 2024. 29 pages, 9 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Is This a Bad Table? A Closer Look at the Evaluation of Table Generation + from Text + + +
+ Understanding whether a generated table is of good quality is important to be +able to use it in creating or editing documents using automatic methods. In +this work, we underline that existing measures for table quality evaluation +fail to capture the overall semantics of the tables, and sometimes unfairly +penalize good tables and reward bad ones. We propose TabEval, a novel table +evaluation strategy that captures table semantics by first breaking down a +table into a list of natural language atomic statements and then compares them +with ground truth statements using entailment-based measures. To validate our +approach, we curate a dataset comprising of text descriptions for 1,250 diverse +Wikipedia tables, covering a range of topics and structures, in contrast to the +limited scope of existing datasets. We compare TabEval with existing metrics +using unsupervised and supervised text-to-table generation methods, +demonstrating its stronger correlation with human judgments of table quality +across four datasets. + +
+
+
+
+
+ + ♻ ☆ A is for Absorption: Studying Feature Splitting and Absorption in Sparse + Autoencoders + + +
+ Sparse Autoencoders (SAEs) have emerged as a promising approach to decompose +the activations of Large Language Models (LLMs) into human-interpretable +latents. In this paper, we pose two questions. First, to what extent do SAEs +extract monosemantic and interpretable latents? Second, to what extent does +varying the sparsity or the size of the SAE affect monosemanticity / +interpretability? By investigating these questions in the context of a simple +first-letter identification task where we have complete access to ground truth +labels for all tokens in the vocabulary, we are able to provide more detail +than prior investigations. Critically, we identify a problematic form of +feature-splitting we call feature absorption where seemingly monosemantic +latents fail to fire in cases where they clearly should. Our investigation +suggests that varying SAE size or sparsity is insufficient to solve this issue, +and that there are deeper conceptual issues in need of resolution. + +
+
+
+
+
+ + ♻ ☆ Ranking Manipulation for Conversational Search Engines + + +
+ Major search engine providers are rapidly incorporating Large Language Model +(LLM)-generated content in response to user queries. These conversational +search engines operate by loading retrieved website text into the LLM context +for summarization and interpretation. Recent research demonstrates that LLMs +are highly vulnerable to jailbreaking and prompt injection attacks, which +disrupt the safety and quality goals of LLMs using adversarial strings. This +work investigates the impact of prompt injections on the ranking order of +sources referenced by conversational search engines. To this end, we introduce +a focused dataset of real-world consumer product websites and formalize +conversational search ranking as an adversarial problem. Experimentally, we +analyze conversational search rankings in the absence of adversarial injections +and show that different LLMs vary significantly in prioritizing product name, +document content, and context position. We then present a tree-of-attacks-based +jailbreaking technique which reliably promotes low-ranked products. +Importantly, these attacks transfer effectively to state-of-the-art +conversational search engines such as perplexity$.$ai. Given the strong +financial incentive for website owners to boost their search ranking, we argue +that our problem formulation is of critical importance for future robustness +work. + +
+
+ comment: 2024 Conference on Empirical Methods in Natural Language Processing + (Main) +
+
+
+
+
+ + ♻ ☆ Towards Trustworthy Reranking: A Simple yet Effective Abstention + Mechanism + + +
+ Neural Information Retrieval (NIR) has significantly improved upon +heuristic-based Information Retrieval (IR) systems. Yet, failures remain +frequent, the models used often being unable to retrieve documents relevant to +the user's query. We address this challenge by proposing a lightweight +abstention mechanism tailored for real-world constraints, with particular +emphasis placed on the reranking phase. We introduce a protocol for evaluating +abstention strategies in black-box scenarios (typically encountered when +relying on API services), demonstrating their efficacy, and propose a simple +yet effective data-driven mechanism. We provide open-source code for experiment +replication and abstention implementation, fostering wider adoption and +application in diverse contexts. + +
+
+
+
+
+ + ♻ ☆ Keeping Up with the Language Models: Systematic Benchmark Extension for + Bias Auditing + + +
+ Bias auditing of language models (LMs) has received considerable attention as +LMs are becoming widespread. As such, several benchmarks for bias auditing have +been proposed. At the same time, the rapid evolution of LMs can make these +benchmarks irrelevant in no time. Bias auditing is further complicated by LM +brittleness: when a presumably biased outcome is observed, is it due to model +bias or model brittleness? We propose enlisting the models themselves to help +construct bias auditing datasets that remain challenging, and introduce bias +measures that distinguish between different types of model errors. First, we +extend an existing bias benchmark for NLI (BBNLI) using a combination of +LM-generated lexical variations, adversarial filtering, and human validation. +We demonstrate that the newly created dataset BBNLI-next is more challenging +than BBNLI: on average, BBNLI-next reduces the accuracy of state-of-the-art NLI +models from 95.3%, as observed by BBNLI, to a strikingly low 57.5%. Second, we +employ BBNLI-next to showcase the interplay between robustness and bias: we +point out shortcomings in current bias scores and propose bias measures that +take into account both bias and model brittleness. Third, despite the fact that +BBNLI-next was designed with non-generative models in mind, we show that the +new dataset is also able to uncover bias in state-of-the-art open-source +generative LMs. + Note: All datasets included in this work are in English and they address +US-centered social biases. In the spirit of efficient NLP research, no model +training or fine-tuning was performed to conduct this research. + Warning: This paper contains offensive text examples. + +
+
+
+
+
+ + ♻ ☆ Robust Interaction-Based Relevance Modeling for Online e-Commerce Search ECML + + +
+ Semantic relevance calculation is crucial for e-commerce search engines, as +it ensures that the items selected closely align with customer intent. +Inadequate attention to this aspect can detrimentally affect user experience +and engagement. Traditional text-matching techniques are prevalent but often +fail to capture the nuances of search intent accurately, so neural networks now +have become a preferred solution to processing such complex text matching. +Existing methods predominantly employ representation-based architectures, which +strike a balance between high traffic capacity and low latency. However, they +exhibit significant shortcomings in generalization and robustness when compared +to interaction-based architectures. In this work, we introduce a robust +interaction-based modeling paradigm to address these shortcomings. It +encompasses 1) a dynamic length representation scheme for expedited inference, +2) a professional terms recognition method to identify subjects and core +attributes from complex sentence structures, and 3) a contrastive adversarial +training protocol to bolster the model's robustness and matching capabilities. +Extensive offline evaluations demonstrate the superior robustness and +effectiveness of our approach, and online A/B testing confirms its ability to +improve relevance in the same exposure position, resulting in more clicks and +conversions. To the best of our knowledge, this method is the first +interaction-based approach for large e-commerce search relevance calculation. +Notably, we have deployed it for the entire search traffic on alibaba.com, the +largest B2B e-commerce platform in the world. + +
+
+ comment: Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7 + tables +
+
+
+
+
+ + ♻ ☆ HuatuoGPT-Vision, Towards Injecting Medical Visual Knowledge into + Multimodal LLMs at Scale + + +
+ The rapid development of multimodal large language models (MLLMs), such as +GPT-4V, has led to significant advancements. However, these models still face +challenges in medical multimodal capabilities due to limitations in the +quantity and quality of medical vision-text data, stemming from data privacy +concerns and high annotation costs. While pioneering approaches utilize +PubMed's large-scale, de-identified medical image-text pairs to address these +limitations, they still fall short due to inherent data noise. To tackle this, +we refined medical image-text pairs from PubMed and employed MLLMs (GPT-4V) in +an 'unblinded' capacity to denoise and reformat the data, resulting in the +creation of the PubMedVision dataset with 1.3 million medical VQA samples. Our +validation demonstrates that: (1) PubMedVision can significantly enhance the +medical multimodal capabilities of current MLLMs, showing significant +improvement in benchmarks including the MMMU Health & Medicine track; (2) +manual checks by medical experts and empirical results validate the superior +data quality of our dataset compared to other data construction methods. Using +PubMedVision, we train a 34B medical MLLM HuatuoGPT-Vision, which shows +superior performance in medical multimodal scenarios among open-source MLLMs. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 19 + +
+
+
+ + ☆ GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for + Improved Visual Localization + + +
+ Although various visual localization approaches exist, such as scene +coordinate and pose regression, these methods often struggle with high memory +consumption or extensive optimization requirements. To address these +challenges, we utilize recent advancements in novel view synthesis, +particularly 3D Gaussian Splatting (3DGS), to enhance localization. 3DGS allows +for the compact encoding of both 3D geometry and scene appearance with its +spatial features. Our method leverages the dense description maps produced by +XFeat's lightweight keypoint detection and description model. We propose +distilling these dense keypoint descriptors into 3DGS to improve the model's +spatial understanding, leading to more accurate camera pose predictions through +2D-3D correspondences. After estimating an initial pose, we refine it using a +photometric warping loss. Benchmarking on popular indoor and outdoor datasets +shows that our approach surpasses state-of-the-art Neural Render Pose (NRP) +methods, including NeRFMatch and PNeRFLoc. + +
+
+ comment: Project website at https://gsplatloc.github.io/ +
+
+
+
+
+ + ☆ Clarke Transform -- A Fundamental Tool for Continuum Robotics + + +
+ This article introduces the Clarke transform and Clarke coordinates, which +present a solution to the disengagement of an arbitrary number of coupled +displacement actuation of continuum and soft robots. The Clarke transform +utilizes the generalized Clarke transformation and its inverse to reduce any +number of joint values to a two-dimensional space without sacrificing any +significant information. This space is the manifold of the joint space and is +described by two orthogonal Clarke coordinates. Application to kinematics, +sampling, and control are presented. By deriving the solution to the previously +unknown forward robot-dependent mapping for an arbitrary number of joints, the +forward and inverse kinematics formulations are branchless, closed-form, and +singular-free. Sampling is used as a proxy for gauging the performance +implications for various methods and frameworks, leading to a branchless, +closed-form, and vectorizable sampling method with a 100 percent success rate +and the possibility to shape desired distributions. Due to the utilization of +the manifold, the fairly simple constraint-informed, two-dimensional, and +linear controller always provides feasible control outputs. On top of that, the +relations to improved representations in continuum and soft robotics are +established, where the Clarke coordinates are their generalizations. + The Clarke transform offers valuable geometric insights and paves the way for +developing approaches directly on the two-dimensional manifold within the +high-dimensional joint space, ensuring compliance with the constraint. While +being an easy-to-construct linear map, the proposed Clarke transform is +mathematically consistent, physically meaningful, as well as interpretable and +contributes to the unification of frameworks across continuum and soft robots. + +
+
+ comment: 27 pages, 11 figures, 5 tables +
+
+
+
+
+ + ☆ BehAV: Behavioral Rule Guided Autonomy Using VLMs for Robot Navigation + in Outdoor Scenes + + +
+ We present BehAV, a novel approach for autonomous robot navigation in outdoor +scenes guided by human instructions and leveraging Vision Language Models +(VLMs). Our method interprets human commands using a Large Language Model (LLM) +and categorizes the instructions into navigation and behavioral guidelines. +Navigation guidelines consist of directional commands (e.g., "move forward +until") and associated landmarks (e.g., "the building with blue windows"), +while behavioral guidelines encompass regulatory actions (e.g., "stay on") and +their corresponding objects (e.g., "pavements"). We use VLMs for their +zero-shot scene understanding capabilities to estimate landmark locations from +RGB images for robot navigation. Further, we introduce a novel scene +representation that utilizes VLMs to ground behavioral rules into a behavioral +cost map. This cost map encodes the presence of behavioral objects within the +scene and assigns costs based on their regulatory actions. The behavioral cost +map is integrated with a LiDAR-based occupancy map for navigation. To navigate +outdoor scenes while adhering to the instructed behaviors, we present an +unconstrained Model Predictive Control (MPC)-based planner that prioritizes +both reaching landmarks and following behavioral guidelines. We evaluate the +performance of BehAV on a quadruped robot across diverse real-world scenarios, +demonstrating a 22.49% improvement in alignment with human-teleoperated +actions, as measured by Frechet distance, and achieving a 40% higher navigation +success rate compared to state-of-the-art methods. + +
+
+
+
+
+ + ☆ KinScene: Model-Based Mobile Manipulation of Articulated Scenes + + +
+ Sequentially interacting with articulated objects is crucial for a mobile +manipulator to operate effectively in everyday environments. To enable +long-horizon tasks involving articulated objects, this study explores building +scene-level articulation models for indoor scenes through autonomous +exploration. While previous research has studied mobile manipulation with +articulated objects by considering object kinematic constraints, it primarily +focuses on individual-object scenarios and lacks extension to a scene-level +context for task-level planning. To manipulate multiple object parts +sequentially, the robot needs to reason about the resultant motion of each part +and anticipate its impact on future actions.We introduce \ourtool{}, a +full-stack approach for long-horizon manipulation tasks with articulated +objects. The robot maps the scene, detects and physically interacts with +articulated objects, collects observations, and infers the articulation +properties. For sequential tasks, the robot plans a feasible series of object +interactions based on the inferred articulation model. We demonstrate that our +approach repeatably constructs accurate scene-level kinematic and geometric +models, enabling long-horizon mobile manipulation in a real-world scene. Code +and additional results are available at +https://chengchunhsu.github.io/KinScene/ + +
+
+
+
+
+ + ☆ Frequency-based View Selection in Gaussian Splatting Reconstruction + + +
+ Three-dimensional reconstruction is a fundamental problem in robotics +perception. We examine the problem of active view selection to perform 3D +Gaussian Splatting reconstructions with as few input images as possible. +Although 3D Gaussian Splatting has made significant progress in image rendering +and 3D reconstruction, the quality of the reconstruction is strongly impacted +by the selection of 2D images and the estimation of camera poses through +Structure-from-Motion (SfM) algorithms. Current methods to select views that +rely on uncertainties from occlusions, depth ambiguities, or neural network +predictions directly are insufficient to handle the issue and struggle to +generalize to new scenes. By ranking the potential views in the frequency +domain, we are able to effectively estimate the potential information gain of +new viewpoints without ground truth data. By overcoming current constraints on +model architecture and efficacy, our method achieves state-of-the-art results +in view selection, demonstrating its potential for efficient image-based 3D +reconstruction. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Learning Dynamics of a Ball with Differentiable Factor Graph and + Roto-Translational Invariant Representations ICRA 2025 + + +
+ Robots in dynamic environments need fast, accurate models of how objects move +in their environments to support agile planning. In sports such as ping pong, +analytical models often struggle to accurately predict ball trajectories with +spins due to complex aerodynamics, elastic behaviors, and the challenges of +modeling sliding and rolling friction. On the other hand, despite the promise +of data-driven methods, machine learning struggles to make accurate, consistent +predictions without precise input. In this paper, we propose an end-to-end +learning framework that can jointly train a dynamics model and a factor graph +estimator. Our approach leverages a Gram-Schmidt (GS) process to extract +roto-translational invariant representations to improve the model performance, +which can further reduce the validation error compared to data augmentation +method. Additionally, we propose a network architecture that enhances +nonlinearity by using self-multiplicative bypasses in the layer connections. By +leveraging these novel methods, our proposed approach predicts the ball's +position with an RMSE of 37.2 mm of the paddle radius at the apex after the +first bounce, and 71.5 mm after the second bounce. + +
+
+ comment: ICRA 2025 +
+
+
+
+
+ + ☆ Initialization of Monocular Visual Navigation for Autonomous Agents + Using Modified Structure from Small Motion + + +
+ We propose a standalone monocular visual Simultaneous Localization and +Mapping (vSLAM) initialization pipeline for autonomous robots in space. Our +method, a state-of-the-art factor graph optimization pipeline, enhances +classical Structure from Small Motion (SfSM) to robustly initialize a monocular +agent in weak-perspective projection scenes. Furthermore, it overcomes visual +estimation challenges introduced by spacecraft inspection trajectories, such +as: center-pointing motion, which exacerbates the bas-relief ambiguity, and the +presence of a dominant plane in the scene, which causes motion estimation +degeneracies in classical Structure from Motion (SfM). We validate our method +on realistic, simulated satellite inspection images exhibiting weak-perspective +projection, and we demonstrate its effectiveness and improved performance +compared to other monocular initialization procedures. + +
+
+ comment: 6 pages, 1 page for references, 6 figures, 1 table, IEEEtran format + This work has been submitted to the IEEE for possible publication. Copyright + may be transferred without notice, after which this version may no longer be + accessible +
+
+
+
+
+ + ☆ MBC: Multi-Brain Collaborative Control for Quadruped Robots + + +
+ In the field of locomotion task of quadruped robots, Blind Policy and +Perceptive Policy each have their own advantages and limitations. The Blind +Policy relies on preset sensor information and algorithms, suitable for known +and structured environments, but it lacks adaptability in complex or unknown +environments. The Perceptive Policy uses visual sensors to obtain detailed +environmental information, allowing it to adapt to complex terrains, but its +effectiveness is limited under occluded conditions, especially when perception +fails. Unlike the Blind Policy, the Perceptive Policy is not as robust under +these conditions. To address these challenges, we propose a MBC:Multi-Brain +collaborative system that incorporates the concepts of Multi-Agent +Reinforcement Learning and introduces collaboration between the Blind Policy +and the Perceptive Policy. By applying this multi-policy collaborative model to +a quadruped robot, the robot can maintain stable locomotion even when the +perceptual system is impaired or observational data is incomplete. Our +simulations and real-world experiments demonstrate that this system +significantly improves the robot's passability and robustness against +perception failures in complex environments, validating the effectiveness of +multi-policy collaboration in enhancing robotic motion performance. + +
+
+ comment: 18 pages, 9 figures, Website and Videos: https://quad-mbc.github.io/ +
+
+
+
+
+ + ☆ MultiTalk: Introspective and Extrospective Dialogue for + Human-Environment-LLM Alignment + + +
+ LLMs have shown promising results in task planning due to their strong +natural language understanding and reasoning capabilities. However, issues such +as hallucinations, ambiguities in human instructions, environmental +constraints, and limitations in the executing agent's capabilities often lead +to flawed or incomplete plans. This paper proposes MultiTalk, an LLM-based task +planning methodology that addresses these issues through a framework of +introspective and extrospective dialogue loops. This approach helps ground +generated plans in the context of the environment and the agent's capabilities, +while also resolving uncertainties and ambiguities in the given task. These +loops are enabled by specialized systems designed to extract and predict +task-specific states, and flag mismatches or misalignments among the human +user, the LLM agent, and the environment. Effective feedback pathways between +these systems and the LLM planner foster meaningful dialogue. The efficacy of +this methodology is demonstrated through its application to robotic +manipulation tasks. Experiments and ablations highlight the robustness and +reliability of our method, and comparisons with baselines further illustrate +the superiority of MultiTalk in task planning for embodied agents. + +
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ Hierarchical Hybrid Learning for Long-Horizon Contact-Rich Robotic + Assembly + + +
+ Generalizable long-horizon robotic assembly requires reasoning at multiple +levels of abstraction. End-to-end imitation learning (IL) has been proven a +promising approach, but it requires a large amount of demonstration data for +training and often fails to meet the high-precision requirement of assembly +tasks. Reinforcement Learning (RL) approaches have succeeded in high-precision +assembly tasks, but suffer from sample inefficiency and hence, are less +competent at long-horizon tasks. To address these challenges, we propose a +hierarchical modular approach, named ARCH (Adaptive Robotic Composition +Hierarchy), which enables long-horizon high-precision assembly in contact-rich +settings. ARCH employs a hierarchical planning framework, including a low-level +primitive library of continuously parameterized skills and a high-level policy. +The low-level primitive library includes essential skills for assembly tasks, +such as grasping and inserting. These primitives consist of both RL and +model-based controllers. The high-level policy, learned via imitation learning +from a handful of demonstrations, selects the appropriate primitive skills and +instantiates them with continuous input parameters. We extensively evaluate our +approach on a real robot manipulation platform. We show that while trained on a +single task, ARCH generalizes well to unseen tasks and outperforms baseline +methods in terms of success rate and data efficiency. Videos can be found at +https://long-horizon-assembly.github.io. + +
+
+
+
+
+ + ☆ Hand Gesture Classification Based on Forearm Ultrasound Video Snippets + Using 3D Convolutional Neural Networks + + +
+ Ultrasound based hand movement estimation is a crucial area of research with +applications in human-machine interaction. Forearm ultrasound offers detailed +information about muscle morphology changes during hand movement which can be +used to estimate hand gestures. Previous work has focused on analyzing +2-Dimensional (2D) ultrasound image frames using techniques such as +convolutional neural networks (CNNs). However, such 2D techniques do not +capture temporal features from segments of ultrasound data corresponding to +continuous hand movements. This study uses 3D CNN based techniques to capture +spatio-temporal patterns within ultrasound video segments for gesture +recognition. We compared the performance of a 2D convolution-based network with +(2+1)D convolution-based, 3D convolution-based, and our proposed network. Our +methodology enhanced the gesture classification accuracy to 98.8 +/- 0.9%, from +96.5 +/- 2.3% compared to a network trained with 2D convolution layers. These +results demonstrate the advantages of using ultrasound video snippets for +improving hand gesture classification performance. + +
+
+ comment: Accepted to IUS 2024 +
+
+
+
+
+ + ☆ Improving Intersession Reproducibility for Forearm Ultrasound based Hand + Gesture Classification through an Incremental Learning Approach + + +
+ Ultrasound images of the forearm can be used to classify hand gestures +towards developing human machine interfaces. In our previous work, we have +demonstrated gesture classification using ultrasound on a single subject +without removing the probe before evaluation. This has limitations in usage as +once the probe is removed and replaced, the accuracy declines since the +classifier performance is sensitive to the probe location on the arm. In this +paper, we propose training a model on multiple data collection sessions to +create a generalized model, utilizing incremental learning through fine tuning. +Ultrasound data was acquired for 5 hand gestures within a session (without +removing and putting the probe back on) and across sessions. A convolutional +neural network (CNN) with 5 cascaded convolution layers was used for this +study. A pre-trained CNN was fine tuned with the convolution blocks acting as a +feature extractor, and the parameters of the remaining layers updated in an +incremental fashion. Fine tuning was done using different session splits within +a session and between multiple sessions. We found that incremental fine tuning +can help enhance classification accuracy with more fine tuning sessions. After +2 fine tuning sessions for each experiment, we found an approximate 10% +increase in classification accuracy. This work demonstrates that incremental +learning through fine tuning on ultrasound based hand gesture classification +can be used improves accuracy while saving storage, processing power, and time. +It can be expanded to generalize between multiple subjects and towards +developing personalized wearable devices. + +
+
+ comment: Accepted to IUS 2024 +
+
+
+
+
+ + ☆ Vision-based Xylem Wetness Classification in Stem Water Potential + Determination + + +
+ Water is often overused in irrigation, making efficient management of it +crucial. Precision Agriculture emphasizes tools like stem water potential (SWP) +analysis for better plant status determination. However, such tools often +require labor-intensive in-situ sampling. Automation and machine learning can +streamline this process and enhance outcomes. This work focused on automating +stem detection and xylem wetness classification using the Scholander Pressure +Chamber, a widely used but demanding method for SWP measurement. The aim was to +refine stem detection and develop computer-vision-based methods to better +classify water emergence at the xylem. To this end, we collected and manually +annotated video data, applying vision- and learning-based methods for detection +and classification. Additionally, we explored data augmentation and fine-tuned +parameters to identify the most effective models. The identified +best-performing models for stem detection and xylem wetness classification were +evaluated end-to-end over 20 SWP measurements. Learning-based stem detection +via YOLOv8n combined with ResNet50-based classification achieved a Top-1 +accuracy of 80.98%, making it the best-performing approach for xylem wetness +classification. + +
+
+
+
+
+ + ☆ Rao-Blackwellized POMDP Planning + + +
+ Partially Observable Markov Decision Processes (POMDPs) provide a structured +framework for decision-making under uncertainty, but their application requires +efficient belief updates. Sequential Importance Resampling Particle Filters +(SIRPF), also known as Bootstrap Particle Filters, are commonly used as belief +updaters in large approximate POMDP solvers, but they face challenges such as +particle deprivation and high computational costs as the system's state +dimension grows. To address these issues, this study introduces +Rao-Blackwellized POMDP (RB-POMDP) approximate solvers and outlines generic +methods to apply Rao-Blackwellization in both belief updates and online +planning. We compare the performance of SIRPF and Rao-Blackwellized Particle +Filters (RBPF) in a simulated localization problem where an agent navigates +toward a target in a GPS-denied environment using POMCPOW and RB-POMCPOW +planners. Our results not only confirm that RBPFs maintain accurate belief +approximations over time with fewer particles, but, more surprisingly, RBPFs +combined with quadrature-based integration improve planning quality +significantly compared to SIRPF-based planning under the same computational +limits. + +
+
+
+
+
+ + ☆ Embedded IPC: Fast and Intersection-free Simulation in Reduced Subspace + for Robot Manipulation + + +
+ Physics-based simulation is essential for developing and evaluating robot +manipulation policies, particularly in scenarios involving deformable objects +and complex contact interactions. However, existing simulators often struggle +to balance computational efficiency with numerical accuracy, especially when +modeling deformable materials with frictional contact constraints. We introduce +an efficient subspace representation for the Incremental Potential Contact +(IPC) method, leveraging model reduction to decrease the number of degrees of +freedom. Our approach decouples simulation complexity from the resolution of +the input model by representing elasticity in a low-resolution subspace while +maintaining collision constraints on an embedded high-resolution surface. Our +barrier formulation ensures intersection-free trajectories and configurations +regardless of material stiffness, time step size, or contact severity. We +validate our simulator through quantitative experiments with a soft bubble +gripper grasping and qualitative demonstrations of placing a plate on a dish +rack. The results demonstrate our simulator's efficiency, physical accuracy, +computational stability, and robust handling of frictional contact, making it +well-suited for generating demonstration data and evaluating downstream robot +training applications. + +
+
+
+
+
+ + ♻ ☆ The RoboDepth Challenge: Methods and Advancements Towards Robust Depth + Estimation + + +
+ Accurate depth estimation under out-of-distribution (OoD) scenarios, such as +adverse weather conditions, sensor failure, and noise contamination, is +desirable for safety-critical applications. Existing depth estimation systems, +however, suffer inevitably from real-world corruptions and perturbations and +are struggled to provide reliable depth predictions under such cases. In this +paper, we summarize the winning solutions from the RoboDepth Challenge -- an +academic competition designed to facilitate and advance robust OoD depth +estimation. This challenge was developed based on the newly established KITTI-C +and NYUDepth2-C benchmarks. We hosted two stand-alone tracks, with an emphasis +on robust self-supervised and robust fully-supervised depth estimation, +respectively. Out of more than two hundred participants, nine unique and +top-performing solutions have appeared, with novel designs ranging from the +following aspects: spatial- and frequency-domain augmentations, masked image +modeling, image restoration and super-resolution, adversarial training, +diffusion-based noise suppression, vision-language pre-training, learned model +ensembling, and hierarchical feature enhancement. Extensive experimental +analyses along with insightful observations are drawn to better understand the +rationale behind each design. We hope this challenge could lay a solid +foundation for future research on robust and reliable depth estimation and +beyond. The datasets, competition toolkit, workshop recordings, and source code +from the winning teams are publicly available on the challenge website. + +
+
+ comment: Technical Report; 65 pages, 34 figures, 24 tables; Code at + https://github.com/ldkong1205/RoboDepth +
+
+
+
+
+ + ♻ ☆ Active Shadowing (ASD): Manipulating Visual Perception of Robotics + Behaviors via Implicit Communication + + +
+ Explicit communication is often valued for its directness during interaction. +Implicit communication, on the other hand, is indirect in that its +communicative content must be inferred. Implicit communication is considered +more desirable in teaming situations that requires reduced interruptions for +improved fluency. In this paper, we investigate another unique advantage of +implicit communication: its ability to manipulate the perception of object or +behavior of interest. When communication results in the perception of an object +or behavior to deviate from other information (about the object or behavior) +available via observation, it introduces a discrepancy between perception and +observation. We show that such a discrepancy in visual perception can benefit +human-robot interaction in a controlled manner and introduce an approach +referred to as active shadowing (ASD). Through user studies, we demonstrate the +effectiveness of active shadowing in creating a misaligned perception of the +robot's behavior and its execution in the real-world, resulting in more +efficient task completion without sacrificing its understandability. We also +analyze conditions under which such visual manipulation is effective. + +
+
+
+
+
+ + ♻ ☆ Knowledge-based Neural Ordinary Differential Equations for Cosserat + Rod-based Soft Robots + + +
+ Soft robots have many advantages over rigid robots thanks to their compliant +and passive nature. However, it is generally challenging to model the dynamics +of soft robots due to their high spatial dimensionality, making it difficult to +use model-based methods to accurately control soft robots. It often requires +direct numerical simulation of partial differential equations to simulate soft +robots. This not only requires an accurate numerical model, but also makes soft +robot modeling slow and expensive. Deep learning algorithms have shown promises +in data-driven modeling of soft robots. However, these algorithms usually +require a large amount of data, which are difficult to obtain in either +simulation or real-world experiments of soft robots. In this work, we propose +KNODE-Cosserat, a framework that combines first-principle physics models and +neural ordinary differential equations. We leverage the best from both worlds +-- the generalization ability of physics-based models and the fast speed of +deep learning methods. We validate our framework in both simulation and +real-world experiments. In both cases, we show that the robot model +significantly improves over the baseline models under different metrics. + +
+
+ comment: 8 pages, 11 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Can I Pet Your Robot? Incorporating Capacitive Touch Sensing into a Soft + Socially Assistive Robot Platform + + +
+ This work presents a method of incorporating low-cost capacitive tactile +sensors on a soft socially assistive robot platform. By embedding conductive +thread into the robot's crocheted exterior, we formed a set of low-cost, +flexible capacitive tactile sensors that do not disrupt the robot's soft, +zoomorphic embodiment. We evaluated the sensors' performance through a user +study (N=20) and found that the sensors reliably detected user touch events and +localized touch inputs to one of three regions on the robot's exterior. + +
+
+ comment: Accepted as a Work-In-Progress submission at the 2024 IEEE Haptics + Symposium +
+
+
+
+
+
+
+
+ + Systems and Control 30 + +
+
+
+ + ☆ Learning Linear Dynamics from Bilinear Observations + + +
+ We consider the problem of learning a realization of a partially observed +dynamical system with linear state transitions and bilinear observations. Under +very mild assumptions on the process and measurement noises, we provide a +finite time analysis for learning the unknown dynamics matrices (up to a +similarity transform). Our analysis involves a regression problem with +heavy-tailed and dependent data. Moreover, each row of our design matrix +contains a Kronecker product of current input with a history of inputs, making +it difficult to guarantee persistence of excitation. We overcome these +challenges, first providing a data-dependent high probability error bound for +arbitrary but fixed inputs. Then, we derive a data-independent error bound for +inputs chosen according to a simple random design. Our main results provide an +upper bound on the statistical error rates and sample complexity of learning +the unknown dynamics matrices from a single finite trajectory of bilinear +observations. + +
+
+ comment: 35 pages, 3 figures +
+
+
+
+
+ + ☆ Interaction Techniques for User-friendly Interfaces for Gate-based + Quantum Computing + + +
+ Quantum computers offer promising approaches to various fields. To use +current noisy quantum computers, developers need to examine the compilation of +a logical circuit, the status of available hardware, and noises in results. As +those tasks are less common in classical computing, quantum developers may not +be familiar with performing them. Therefore, easier and more intuitive +interfaces are necessary to make quantum computers more approachable. While +existing notebook-based toolkits like Qiskit offer application programming +interfaces and visualization techniques, it is still difficult to navigate the +vast space of quantum program design and hardware status. + Inspired by human-computer interaction (HCI) work in data science and +visualization, our work introduces four user interaction techniques that can +augment existing notebook-based toolkits for gate-based quantum computing: (1) +a circuit writer that lets users provide high-level information about a circuit +and generates a code snippet to build it; (2) a machine explorer that provides +detailed properties and configurations of a hardware with a code to load +selected information; (3) a circuit viewer that allows for comparing logical +circuit, compiled circuit, and hardware configurations; and (4) a visualization +for adjusting measurement outcomes with hardware error rates. + +
+
+ comment: A poster accepted to IEEE QCE 2024 +
+
+
+
+
+ + ☆ MBC: Multi-Brain Collaborative Control for Quadruped Robots + + +
+ In the field of locomotion task of quadruped robots, Blind Policy and +Perceptive Policy each have their own advantages and limitations. The Blind +Policy relies on preset sensor information and algorithms, suitable for known +and structured environments, but it lacks adaptability in complex or unknown +environments. The Perceptive Policy uses visual sensors to obtain detailed +environmental information, allowing it to adapt to complex terrains, but its +effectiveness is limited under occluded conditions, especially when perception +fails. Unlike the Blind Policy, the Perceptive Policy is not as robust under +these conditions. To address these challenges, we propose a MBC:Multi-Brain +collaborative system that incorporates the concepts of Multi-Agent +Reinforcement Learning and introduces collaboration between the Blind Policy +and the Perceptive Policy. By applying this multi-policy collaborative model to +a quadruped robot, the robot can maintain stable locomotion even when the +perceptual system is impaired or observational data is incomplete. Our +simulations and real-world experiments demonstrate that this system +significantly improves the robot's passability and robustness against +perception failures in complex environments, validating the effectiveness of +multi-policy collaboration in enhancing robotic motion performance. + +
+
+ comment: 18 pages, 9 figures, Website and Videos: https://quad-mbc.github.io/ +
+
+
+
+
+ + ☆ Active Perception with Initial-State Uncertainty: A Policy Gradient + Method + + +
+ This paper studies the synthesis of an active perception policy that +maximizes the information leakage of the initial state in a stochastic system +modeled as a hidden Markov model (HMM). Specifically, the emission function of +the HMM is controllable with a set of perception or sensor query actions. Given +the goal is to infer the initial state from partial observations in the HMM, we +use Shannon conditional entropy as the planning objective and develop a novel +policy gradient method with convergence guarantees. By leveraging a variant of +observable operators in HMMs, we prove several important properties of the +gradient of the conditional entropy with respect to the policy parameters, +which allow efficient computation of the policy gradient and stable and fast +convergence. We demonstrate the effectiveness of our solution by applying it to +an inference problem in a stochastic grid world environment. + +
+
+
+
+
+ + ☆ Willems' Fundamental Lemma for Nonlinear Systems with Koopman Linear + Embedding + + +
+ Koopman operator theory and Willems' fundamental lemma both can provide +(approximated) data-driven linear representation for nonlinear systems. +However, choosing lifting functions for the Koopman operator is challenging, +and the quality of the data-driven model from Willems' fundamental lemma has no +guarantee for general nonlinear systems. In this paper, we extend Willems' +fundamental lemma for a class of nonlinear systems that admit a Koopman linear +embedding. We first characterize the relationship between the trajectory space +of a nonlinear system and that of its Koopman linear embedding. We then prove +that the trajectory space of Koopman linear embedding can be formed by a linear +combination of rich-enough trajectories from the nonlinear system. Combining +these two results leads to a data-driven representation of the nonlinear +system, which bypasses the need for the lifting functions and thus eliminates +the associated bias errors. Our results illustrate that both the width (more +trajectories) and depth (longer trajectories) of the trajectory library are +important to ensure the accuracy of the data-driven model. + +
+
+
+
+
+ + ☆ Age of Gossip in Networks with Multiple Views of a Source + + +
+ We consider the version age of information (AoI) in a network where a subset +of nodes act as sensing nodes, sampling a source that in general can follow a +continuous distribution. Any sample of the source constitutes a new version of +the information and the version age of the information is defined with respect +to the most recent version of the information available for the whole network. +We derive a recursive expression for the average version AoI between different +subsets of the nodes which can be used to evaluate the average version AoI for +any subset of the nodes including any single node. We derive asymptotic +behavior of the average AoI on any single node of the network for various +topologies including line, ring, and fully connected networks. The prior art +result on version age of a network by Yates [ISIT'21] can be interpreted as in +our derivation as a network with a single view of the source, e.g., through a +Poisson process with rate $\lambda_{00}$. Our result indicates that there is no +loss in the average version AoI performance by replacing a single view of the +source with distributed sensing across multiple nodes by splitting the same +rate $\lambda_{00}$. Particularly, we show that asymptotically, the average AoI +scales with $O(\log(n))$ and $O(\sqrt{n})$ for fully connected and ring +networks, respectively. More interestingly, we show that for the ring network +the same $O(\sqrt{n})$ asymptotical performance on average AoI is still +achieved with distributed sensing if the number of sensing nodes only scales +with $O(\sqrt{n})$ instead of prior known result which requires $O(n)$. Our +results indicate that the sensing nodes can be arbitrarily chosen as long as +the maximum number of consecutive non-sensing nodes also scales as +$O(\sqrt{n})$. + +
+
+
+
+
+ + ☆ Transformer based time series prediction of the maximum power point for + solar photovoltaic cells + + +
+ This paper proposes an improved deep learning based maximum power point +tracking (MPPT) in solar photovoltaic cells considering various time series +based environmental inputs. Generally, artificial neural network based MPPT +algorithms use basic neural network architectures and inputs which do not +represent the ambient conditions in a comprehensive manner. In this article, +the ambient conditions of a location are represented through a comprehensive +set of environmental features. Furthermore, the inclusion of time based +features in the input data is considered to model cyclic patterns temporally +within the atmospheric conditions leading to robust modeling of the MPPT +algorithm. A transformer based deep learning architecture is trained as a time +series prediction model using multidimensional time series input features. The +model is trained on a dataset containing typical meteorological year data +points of ambient weather conditions from 50 locations. The attention mechanism +in the transformer modules allows the model to learn temporal patterns in the +data efficiently. The proposed model achieves a 0.47% mean average percentage +error of prediction on non zero operating voltage points in a test dataset +consisting of data collected over a period of 200 consecutive hours resulting +in the average power efficiency of 99.54% and peak power efficiency of 99.98%. +The proposed model is validated through real time simulations. The proposed +model performs power point tracking in a robust, dynamic, and nonlatent manner, +over a wide range of atmospheric conditions. + +
+
+ comment: Published June 2022, in Energy Science and Engineering, Volume10, + Issue9, Pages 3397-3410 +
+
+
+
+
+ + ☆ A Critical Review of Safe Reinforcement Learning Techniques in Smart + Grid Applications + + +
+ The high penetration of distributed energy resources (DERs) in modern smart +power systems introduces unforeseen uncertainties for the electricity sector, +leading to increased complexity and difficulty in the operation and control of +power systems. As a cutting-edge machine learning technology, deep +reinforcement learning (DRL) has been widely implemented in recent years to +handle the uncertainty in power systems. However, in critical infrastructures +such as power systems, safety issues always receive top priority, while DRL may +not always meet the safety requirements of power system operators. The concept +of safe reinforcement learning (safe RL) is emerging as a potential solution to +overcome the shortcomings of conventional DRL in the operation and control of +power systems. This study provides a rigorous review of the latest research +efforts focused on safe RL to derive power system control policies while +accounting for the unique safety requirements of power grids. Furthermore, this +study highlights various safe RL algorithms applied in diverse applications +within the power system sector, from single grid-connected power converters, +residential smart homes, and buildings to large power distribution networks. +For all methods outlined, a discussion on their bottlenecks, research +challenges, and potential opportunities in the operation and control of power +system applications is also presented. This review aims to support research in +the area of safe RL algorithms, embracing smart power system operation with +safety constraints amid high uncertainty from DERs. + +
+
+ comment: 16 pages, 7 figures, 9 tables +
+
+
+
+
+ + ☆ TE-PINN: Quaternion-Based Orientation Estimation using + Transformer-Enhanced Physics-Informed Neural Networks + + +
+ This paper introduces a Transformer-Enhanced Physics-Informed Neural Network +(TE-PINN) designed for accurate quaternion-based orientation estimation in +high-dynamic environments, particularly within the field of robotics. By +integrating transformer networks with physics-informed learning, our approach +innovatively captures temporal dependencies in sensor data while enforcing the +fundamental physical laws governing rotational motion. TE-PINN leverages a +multi-head attention mechanism to handle sequential data from inertial sensors, +such as accelerometers and gyroscopes, ensuring temporal consistency. +Simultaneously, the model embeds quaternion kinematics and rigid body dynamics +into the learning process, aligning the network's predictions with mechanical +principles like Euler's laws of motion. The physics-informed loss function +incorporates the dynamics of angular velocity and external forces, enhancing +the network's ability to generalize in complex scenarios. Our experimental +evaluation demonstrates that TE-PINN consistently outperforms traditional +methods such as Extended Kalman Filters (EKF) and LSTM-based estimators, +particularly in scenarios characterized by high angular velocities and noisy +sensor data. The results show a significant reduction in mean quaternion error +and improved gyroscope bias estimation compared to the state-of-the-art. An +ablation study further isolates the contributions of both the transformer +architecture and the physics-informed constraints, highlighting the synergistic +effect of both components in improving model performance. The proposed model +achieves real-time performance on embedded systems typical of mobile robots, +offering a scalable and efficient solution for orientation estimation in +autonomous systems. + +
+
+
+
+
+ + ☆ System-Level Performance Metrics Sensitivity of an Electrified + Heavy-Duty Mobile Manipulator + + +
+ The shift to electric and hybrid powertrains in vehicular systems has +propelled advancements in mobile robotics and autonomous vehicles. This paper +examines the sensitivity of key performance metrics in a electrified heavy-duty +mobile manipulator (HDMM) driven by electromechanical linear actuators (EMLAs) +powered by permanent magnet synchronous motors (PMSMs). The study evaluates +power delivery, force dynamics, energy consumption, and overall efficiency of +the actuation mechanisms. By computing partial derivatives (PD) with respect to +the payload mass at the tool center point (TCP), it provides insights into +these factors under various loading conditions. This research aids in the +appropriate choice or design of EMLAs for HDMM electrification, addressing +actuation mechanism selection challenge in vehicular system with mounted +manipulator and determines the necessary battery capacity requirements. + +
+
+ comment: This work is submitted to IEEE VTC 2024 +
+
+
+
+
+ + ☆ Mean Age of Information in Partial Offloading Mobile Edge Computing + Networks + + +
+ The age of information (AoI) performance analysis is essential for evaluating +the information freshness in the large-scale mobile edge computing (MEC) +networks. This work proposes the earliest analysis of the mean AoI (MAoI) +performance of large-scale partial offloading MEC networks. Firstly, we derive +and validate the closed-form expressions of MAoI by using queueing theory and +stochastic geometry. Based on these expressions, we analyse the effects of +computing offloading ratio (COR) and task generation rate (TGR) on the MAoI +performance and compare the MAoI performance under the local computing, remote +computing, and partial offloading schemes. The results show that by jointly +optimising the COR and TGR, the partial offloading scheme outperforms the local +and remote computing schemes in terms of the MAoI, which can be improved by up +to 51% and 61%, respectively. This encourages the MEC networks to adopt the +partial offloading scheme to improve the MAoI performance. + +
+
+
+
+
+ + ☆ Wind lulls and slews; consequences for the stability of future UK + electricity systems + + +
+ As the United Kingdom wind fleet increases in size, wind lulls and slews will +increasingly challenge the stability of its electricity system. The paper +describes the use of models based on real time records and including solar +slews, to investigate the most extreme wind variations likely to be encountered +in future, enabling strategies to be devised to mitigate them. Wind lulls are +surprisingly frequent, occasionally lasting a week or more, and are always +likely to be beyond the capabilities of stored or imported electrical energy to +mitigate them. The models indicate that there will be a continuing need for gas +powered generation to mitigate wind lulls. Currently, Combined Cycle Gas +Turbines (CCGTs) provide most of the dispatchable generation. However, CCGTs +are not sufficiently fast acting to cope with the wind and solar slews +anticipated in future. The paper suggests that a range of already proven +fast-acting sources of dispatchable generation, including Open Cycle Gas +Turbines (OCGTs), Internal Combustion Gas-Fired Reciprocating engines (ICGRs) +and stored electrical energy systems, should be capable of coping with the +largest wind and solar slews likely to be encountered up to the year 2035. +Examples are given of the recent introduction of these fast-acting sources of +generation which, it is suggested, will progressively replace CCGTs as the wind +and solar fleets increase in size. Moreover, we see the pattern of recent +investments, summarised in the paper, as a good indication of likely future +investments, with OCGT investments mainly serving the 440 kV grid, and ICGRs +and stored electrical energy more local networks. + +
+
+ comment: 13 pages, 8 figures, 3 tables +
+
+
+
+
+ + ☆ Assessing strategies to manage distributed photovoltaics in Swiss + low-voltage networks: An analysis of curtailment, export tariffs, and + resource sharing + + +
+ The integration of photovoltaic systems poses several challenges for the +distribution grid, mainly due to the infrastructure not being designed to +handle the upstream flow and being dimensioned for consumption only, +potentially leading to reliability and stability issues. This study +investigates the use of capacity-based tariffs, export tariffs, and curtailment +policies to reduce negative grid impacts without hampering PV deployment. We +analyze the effect of such export tariffs on three typical Swiss low-voltage +networks (rural, semi-urban, and urban), using power flow analysis to evaluate +the power exchanges at the transformer station, as well as line overloading and +voltage violations. Finally, a simple case of mutualization of resources is +analyzed to assess its potential contribution to relieving network constraints +and the economic costs of managing LV networks. We found that the tariff with +capacity-based components on the export (CT export daily) severely penalizes PV +penetration. This applies to other tariffs as well (e.g. IRR monthly, +Curtailment 30, and DT variable) but to a lesser extent. However, the inclusion +of curtailment at 50\% and 70\%, as well as mixed tariffs with capacity-based +components at import and curtailment, allow for a high degree of PV +installations in the three zones studied and help to mitigate the impact of PV +on the distributed network. + +
+
+ comment: Preprint version. 25 pages, 6 figures +
+
+
+
+
+ + ☆ Whole-body end-effector pose tracking + + +
+ Combining manipulation with the mobility of legged robots is essential for a +wide range of robotic applications. However, integrating an arm with a mobile +base significantly increases the system's complexity, making precise +end-effector control challenging. Existing model-based approaches are often +constrained by their modeling assumptions, leading to limited robustness. +Meanwhile, recent Reinforcement Learning (RL) implementations restrict the +arm's workspace to be in front of the robot or track only the position to +obtain decent tracking accuracy. In this work, we address these limitations by +introducing a whole-body RL formulation for end-effector pose tracking in a +large workspace on rough, unstructured terrains. Our proposed method involves a +terrain-aware sampling strategy for the robot's initial configuration and +end-effector pose commands, as well as a game-based curriculum to extend the +robot's operating range. We validate our approach on the ANYmal quadrupedal +robot with a six DoF robotic arm. Through our experiments, we show that the +learned controller achieves precise command tracking over a large workspace and +adapts across varying terrains such as stairs and slopes. On deployment, it +achieves a pose-tracking error of 2.64 cm and 3.64 degrees, outperforming +existing competitive baselines. + +
+
+
+
+
+ + ☆ Safe Output Feedback Improvement with Baselines + + +
+ In data-driven control design, an important problem is to deal with +uncertainty due to limited and noisy data. One way to do this is to use a +min-max approach, which aims to minimize some design criteria for the +worst-case scenario. However, a strategy based on this approach can lead to +overly conservative controllers. To overcome this issue, we apply the idea of +baseline regret, and it is seen that minimizing the baseline regret under model +uncertainty can guarantee safe controller improvement with less conservatism +and variance in the resulting controllers. To exemplify the use of baseline +controllers, we focus on the output feedback setting and propose a two-step +control design method; first, an uncertainty set is constructed by a +data-driven system identification approach based on finite impulse response +models; then a control design criterion based on model reference control is +used. To solve the baseline regret optimization problem efficiently, we use a +convex approximation of the criterion and apply the scenario approach in +optimization. The numerical examples show that the inclusion of baseline regret +indeed improves the performance and reduces the variance of the resulting +controller. + +
+
+ comment: Accepted by The 63rd IEEE Conference on Decision and Control +
+
+
+
+
+ + ☆ Robust Neural IDA-PBC: passivity-based stabilization under + approximations + + +
+ In this paper, we restructure the Neural Interconnection and Damping +Assignment - Passivity Based Control (Neural IDA-PBC) design methodology, and +we formally analyze its closed-loop properties. Neural IDA-PBC redefines the +IDA-PBC design approach as an optimization problem by building on the framework +of Physics Informed Neural Networks (PINNs). However, the closed-loop stability +and robustness properties under Neural IDA-PBC remain unexplored. To address +the issue, we study the behavior of classical IDA-PBC under approximations. Our +theoretical analysis allows deriving conditions for practical and asymptotic +stability of the desired equilibrium point. Moreover, it extends the Neural +IDA-PBC applicability to port-Hamiltonian systems where the matching conditions +cannot be solved exactly. Our renewed optimization-based design introduces +three significant aspects: i) it involves a novel optimization objective +including stability and robustness constraints issued from our theoretical +analysis; ii) it employs separate Neural Networks (NNs), which can be +structured to reduce the search space to relevant functions; iii) it does not +require knowledge about the port-Hamiltonian formulation of the system's model. +Our methodology is validated with simulations on three standard benchmarks: a +double pendulum, a nonlinear mass-spring-damper and a cartpole. Notably, +classical IDA-PBC designs cannot be analytically derived for the latter. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Identification For Control Based on Neural Networks: Approximately + Linearizable Models + + +
+ This work presents a control-oriented identification scheme for efficient +control design and stability analysis of nonlinear systems. Neural networks are +used to identify a discrete-time nonlinear state-space model to approximate +time-domain input-output behavior of a nonlinear system. The network is +constructed such that the identified model is approximately linearizable by +feedback, ensuring that the control law trivially follows from the learning +stage. After the identification and quasi-linearization procedures, linear +control theory comes at hand to design robust controllers and study stability +of the closed-loop system. The effectiveness and interest of the methodology +are illustrated throughout the paper on popular benchmarks for system +identification. + +
+
+ comment: 15 pages, 3 figures, 6 tables, accepted as a poster in SysDO 2024, + Stuttgart, Germany +
+
+
+
+
+ + ☆ A Multi-Level Approach for Class Imbalance Problem in Federated Learning + for Remote Industry 4.0 Applications + + +
+ Deep neural network (DNN) models are effective solutions for industry 4.0 +applications (\eg oil spill detection, fire detection, anomaly detection). +However, training a DNN network model needs a considerable amount of data +collected from various sources and transferred to the central cloud server that +can be expensive and sensitive to privacy. For instance, in the remote offshore +oil field where network connectivity is vulnerable, a federated fog environment +can be a potential computing platform. Hence it is feasible to perform +computation within the federation. On the contrary, performing a DNN model +training using fog systems poses a security issue that the federated learning +(FL) technique can resolve. In this case, the new challenge is the class +imbalance problem that can be inherited in local data sets and can degrade the +performance of the global model. Therefore, FL training needs to be performed +considering the class imbalance problem locally. In addition, an efficient +technique to select the relevant worker model needs to be adopted at the global +level to increase the robustness of the global model. Accordingly, we utilize +one of the suitable loss functions addressing the class imbalance in workers at +the local level. In addition, we employ a dynamic threshold mechanism with +user-defined worker's weight to efficiently select workers for aggregation that +improve the global model's robustness. Finally, we perform an extensive +empirical evaluation to explore the benefits of our solution and find up to +3-5% performance improvement than baseline federated learning methods. + +
+
+
+
+
+ + ☆ Regional stability conditions for recurrent neural network-based control + systems + + +
+ In this paper we propose novel global and regional stability analysis +conditions based on linear matrix inequalities for a general class of recurrent +neural networks. These conditions can be also used for state-feedback control +design and a suitable optimization problem enforcing H2 norm minimization +properties is defined. The theoretical results are corroborated by numerical +simulations, showing the advantages and limitations of the methods presented +herein. + +
+
+
+
+
+ + ☆ Reinforcement Leaning for Infinite-Dimensional Systems + + +
+ Interest in reinforcement learning (RL) for massive-scale systems consisting +of large populations of intelligent agents interacting with heterogeneous +environments has witnessed a significant surge in recent years across diverse +scientific domains. However, due to the large-scale nature of the system, the +majority of state-of-the-art RL techniques either encounter high computational +cost or exhibit compromised performance. To mitigate these challenges, we +propose a novel RL architecture along with the derivation of effective +algorithms to learn optimal policies for any arbitrarily large system of +agents. Specifically, we model such a system as a parameterized control system +defined on an infinite-dimensional function space. We then develop a moment +kernel transform to map the parameterized system and the value function of an +RL problem into a reproducing kernel Hilbert space. This transformation +subsequently generates a finite-dimensional moment representation for this RL +problem. Leveraging this representation, we develop a hierarchical algorithm +for learning optimal policies for the infinite-dimensional parameterized +system. We further enhance efficiency of the algorithm by exploiting early +stopping at each hierarchy, by which we show the fast convergence property of +the algorithm through constructing a convergent spectral sequence. The +performance and efficiency of the proposed algorithm are validated using +practical examples. + +
+
+
+
+
+ + ☆ Optimization of partially isolated quantum harmonic oscillator memory + systems by mean square decoherence time criteria + + +
+ This paper is concerned with open quantum harmonic oscillators with +position-momentum system variables, whose internal dynamics and interaction +with the environment are governed by linear quantum stochastic differential +equations. A recently proposed approach to such systems as Heisenberg picture +quantum memories exploits their ability to approximately retain initial +conditions over a decoherence horizon. Using the quantum memory decoherence +time defined previously in terms of a fidelity threshold on a weighted +mean-square deviation of the system variables from their initial values, we +apply this approach to a partially isolated subsystem of the oscillator, which +is not directly affected by the external fields. The partial isolation leads to +an appropriate system decomposition and a qualitatively different short-horizon +asymptotic behaviour of the deviation, which yields a longer decoherence time +in the high-fidelity limit. The resulting approximate decoherence time +maximization over the energy parameters for improving the quantum memory +performance is discussed for a coherent feedback interconnection of such +systems. + +
+
+ comment: 9 pages, 3 figures, submitted to ANZCC 2025 +
+
+
+
+
+ + ☆ Autonomous Wheel Loader Navigation Using Goal-Conditioned Actor-Critic + MPC ICRA + + +
+ This paper proposes a novel control method for an autonomous wheel loader, +enabling time-efficient navigation to an arbitrary goal pose. Unlike prior +works that combine high-level trajectory planners with Model Predictive Control +(MPC), we directly enhance the planning capabilities of MPC by integrating a +cost function derived from Actor-Critic Reinforcement Learning (RL). +Specifically, we train an RL agent to solve the pose reaching task in +simulation, then incorporate the trained neural network critic as both the +stage and terminal cost of an MPC. We show through comprehensive simulations +that the resulting MPC inherits the time-efficient behavior of the RL agent, +generating trajectories that compare favorably against those found using +trajectory optimization. We also deploy our method on a real wheel loader, +where we successfully navigate to various goal poses. In contrast, the RL actor +risked damaging the machine and was unsuitable for real-world use. + +
+
+ comment: Submitted to International Conference on Robotics and Automation + (ICRA) 2025 +
+
+
+
+
+ + ☆ Autotuning Bipedal Locomotion MPC with GRFM-Net for Efficient + Sim-to-Real Transfer + + +
+ Bipedal locomotion control is essential for humanoid robots to navigate +complex, human-centric environments. While optimization-based control designs +are popular for integrating sophisticated models of humanoid robots, they often +require labor-intensive manual tuning. In this work, we address the challenges +of parameter selection in bipedal locomotion control using DiffTune, a +model-based autotuning method that leverages differential programming for +efficient parameter learning. A major difficulty lies in balancing model +fidelity with differentiability. We address this difficulty using a +low-fidelity model for differentiability, enhanced by a Ground Reaction +Force-and-Moment Network (GRFM-Net) to capture discrepancies between MPC +commands and actual control effects. We validate the parameters learned by +DiffTune with GRFM-Net in hardware experiments, which demonstrates the +parameters' optimality in a multi-objective setting compared with baseline +parameters, reducing the total loss by up to 40.5$\%$ compared with the +expert-tuned parameters. The results confirm the GRFM-Net's effectiveness in +mitigating the sim-to-real gap, improving the transferability of +simulation-learned parameters to real hardware. + +
+
+
+
+
+ + ☆ Open-/Closed-loop Active Learning for Data-driven Predictive Control + + +
+ An important question in data-driven control is how to obtain an informative +dataset. In this work, we consider the problem of effective data acquisition of +an unknown linear system with bounded disturbance for both open-loop and +closed-loop stages. The learning objective is to minimize the volume of the set +of admissible systems. First, a performance measure based on historical data +and the input sequence is introduced to characterize the upper bound of the +volume of the set of admissible systems. On the basis of this performance +measure, an open-loop active learning strategy is proposed to minimize the +volume by actively designing inputs during the open-loop stage. For the +closed-loop stage, an closed-loop active learning strategy is designed to +select and learn from informative closed-loop data. The efficiency of the +proposed closed-loop active learning strategy is proved by showing that the +unselected data cannot benefit the learning performance. Furthermore, an +adaptive predictive controller is designed in accordance with the proposed data +acquisition approach. The recursive feasibility and the stability of the +controller are proved by analyzing the effect of the closed-loop active +learning strategy. Finally, numerical examples and comparisons illustrate the +effectiveness of the proposed data acquisition strategy. + +
+
+
+
+
+ + ☆ Agent-state based policies in POMDPs: Beyond belief-state MDPs + + +
+ The traditional approach to POMDPs is to convert them into fully observed +MDPs by considering a belief state as an information state. However, a +belief-state based approach requires perfect knowledge of the system dynamics +and is therefore not applicable in the learning setting where the system model +is unknown. Various approaches to circumvent this limitation have been proposed +in the literature. We present a unified treatment of some of these approaches +by viewing them as models where the agent maintains a local recursively +updateable agent state and chooses actions based on the agent state. We +highlight the different classes of agent-state based policies and the various +approaches that have been proposed in the literature to find good policies +within each class. These include the designer's approach to find optimal +non-stationary agent-state based policies, policy search approaches to find a +locally optimal stationary agent-state based policies, and the approximate +information state to find approximately optimal stationary agent-state based +policies. We then present how ideas from the approximate information state +approach have been used to improve Q-learning and actor-critic algorithms for +learning in POMDPs. + +
+
+
+
+
+ + ♻ ☆ Data-Driven System Identification of Quadrotors Subject to Motor Delays IROS 2024 + + +
+ Recently non-linear control methods like Model Predictive Control (MPC) and +Reinforcement Learning (RL) have attracted increased interest in the quadrotor +control community. In contrast to classic control methods like cascaded PID +controllers, MPC and RL heavily rely on an accurate model of the system +dynamics. The process of quadrotor system identification is notoriously tedious +and is often pursued with additional equipment like a thrust stand. +Furthermore, low-level details like motor delays which are crucial for accurate +end-to-end control are often neglected. In this work, we introduce a +data-driven method to identify a quadrotor's inertia parameters, thrust curves, +torque coefficients, and first-order motor delay purely based on proprioceptive +data. The estimation of the motor delay is particularly challenging as usually, +the RPMs can not be measured. We derive a Maximum A Posteriori (MAP)-based +method to estimate the latent time constant. Our approach only requires about a +minute of flying data that can be collected without any additional equipment +and usually consists of three simple maneuvers. Experimental results +demonstrate the ability of our method to accurately recover the parameters of +multiple quadrotors. It also facilitates the deployment of RL-based, end-to-end +quadrotor control of a large quadrotor under harsh, outdoor conditions. + +
+
+ comment: Accepted at IROS 2024 +
+
+
+
+
+ + ♻ ☆ A Fairness-Oriented Reinforcement Learning Approach for the Operation + and Control of Shared Micromobility Services + + +
+ As Machine Learning grows in popularity across various fields, equity has +become a key focus for the AI community. However fairness-oriented approaches +are still underexplored in smart mobility. Addressing this gap, our study +investigates the balance between performance optimization and algorithmic +fairness in shared micromobility services providing a novel framework based on +Reinforcement Learning. Exploiting Q-Learning, the proposed methodology +achieves equitable outcomes in terms of the Gini index across different areas +characterized by their distance from central hubs. Through vehicle rebalancing, +the provided scheme maximizes operator performance while ensuring fairness +principles for users, reducing iniquity by up to 80% while only increasing +costs by 30% (w.r.t. applying no equity adjustment). A case study with +synthetic data validates our insights and highlights the importance of fairness +in urban micromobility. + +
+
+ comment: 6 pages, 2 figures, jointly submitted to IEEE L-CSS and ACC 2025 +
+
+
+
+
+ + ♻ ☆ USV-AUV Collaboration Framework for Underwater Tasks under Extreme Sea + Conditions + + +
+ Autonomous underwater vehicles (AUVs) are valuable for ocean exploration due +to their flexibility and ability to carry communication and detection units. +Nevertheless, AUVs alone often face challenges in harsh and extreme sea +conditions. This study introduces a unmanned surface vehicle (USV)-AUV +collaboration framework, which includes high-precision multi-AUV positioning +using USV path planning via Fisher information matrix optimization and +reinforcement learning for multi-AUV cooperative tasks. Applied to a multi-AUV +underwater data collection task scenario, extensive simulations validate the +framework's feasibility and superior performance, highlighting exceptional +coordination and robustness under extreme sea conditions. To accelerate +relevant research in this field, we have made the simulation code available as +open-source. + +
+
+
+
+
+ + ♻ ☆ First Field Trial of LLM-Powered AI Agent for Lifecycle Management of + Autonomous Driving Optical Networks + + +
+ We design and demonstrate the first field trial of LLM-powered AI Agent for +ADON. Three operation modes of the Agent are proposed for network lifecycle +management. The Agent efficiently processes wavelength add/drop and soft/hard +failures, and achieves comparable performance to human-designed algorithms for +power optimization. + +
+
+ comment: Version submitted to ECOC PDP 2024 on September 6th +
+
+
+
+
+ + ♻ ☆ Will Large Language Models be a Panacea to Autonomous Driving? + + +
+ Artificial intelligence (AI) plays a crucial role in autonomous driving (AD) +research, propelling its development towards intelligence and efficiency. +Currently, the development of AD technology follows two main technical paths: +modularization and end-to-end. Modularization decompose the driving task into +modules such as perception, prediction, planning, and control, and train them +separately. Due to the inconsistency of training objectives between modules, +the integrated effect suffers from bias. End-to-end attempts to address this +issue by utilizing a single model that directly maps from sensor data to +control signals. This path has limited learning capabilities in a comprehensive +set of features and struggles to handle unpredictable long-tail events and +complex urban traffic scenarios. In the face of challenges encountered in both +paths, many researchers believe that large language models (LLMs) with powerful +reasoning capabilities and extensive knowledge understanding may be the +solution, expecting LLMs to provide AD systems with deeper levels of +understanding and decision-making capabilities. In light of the challenges +faced by both paths, many researchers believe that LLMs, with their powerful +reasoning abilities and extensive knowledge, could offer a solution. To +understand if LLMs could enhance AD, this paper conducts a thorough analysis of +the potential applications of LLMs in AD systems, including exploring their +optimization strategies in both modular and end-to-end approaches, with a +particular focus on how LLMs can tackle the problems and challenges present in +current solutions. Furthermore, we discuss an important question: Can LLM-based +artificial general intelligence (AGI) be a key to achieve high-level AD? We +further analyze the potential limitations and challenges that LLMs may +encounter in promoting the development of AD technology. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Systems and Control 19 + +
+
+
+ + ☆ LDPC Codes in Cooperative Communication + + +
+ In fact, the broadcast nature of every transmitter makes it possible for +other transceivers in the channel to overhear the broadcasted signal. The +proposed idea in cooperative communication is to use these intermediate +transceivers as relay for the transmitted signal, therefore we will have +spatial diversity which can improve throughput and received data reliability in +the system. In this dissertation we consider some important aspects of +cooperative communication in a network composed of three nodes. First, we +verify the increase of reliability for the received signal by comparing the +reliability of the received bits in a cooperative network with a +non-cooperative one. Then we step forward and use LDPC error correction +technique to improve the reliability of the received bits even more and compare +it with a network without LDPC codes (encoder and decoder) to measure the level +of improvement for different SNRs. The overall aim of this dissertation is to +deploy cooperative communication idea to consider and test its claimed aspects +and also enhance its performance by using LDPC error correction technique. + +
+
+ comment: 41 pages +
+
+
+
+
+ + ☆ Spectral Graph Theoretic Methods for Enhancing Network Robustness in + Robot Localization + + +
+ This paper addresses the optimization of edge-weighted networks by maximizing +algebraic connectivity to enhance network robustness. Motivated by the need for +precise robot position estimation in cooperative localization and pose-graph +sparsification in Simultaneous Localization and Mapping (SLAM), the algebraic +connectivity maximization problem is formulated as a Mixed Integer +Semi-Definite Program (MISDP), which is NP-hard. Leveraging spectral graph +theoretic methods, specifically Cheeger's inequality, this work introduces +novel "Cheeger cuts" to strengthen and efficiently solve medium-scale MISDPs. +Further, a new Mixed Integer Linear Program (MILP) is developed for efficiently +computing Cheeger cuts, implemented within an outer-approximation algorithm for +solving the MISDP. A greedy k-opt heuristic is also presented, producing +high-quality solutions that serve as valid lower bounds for Cheeger cuts. +Comprehensive numerical analyses demonstrate the efficacy of strengthened cuts +via substantial improvements in run times on synthetic and realistic robot +localization datasets. + +
+
+ comment: 63rd IEEE Conference on Decision and Control (CDC) +
+
+
+
+
+ + ☆ Optimization-based Verification of Discrete-time Control Barrier + Functions: A Branch-and-Bound Approach + + +
+ Discrete-time Control Barrier Functions (DTCBFs) form a powerful control +theoretic tool to guarantee safety and synthesize safe controllers for +discrete-time dynamical systems. In this paper, we provide an +optimization-based algorithm, inspired by the $\alpha$BB algorithm, for the +verification of a candidate DTCBF, i.e., either verifying a given candidate +function as a valid DTCBF or falsifying it by providing a counterexample for a +general nonlinear discrete-time system with input constraints. This method is +applicable whether a corresponding control policy is known or unknown. We apply +our method to a numerical case study to illustrate its efficacy. + +
+
+
+
+
+ + ☆ Peer-to-Peer Learning Dynamics of Wide Neural Networks + + +
+ Peer-to-peer learning is an increasingly popular framework that enables +beyond-5G distributed edge devices to collaboratively train deep neural +networks in a privacy-preserving manner without the aid of a central server. +Neural network training algorithms for emerging environments, e.g., smart +cities, have many design considerations that are difficult to tune in +deployment settings -- such as neural network architectures and +hyperparameters. This presents a critical need for characterizing the training +dynamics of distributed optimization algorithms used to train highly nonconvex +neural networks in peer-to-peer learning environments. In this work, we provide +an explicit, non-asymptotic characterization of the learning dynamics of wide +neural networks trained using popular distributed gradient descent (DGD) +algorithms. Our results leverage both recent advancements in neural tangent +kernel (NTK) theory and extensive previous work on distributed learning and +consensus. We validate our analytical results by accurately predicting the +parameter and error dynamics of wide neural networks trained for classification +tasks. + +
+
+
+
+
+ + ☆ SPformer: A Transformer Based DRL Decision Making Method for Connected + Automated Vehicles + + +
+ In mixed autonomy traffic environment, every decision made by an +autonomous-driving car may have a great impact on the transportation system. +Because of the complex interaction between vehicles, it is challenging to make +decisions that can ensure both high traffic efficiency and safety now and +futher. Connected automated vehicles (CAVs) have great potential to improve the +quality of decision-making in this continuous, highly dynamic and interactive +environment because of their stronger sensing and communicating ability. For +multi-vehicle collaborative decision-making algorithms based on deep +reinforcement learning (DRL), we need to represent the interactions between +vehicles to obtain interactive features. The representation in this aspect +directly affects the learning efficiency and the quality of the learned policy. +To this end, we propose a CAV decision-making architecture based on transformer +and reinforcement learning algorithms. A learnable policy token is used as the +learning medium of the multi-vehicle joint policy, the states of all vehicles +in the area of interest can be adaptively noticed in order to extract +interactive features among agents. We also design an intuitive physical +positional encodings, the redundant location information of which optimizes the +performance of the network. Simulations show that our model can make good use +of all the state information of vehicles in traffic scenario, so as to obtain +high-quality driving decisions that meet efficiency and safety objectives. The +comparison shows that our method significantly improves existing DRL-based +multi-vehicle cooperative decision-making algorithms. + +
+
+
+
+
+ + ☆ A 3.5 GS/s 1-1 MASH VCO ADC With Second-Order Noise Shaping + + +
+ In this work, a 3.5 GS/s voltage-controlled oscillator (VCO) +analog-to-digital converter (ADC) using multi-stage noise shaping (MASH) is +presented. This 28nm CMOS ADC achieves second-order noise shaping in an +easily-scalable, open-loop configuration. A key enabler of the high-bandwidth +MASH VCO ADC is the use of a multi-bit estimated error signal. With an OSR of +16, an SNDR of 67 dB and DR of 68 dB are achieved in 109.375 MHz bandwidth. The +full-custom pseudo-analog circuits consume 9 mW, while the automatically +generated digital circuits consume another 24 mW. A $\mathbf{FoM_{DR} = 163}$ +dB and core area of $\mathbf{0.017\,\mathbf{mm}^2}$ are obtained. + +
+
+ comment: 14 pages, 29 figures. Author's version. IEEE Transactions on Circuits + and Systems I: Regular Papers +
+
+
+
+
+ + ☆ Cloud Deployment of Large-Scale Electromagnetic Transient Simulation -- + Discovery and Experiences + + +
+ Electromagnetic Transient (EMT) simulation starts to play a critical role in +modern power system planning and operations due to large penetration of +inverter based resources (IBRs). The EMT studies are computationally intensive +due to very small simulation time step and complex modeling of the protection +and control of IBRs. It has been challenging for the traditional on-premises +computing infrastructure to meet the ever-increasing computing needs of +large-scale EMT studies. This paper shares experience of ISO New England +(ISO-NE) on a pilot deployment of EMT simulation in a public cloud using Amazon +Web Services. The platform can successfully meet the large-scale EMT simulation +computation needs in a cost-effective way while meeting cyber security and data +privacy requirements. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Towards a General Market for Cloud-Edge-IoT Continuum + + +
+ Recent years have witnessed the proposals aiming at enabling Vertical, +two-sided markets with a Single Marketplace (or exchange) (VSMs) for computing +and data resources/services (products) offerings in a multi-cloud and +crowdsourced IoT-edge sensing environment. A VSM is designed vertically from +bottom up with a broker being a built-in component of the marketplace. While +preventing seller lock-in and improving efficiency and availability, a VSM +suffers from a key weakness from a buyer's perspective, i.e., the broker and +the corresponding marketplace lock-in, which may lead to suboptimal shopping +experience for buyers, due to marketplace monopoly by the broker and limited +choice of products in the marketplace. In this position paper, we argue that a +Horizontal two-sided market with Multiple Marketplaces (HMM), resembling the +global stock market, should be developed. In an HMM, different marketplaces may +be operated by different parties and sell similar and/or different types of +products, e.g., computing and/or sensory data products. A broker is no longer a +built-in component of any given marketplace. Instead, it may cover multiple +marketplaces at the same time and there can be more than one broker in the HMM. +Both the number and types of marketplaces and brokers may grow independently or +scale horizontally to meet the growing demand. A buyer shops for a broker +through whom the buyer gains access to the needed products sold in the +marketplace(s) the broker covers and from whom the buyer receives various +possible services, e.g., discount, value-added, or full services. An HMM not +only overcomes the key weakness of a VSM but also allows the market to grow +incrementally and organically. Finally, two example use cases are given to +illustrate the benefits of HMM. + +
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ A Contract Theory for Layered Control Architectures + + +
+ Autonomous systems typically leverage layered control architectures with a +combination of discrete and continuous models operating at different +timescales. As a result, layered systems form a new class of hybrid systems +composed of systems operating on a diverse set of continuous and discrete +signals. This paper formalizes the notion of a layered (hierarchical) control +architecture through a theory of relations between its layers. This theory +enables us to formulate contracts within layered control systems -- these +define interfaces between layers and isolate the design of each layer, +guaranteeing that composition of contracts at each layer results in a contract +capturing the desired system-wide specification. Thus, the proposed theory +yields the ability to analyze layered control architectures via a compositional +approach. + +
+
+
+
+
+ + ☆ Optimal state estimation: Turnpike analysis and performance results + + +
+ In this paper, we introduce turnpike arguments in the context of optimal +state estimation. In particular, we show that the optimal solution of the state +estimation problem involving all available past data serves as turnpike for the +solutions of truncated problems involving only a subset of the data. We +consider two different mathematical characterizations of this phenomenon and +provide corresponding sufficient conditions that rely on strict dissipativity +and decaying sensitivity. As second contribution, we show how a specific +turnpike property can be used to establish performance guarantees when +approximating the optimal solution of the full problem by a sequence of +truncated problems, and we show that the resulting performance (both averaged +and non-averaged) is approximately optimal with error terms that can be made +arbitrarily small by an appropriate choice of the horizon length. In addition, +we discuss interesting implications of these results for the practically +relevant case of moving horizon estimation and illustrate our results with a +numerical example. + +
+
+
+
+
+ + ☆ Towards Real-world Deployment of NILM Systems: Challenges and Practices + + +
+ Non-intrusive load monitoring (NILM), as a key load monitoring technology, +can much reduce the deployment cost of traditional power sensors. Previous +research has largely focused on developing cloud-exclusive NILM algorithms, +which often result in high computation costs and significant service delays. To +address these issues, we propose a three-tier framework to enhance the +real-world applicability of NILM systems through edge-cloud collaboration. +Considering the computational resources available at both the edge and cloud, +we implement a lightweight NILM model at the edge and a deep learning based +model at the cloud, respectively. In addition to the differential model +implementations, we also design a NILM-specific deployment scheme that +integrates Gunicorn and NGINX to bridge the gap between theoretical algorithms +and practical applications. To verify the effectiveness of the proposed +framework, we apply real-world NILM scenario settings and implement the entire +process of data acquisition, model training, and system deployment. The results +demonstrate that our framework can achieve high decomposition accuracy while +significantly reducing the cloud workload and communication overhead under +practical considerations. + +
+
+
+
+
+ + ☆ AmpAgent: An LLM-based Multi-Agent System for Multi-stage Amplifier + Schematic Design from Literature for Process and Performance Porting + + +
+ Multi-stage amplifiers are widely applied in analog circuits. However, their +large number of components, complex transfer functions, and intricate pole-zero +distributions necessitate extensive manpower for derivation and param sizing to +ensure their stability. In order to achieve efficient derivation of the +transfer function and simplify the difficulty of circuit design, we propose +AmpAgent: a multi-agent system based on large language models (LLMs) for +efficiently designing such complex amplifiers from literature with process and +performance porting. AmpAgent is composed of three agents: Literature Analysis +Agent, Mathematics Reasoning Agent and Device Sizing Agent. They are separately +responsible for retrieving key information (e.g. formulas and transfer +functions) from the literature, decompose the whole circuit's design problem by +deriving the key formulas, and address the decomposed problem iteratively. + AmpAgent was employed in the schematic design of seven types of multi-stage +amplifiers with different compensation techniques. In terms of design +efficiency, AmpAgent has reduced the number of iterations by 1.32$ \sim +$4${\times}$ and execution time by 1.19$ \sim $2.99${\times}$ compared to +conventional optimization algorithms, with a success rate increased by 1.03$ +\sim $6.79${\times}$. In terms of circuit performance, it has improved by 1.63$ +\sim $27.25${\times}$ compared to the original literature. The findings suggest +that LLMs could play a crucial role in the field of complex analog circuit +schematic design, as well as process and performance porting. + +
+
+
+
+
+ + ☆ A Generalized Control Revision Method for Autonomous Driving Safety + + +
+ Safety is one of the most crucial challenges of autonomous driving vehicles, +and one solution to guarantee safety is to employ an additional control +revision module after the planning backbone. Control Barrier Function (CBF) has +been widely used because of its strong mathematical foundation on safety. +However, the incompatibility with heterogeneous perception data and incomplete +consideration of traffic scene elements make existing systems hard to be +applied in dynamic and complex real-world scenarios. In this study, we +introduce a generalized control revision method for autonomous driving safety, +which adopts both vectorized perception and occupancy grid map as inputs and +comprehensively models multiple types of traffic scene constraints based on a +new proposed barrier function. Traffic elements are integrated into one unified +framework, decoupled from specific scenario settings or rules. Experiments on +CARLA, SUMO, and OnSite simulator prove that the proposed algorithm could +realize safe control revision under complicated scenes, adapting to various +planning backbones, road topologies, and risk types. Physical platform +validation also verifies the real-world application feasibility. + +
+
+
+
+
+ + ☆ Maintaining Strong $r$-Robustness in Reconfigurable Multi-Robot Networks + using Control Barrier Functions ICRA + + +
+ In leader-follower consensus, strong $r$-robustness of the communication +graph provides a sufficient condition for followers to achieve consensus in the +presence of misbehaving agents. Previous studies have assumed that robots can +form and/or switch between predetermined network topologies with known +robustness properties. However, robots with distance-based communication models +may not be able to achieve these topologies while moving through spatially +constrained environments, such as narrow corridors, to complete their +objectives. This paper introduces a Control Barrier Function (CBF) that ensures +robots maintain strong $r$-robustness of their communication graph above a +certain threshold without maintaining any fixed topologies. Our CBF directly +addresses robustness, allowing robots to have flexible reconfigurable network +structure while navigating to achieve their objectives. The efficacy of our +method is tested through various simulation and hardware experiments. + +
+
+ comment: Submitted to IEEE International Conference on Robotics and Automation + (ICRA) 2025 +
+
+
+
+
+ + ☆ Impedance Control for Manipulators Handling Heavy Payloads + + +
+ Attaching a heavy payload to the wrist force/moment (F/M) sensor of a +manipulator can cause conventional impedance controllers to fail in +establishing the desired impedance due to the presence of non-contact forces; +namely, the inertial and gravitational forces of the payload. This paper +presents an impedance control scheme designed to accurately shape the +force-response of such a manipulator without requiring acceleration +measurements. As a result, neither wrist accelerometers nor dynamic estimators +for compensating inertial load forces are necessary. The proposed controller +employs an inner-outer loop feedback structure, which not only addresses +uncertainties in the robot's dynamics but also enables the specification of a +general target impedance model, including nonlinear models. Stability and +convergence of the controller are analytically proven, with results showing +that the control input remains bounded as long as the desired inertia differs +from the payload inertia. Experimental results confirm that the proposed +impedance controller effectively shapes the impedance of a manipulator carrying +a heavy load according to the desired impedance model. + +
+
+
+
+
+ + ♻ ☆ Physics-Informed Multi-Agent Reinforcement Learning for Distributed + Multi-Robot Problems + + +
+ The networked nature of multi-robot systems presents challenges in the +context of multi-agent reinforcement learning. Centralized control policies do +not scale with increasing numbers of robots, whereas independent control +policies do not exploit the information provided by other robots, exhibiting +poor performance in cooperative-competitive tasks. In this work we propose a +physics-informed reinforcement learning approach able to learn distributed +multi-robot control policies that are both scalable and make use of all the +available information to each robot. Our approach has three key +characteristics. First, it imposes a port-Hamiltonian structure on the policy +representation, respecting energy conservation properties of physical robot +systems and the networked nature of robot team interactions. Second, it uses +self-attention to ensure a sparse policy representation able to handle +time-varying information at each robot from the interaction graph. Third, we +present a soft actor-critic reinforcement learning algorithm parameterized by +our self-attention port-Hamiltonian control policy, which accounts for the +correlation among robots during training while overcoming the need of value +function factorization. Extensive simulations in different multi-robot +scenarios demonstrate the success of the proposed approach, surpassing previous +multi-robot reinforcement learning solutions in scalability, while achieving +similar or superior performance (with averaged cumulative reward up to x2 +greater than the state-of-the-art with robot teams x6 larger than the number of +robots at training time). + +
+
+ comment: This paper is under review at IEEE T-RO +
+
+
+
+
+ + ♻ ☆ Distributed Differentiable Dynamic Game for Multi-robot Coordination + + +
+ This paper develops a Distributed Differentiable Dynamic Game (D3G) +framework, which can efficiently solve the forward and inverse problems in +multi-robot coordination. We formulate multi-robot coordination as a dynamic +game, where the behavior of a robot is dictated by its own dynamics and +objective that also depends on others' behavior. In the forward problem, D3G +enables all robots collaboratively to seek the Nash equilibrium of the game in +a distributed manner, by developing a distributed shooting-based Nash solver. +In the inverse problem, where each robot aims to find (learn) its objective +(and dynamics) parameters to mimic given coordination demonstrations, D3G +proposes a differentiation solver based on Differential Pontryagin's Maximum +Principle, which allows each robot to update its parameters in a distributed +and coordinated manner. We test the D3G in simulation with two types of robots +given different task configurations. The results demonstrate the effectiveness +of D3G for solving both forward and inverse problems in comparison with +existing methods. + +
+
+
+
+
+ + ♻ ☆ GraspSAM: When Segment Anything Model Meets Grasp Detection + + +
+ Grasp detection requires flexibility to handle objects of various shapes +without relying on prior knowledge of the object, while also offering +intuitive, user-guided control. This paper introduces GraspSAM, an innovative +extension of the Segment Anything Model (SAM), designed for prompt-driven and +category-agnostic grasp detection. Unlike previous methods, which are often +limited by small-scale training data, GraspSAM leverages the large-scale +training and prompt-based segmentation capabilities of SAM to efficiently +support both target-object and category-agnostic grasping. By utilizing +adapters, learnable token embeddings, and a lightweight modified decoder, +GraspSAM requires minimal fine-tuning to integrate object segmentation and +grasp prediction into a unified framework. The model achieves state-of-the-art +(SOTA) performance across multiple datasets, including Jacquard, +Grasp-Anything, and Grasp-Anything++. Extensive experiments demonstrate the +flexibility of GraspSAM in handling different types of prompts (such as points, +boxes, and language), highlighting its robustness and effectiveness in +real-world robotic applications. + +
+
+ comment: 6 pages (main), 1 page (references) +
+
+
+
+
+ + ♻ ☆ Context-Conditioned Spatio-Temporal Predictive Learning for Reliable V2V + Channel Prediction + + +
+ Achieving reliable multidimensional Vehicle-to-Vehicle (V2V) channel state +information (CSI) prediction is both challenging and crucial for optimizing +downstream tasks that depend on instantaneous CSI. This work extends +traditional prediction approaches by focusing on four-dimensional (4D) CSI, +which includes predictions over time, bandwidth, and antenna (TX and RX) space. +Such a comprehensive framework is essential for addressing the dynamic nature +of mobility environments within intelligent transportation systems, +necessitating the capture of both temporal and spatial dependencies across +diverse domains. To address this complexity, we propose a novel +context-conditioned spatiotemporal predictive learning method. This method +leverages causal convolutional long short-term memory (CA-ConvLSTM) to +effectively capture dependencies within 4D CSI data, and incorporates +context-conditioned attention mechanisms to enhance the efficiency of +spatiotemporal memory updates. Additionally, we introduce an adaptive +meta-learning scheme tailored for recurrent networks to mitigate the issue of +accumulative prediction errors. We validate the proposed method through +empirical studies conducted across three different geometric configurations and +mobility scenarios. Our results demonstrate that the proposed approach +outperforms existing state-of-the-art predictive models, achieving superior +performance across various geometries. Moreover, we show that the meta-learning +framework significantly enhances the performance of recurrent-based predictive +models in highly challenging cross-geometry settings, thus highlighting its +robustness and adaptability. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Systems and Control 19 + +
+
+
+ + ☆ Learning to Refine Input Constrained Control Barrier Functions via + Uncertainty-Aware Online Parameter Adaptation + + +
+ Control Barrier Functions (CBFs) have become powerful tools for ensuring +safety in nonlinear systems. However, finding valid CBFs that guarantee +persistent safety and feasibility remains an open challenge, especially in +systems with input constraints. Traditional approaches often rely on manually +tuning the parameters of the class K functions of the CBF conditions a priori. +The performance of CBF-based controllers is highly sensitive to these fixed +parameters, potentially leading to overly conservative behavior or safety +violations. To overcome these issues, this paper introduces a learning-based +optimal control framework for online adaptation of Input Constrained CBF +(ICCBF) parameters in discrete-time nonlinear systems. Our method employs a +probabilistic ensemble neural network to predict the performance and risk +metrics, as defined in this work, for candidate parameters, accounting for both +epistemic and aleatoric uncertainties. We propose a two-step verification +process using Jensen-Renyi Divergence and distributionally-robust Conditional +Value at Risk to identify valid parameters. This enables dynamic refinement of +ICCBF parameters based on current state and nearby environments, optimizing +performance while ensuring safety within the verified parameter set. +Experimental results demonstrate that our method outperforms both +fixed-parameter and existing adaptive methods in robot navigation scenarios +across safety and performance metrics. + +
+
+ comment: Project page: https://www.taekyung.me/online-adaptive-cbf +
+
+
+
+
+ + ☆ Domain knowledge-guided machine learning framework for state of health + estimation in Lithium-ion batteries + + +
+ Accurate estimation of battery state of health is crucial for effective +electric vehicle battery management. Here, we propose five health indicators +that can be extracted online from real-world electric vehicle operation and +develop a machine learning-based method to estimate the battery state of +health. The proposed indicators provide physical insights into the energy and +power fade of the battery and enable accurate capacity estimation even with +partially missing data. Moreover, they can be computed for portions of the +charging profile and real-world driving discharging conditions, facilitating +real-time battery degradation estimation. The indicators are computed using +experimental data from five cells aged under electric vehicle conditions, and a +linear regression model is used to estimate the state of health. The results +show that models trained with power autocorrelation and energy-based features +achieve capacity estimation with maximum absolute percentage error within 1.5% +to 2.5% . + +
+
+
+
+
+ + ☆ Goal-Oriented Communications for Interplanetary and Non-Terrestrial + Networks + + +
+ The accelerated pace of space exploration and satellite connectivity calls +for scalable communication network architectures that can effectively cater for +increasing numbers of bursty flows, such as those occurring in remote +monitoring and actuation. Communications in Space face unique challenges +including highly variable delays and disruptions that sometimes preclude +real-time signaling and end-to-end acknowledgements. In this paper we provide a +vision for tackling these fundamental challenges by exploiting recent progress +in goal-oriented communication. Our vision for Goal-Oriented Networking in +Space is built on three pillars: (1) principles and decision metrics for +goal-oriented sampling and multi-user scheduling, that can handle highly +variable delay processes that contain memory, (2) grant-free access policies +for massive machine-type communications that replace exogenous arrivals with +goal-oriented traffic shaping, and (3) flow control mechanisms that exploit the +cross-layer operability at application and link layers of Delay/Disruption +Tolerant Networking (DTN) protocols. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Distributed Primal-Dual Interior Point Framework for Analyzing + Infeasible Combined Transmission and Distribution Grid Networks + + +
+ The proliferation of distributed energy resources has heightened the +interactions between transmission and distribution (T&D) systems, necessitating +novel analyses for the reliable operation and planning of interconnected T&D +networks. A critical gap is an analysis approach that identifies and localizes +the weak spots in the combined T\&D networks, providing valuable information to +system planners and operators. The research goal is to efficiently model and +simulate infeasible (i.e. unsolvable in general settings) combined positive +sequence transmission and three-phase distribution networks with a unified +solution algorithm. We model the combined T&D network with the equivalent +circuit formulation. To solve the overall T&D network, we build a +Gauss-Jacobi-Newton (GJN) based distributed primal dual interior point +optimization algorithm capable of isolating weak nodes. We validate the +approach on large combined T&D networks with 70k+ T and 15k+ D nodes and +demonstrate performance improvement over the alternating direction method of +multipliers (ADMM) method. + +
+
+
+
+
+ + ☆ A Review of Scalable and Privacy-Preserving Multi-Agent Frameworks for + Distributed Energy Resource Control + + +
+ Distributed energy resources (DERs) are gaining prominence due to their +advantages in improving energy efficiency, reducing carbon emissions, and +enhancing grid resilience. Despite the increasing deployment, the potential of +DERs has yet to be fully explored and exploited. A fundamental question +restrains the management of numerous DERs in large-scale power systems, "How +should DER data be securely processed and DER operations be efficiently +optimized?" To address this question, this paper considers two critical issues, +namely privacy for processing DER data and scalability in optimizing DER +operations, then surveys existing and emerging solutions from a multi-agent +framework perspective. In the context of scalability, this paper reviews +state-of-the-art research that relies on parallel control, optimization, and +learning within distributed and/or decentralized information exchange +structures, while in the context of privacy, it identifies privacy preservation +measures that can be synthesized into the aforementioned scalable structures. +Despite research advances in these areas, challenges remain because these +highly interdisciplinary studies blend a wide variety of scalable computing +architectures and privacy preservation techniques from different fields, making +them difficult to adapt in practice. To mitigate this issue, this paper +provides a holistic review of trending strategies that orchestrate privacy and +scalability for large-scale power system operations from a multi-agent +perspective, particularly for DER control problems. Furthermore, this review +extrapolates new approaches for future scalable, privacy-aware, and cybersecure +pathways to unlock the full potential of DERs through controlling, optimizing, +and learning generic multi-agent-based cyber-physical systems. + +
+
+
+
+
+ + ☆ A Unified Approach for Learning the Dynamics of Power System Generators + and Inverter-based Resources + + +
+ The growing prevalence of inverter-based resources (IBRs) for renewable +energy integration and electrification greatly challenges power system dynamic +analysis. To account for both synchronous generators (SGs) and IBRs, this work +presents an approach for learning the model of an individual dynamic component. +The recurrent neural network (RNN) model is used to match the recursive +structure in predicting the key dynamical states of a component from its +terminal bus voltage and set-point input. To deal with the fast transients +especially due to IBRs, we develop a Stable Integral (SI-)RNN to mimic +high-order integral methods that can enhance the stability and accuracy for the +dynamic learning task. We demonstrate that the proposed SI-RNN model not only +can successfully predict the component's dynamic behaviors, but also offers the +possibility of efficiently computing the dynamic sensitivity relative to a +set-point change. These capabilities have been numerically validated based on +full-order Electromagnetic Transient (EMT) simulations on a small test system +with both SGs and IBRs, particularly for predicting the dynamics of +grid-forming inverters. + +
+
+
+
+
+ + ☆ Few-Shot Testing of Autonomous Vehicles with Scenario Similarity + Learning + + +
+ Testing and evaluation are critical to the development and deployment of +autonomous vehicles (AVs). Given the rarity of safety-critical events such as +crashes, millions of tests are typically needed to accurately assess AV safety +performance. Although techniques like importance sampling can accelerate this +process, it usually still requires too many numbers of tests for field testing. +This severely hinders the testing and evaluation process, especially for +third-party testers and governmental bodies with very limited testing budgets. +The rapid development cycles of AV technology further exacerbate this +challenge. To fill this research gap, this paper introduces the few-shot +testing (FST) problem and proposes a methodological framework to tackle it. As +the testing budget is very limited, usually smaller than 100, the FST method +transforms the testing scenario generation problem from probabilistic sampling +to deterministic optimization, reducing the uncertainty of testing results. To +optimize the selection of testing scenarios, a cross-attention similarity +mechanism is proposed to learn to extract the information of AV's testing +scenario space. This allows iterative searches for scenarios with the smallest +evaluation error, ensuring precise testing within budget constraints. +Experimental results in cut-in scenarios demonstrate the effectiveness of the +FST method, significantly enhancing accuracy and enabling efficient, precise AV +testing. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Robust Data-Driven Tube-Based Zonotopic Predictive Control with + Closed-Loop Guarantees + + +
+ This work proposes a robust data-driven tube-based zonotopic predictive +control (TZPC) approach for discrete-time linear systems, designed to ensure +stability and recursive feasibility in the presence of bounded noise. The +proposed approach consists of two phases. In an initial learning phase, we +provide an over-approximation of all models consistent with past input and +noisy state data using zonotope properties. Subsequently, in a control phase, +we formulate an optimization problem, which by integrating terminal ingredients +is proven to be recursively feasible. Moreover, we prove that implementing this +data-driven predictive control approach guarantees robust exponential stability +of the closed-loop system. The effectiveness and competitive performance of the +proposed control strategy, compared to recent data-driven predictive control +methods, are illustrated through numerical simulations. + +
+
+ comment: Accepted for presentation and publication at the 63rd IEEE Conference + on Decision and Control (CDC) +
+
+
+
+
+ + ☆ Adapting Gait Frequency for Posture-regulating Humanoid Push-recovery + via Hierarchical Model Predictive Control + + +
+ Current humanoid push-recovery strategies often use whole-body motion, yet +posture regulation is often overlooked. For instance, during manipulation +tasks, the upper body may need to stay upright and have minimal recovery +displacement. This paper introduces a novel approach to enhancing humanoid +push-recovery performance under unknown disturbances and regulating body +posture by tailoring the recovery stepping strategy. We propose a +hierarchical-MPC-based scheme that analyzes and detects instability in the +prediction window and quickly recovers through adapting gait frequency. Our +approach integrates a high-level nonlinear MPC, a posture-aware gait frequency +adaptation planner, and a low-level convex locomotion MPC. The planners predict +the center of mass (CoM) state trajectories that can be assessed for precursors +of potential instability and posture deviation. In simulation, we demonstrate +improved maximum recoverable impulse by 131% on average compared with baseline +approaches. In hardware experiments, a 125 ms advancement in recovery stepping +timing/reflex has been observed with the proposed approach, We also demonstrate +improved push-recovery performance and minimized attitude change under 0.2 rad. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Cluster-based Network Time Synchronization for Resilience with Energy + Efficiency + + +
+ Time synchronization of devices in Internet-of-Things (IoT) networks is one +of the challenging problems and a pre-requisite for the design of low-latency +applications. Although many existing solutions have tried to address this +problem, almost all solutions assume all the devices (nodes) in the network are +faultless. Furthermore, these solutions exchange a large number of messages to +achieve synchronization, leading to significant communication and energy +overhead. To address these shortcomings, we propose C-sync, a clustering-based +decentralized time synchronization protocol that provides resilience against +several types of faults with energy-efficient communication. C-sync achieves +scalability by introducing multiple reference nodes in the network that +restrict the maximum number of hops any node can have to its time source. The +protocol is designed with a modular structure on the Contiki platform to allow +application transitions. We evaluate C-sync on a real testbed that comprises +over 40 Tmote Sky hardware nodes distributed across different levels in a +building and show through experiments the fault resilience, energy efficiency, +and scalability of the protocol. C-sync detects and isolates faults to a +cluster and recovers quickly. The evaluation makes a qualitative comparison +with state-of-the-art protocols and a quantitative comparison with a class of +decentralized protocols (derived from GTSP) that provide synchronization with +no/limited fault-tolerance. Results also show a reduction of 56.12% and 75.75% +in power consumption in the worst-case and best-case scenarios, respectively, +compared to GTSP, while achieving similar accuracy. + +
+
+
+
+
+ + ☆ Exploring the Use of Contingency for Nuclear Electrical Studies + + +
+ This paper examines the use of contingency analysis for a nuclear power plant +to determine its potential benefits for the nuclear industry. Various N-1 +contingencies were analyzed for a model of an existing nuclear plant, primarily +inspecting voltage violations resulting from a failure. Remedial Actions +Schemes were suggested to support the reduction of voltage violations in the +event of a failure within the system. Many of the schemes presented were solved +by existing redundancies and protection schemes that have been provided through +the use of industry standard bounding analysis in the design process. This +paper proposes the future use of real-time contingency analysis for nuclear +power plants, conducted using constantly updating voltage, current, and power +measurements through the system. This will provide real-time information of the +system and can serve as historical data to reduce the analysis needed for +pending design changes in the plant. + +
+
+
+
+
+ + ☆ A novel load distribution strategy for aggregators using IoT-enabled + mobile devices + + +
+ The rapid proliferation of Internet-of-things (IoT) as well as mobile devices +such as Electric Vehicles (EVs), has led to unpredictable load at the grid. The +demand to supply ratio is particularly exacerbated at a few grid aggregators +(charging stations) with excessive demand due to the geographic location, peak +time, etc. Existing solutions on demand response cannot achieve significant +improvements based only on time-shifting the loads without considering the +device properties such as charging modes and movement capabilities to enable +geographic migration. Additionally, the information on the spare capacity at a +few aggregators can aid in re-channeling the load from other aggregators facing +excess demand to allow migration of devices. In this paper, we model these +flexible properties of the devices as a mixed-integer non-linear problem +(MINLP) to minimize excess load and the improve the utility (benefit) across +all devices. We propose an online distributed low-complexity heuristic that +prioritizes devices based on demand and deadlines to minimize the cumulative +loss in utility. The proposed heuristic is tested on an exhaustive set of +synthetic data and compared with solutions from a solver/optimization tool for +the same runtime to show the impracticality of using a solver. A real-world EV +testbed data is also tested with our proposed solution and other scheduling +solutions to show the practicality of generating a feasible schedule and a loss +improvement of at least 57.23%. + +
+
+
+
+
+ + ♻ ☆ Asymmetry underlies stability in power grids + + +
+ Behavioral homogeneity is often critical for the functioning of network +systems of interacting entities. In power grids, whose stable operation +requires generator frequencies to be synchronized--and thus homogeneous--across +the network, previous work suggests that the stability of synchronous states +can be improved by making the generators homogeneous. Here, we show that a +substantial additional improvement is possible by instead making the generators +suitably heterogeneous. We develop a general method for attributing this +counterintuitive effect to converse symmetry breaking, a recently established +phenomenon in which the system must be asymmetric to maintain a stable +symmetric state. These findings constitute the first demonstration of converse +symmetry breaking in real-world systems, and our method promises to enable +identification of this phenomenon in other networks whose functions rely on +behavioral homogeneity. + +
+
+ comment: Updated to correct the damping parameters in Fig. 1 and its caption, + which were inadvertently over-rounded in the original version. The published + version of the Article has also been updated with this correction +
+
+
+
+
+ + ♻ ☆ Train-On-Request: An On-Device Continual Learning Workflow for Adaptive + Real-World Brain Machine Interfaces + + +
+ Brain-machine interfaces (BMIs) are expanding beyond clinical settings thanks +to advances in hardware and algorithms. However, they still face challenges in +user-friendliness and signal variability. Classification models need periodic +adaptation for real-life use, making an optimal re-training strategy essential +to maximize user acceptance and maintain high performance. We propose TOR, a +train-on-request workflow that enables user-specific model adaptation to novel +conditions, addressing signal variability over time. Using continual learning, +TOR preserves knowledge across sessions and mitigates inter-session +variability. With TOR, users can refine, on demand, the model through on-device +learning (ODL) to enhance accuracy adapting to changing conditions. We evaluate +the proposed methodology on a motor-movement dataset recorded with a +non-stigmatizing wearable BMI headband, achieving up to 92% accuracy and a +re-calibration time as low as 1.6 minutes, a 46% reduction compared to a naive +transfer learning workflow. We additionally demonstrate that TOR is suitable +for ODL in extreme edge settings by deploying the training procedure on a +RISC-V ultra-low-power SoC (GAP9), resulting in 21.6 ms of latency and 1 mJ of +energy consumption per training step. To the best of our knowledge, this work +is the first demonstration of an online, energy-efficient, dynamic adaptation +of a BMI model to the intrinsic variability of EEG signals in real-time +settings. + +
+
+ comment: 5 pages, 6 figures, to be published in 2024 IEEE Biomedical Circuits + and Systems Conference (BioCAS) +
+
+
+
+
+ + ♻ ☆ Excitable Nonlinear Opinion Dynamics (E-NOD) for Agile Decision-Making + + +
+ We present Excitable Nonlinear Opinion Dynamics (E-NOD), which describe +opinion-forming and decision-making behavior with superior "agility" in +responding and adapting to fast and unpredictable changes in context, +environment, or information about available options. E-NOD is derived by +introducing a single extra term to the previously presented Nonlinear Opinion +Dynamics (NOD), which have been shown to enable fast and flexible multi-agent +behavior. This extra term is inspired by the fast-positive, slow-negative +mixed-feedback structure of excitable systems. The agile behaviors resulting +from the excitable nature of decision-making driven by E-NOD are analyzed in a +general setting and illustrated through an application to robot navigation +around human movers. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Model-Free Learning and Optimal Policy Design in Multi-Agent MDPs Under + Probabilistic Agent Dropout + + +
+ This work studies a multi-agent Markov decision process (MDP) that can +undergo agent dropout and the computation of policies for the post-dropout +system based on control and sampling of the pre-dropout system. The central +planner's objective is to find an optimal policy that maximizes the value of +the expected system given a priori knowledge of the agents' dropout +probabilities. For MDPs with a certain transition independence and reward +separability structure, we assume that removing agents from the system forms a +new MDP comprised of the remaining agents with new state and action spaces, +transition dynamics that marginalize the removed agents, and rewards that are +independent of the removed agents. We first show that under these assumptions, +the value of the expected post-dropout system can be represented by a single +MDP; this "robust MDP" eliminates the need to evaluate all $2^N$ realizations +of the system, where N denotes the number of agents. More significantly, in a +model-free context, it is shown that the robust MDP value can be estimated with +samples generated by the pre-dropout system, meaning that robust policies can +be found before dropout occurs. This fact is used to propose a policy +importance sampling (IS) routine that performs policy evaluation for dropout +scenarios while controlling the existing system with good pre-dropout policies. +The policy IS routine produces value estimates for both the robust MDP and +specific post-dropout system realizations and is justified with exponential +confidence bounds. Finally, the utility of this approach is verified in +simulation, showing how structural properties of agent dropout can help a +controller find good post-dropout policies before dropout occurs. + +
+
+ comment: 22 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ FIRE: A Failure-Adaptive Reinforcement Learning Framework for Edge + Computing Migrations + + +
+ In edge computing, users' service profiles are migrated due to user mobility. +Reinforcement learning (RL) frameworks have been proposed to do so, often +trained on simulated data. However, existing RL frameworks overlook occasional +server failures, which although rare, impact latency-sensitive applications +like autonomous driving and real-time obstacle detection. Nevertheless, these +failures (rare events), being not adequately represented in historical training +data, pose a challenge for data-driven RL algorithms. As it is impractical to +adjust failure frequency in real-world applications for training, we introduce +FIRE, a framework that adapts to rare events by training a RL policy in an edge +computing digital twin environment. We propose ImRE, an importance +sampling-based Q-learning algorithm, which samples rare events proportionally +to their impact on the value function. FIRE considers delay, migration, +failure, and backup placement costs across individual and shared service +profiles. We prove ImRE's boundedness and convergence to optimality. Next, we +introduce novel deep Q-learning (ImDQL) and actor critic (ImACRE) versions of +our algorithm to enhance scalability. We extend our framework to accommodate +users with varying risk tolerances. Through trace driven experiments, we show +that FIRE reduces costs compared to vanilla RL and the greedy baseline in the +event of failures. + +
+
+
+
+
+ + ♻ ☆ Continuous Dynamic Bipedal Jumping via Real-time Variable-model + Optimization + + +
+ Dynamic and continuous jumping remains an open yet challenging problem in +bipedal robot control. Real-time planning with full body dynamics over the +entire jumping trajectory presents unsolved challenges in computation burden. +In this paper, we propose a novel variable-model optimization approach, a +unified framework of variable-model trajectory optimization (TO) and +variable-frequency Model Predictive Control (MPC), to effectively realize +continuous and robust jumping planning and control on HECTOR bipedal robot in +real-time. The proposed TO fuses variable-fidelity dynamics modeling of bipedal +jumping motion in different jumping phases to balance trajectory accuracy and +real-time computation efficiency. In addition, conventional fixed-frequency +control approaches suffer from unsynchronized sampling frequencies, leading to +mismatched modeling resolutions. We address this by aligning the MPC sampling +frequency with the variable-model TO trajectory resolutions across different +phases. In hardware experiments, we have demonstrated robust and dynamic jumps +covering a distance of up to 40 cm (57% of robot height). To verify the +repeatability of this experiment, we run 53 jumping experiments and achieve 90% +success rate. In continuous jumps, we demonstrate continuous bipedal jumping +with terrain height perturbations (up to 5 cm) and discontinuities (up to 20 cm +gap). + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Agile Decision-Making and Safety-Critical Motion Planning for Emergency + Autonomous Vehicles + + +
+ Efficiency is critical for autonomous vehicles (AVs), especially for +emergency AVs. However, most existing methods focus on regular vehicles, +overlooking the distinct strategies required by emergency vehicles to address +the challenge of maximizing efficiency while ensuring safety. In this paper, we +propose an Integrated Agile Decision-Making with Active and Safety-Critical +Motion Planning System (IDEAM). IDEAM focuses on enabling emergency AVs, such +as ambulances, to actively attain efficiency in dense traffic scenarios with +safety in mind. Firstly, the speed-centric decision-making algorithm named the +long short-term spatio-temporal graph-centric decision-making (LSGM) is given. +LSGM comprises conditional depth-first search (C-DFS) for multiple paths +generation as well as methods for speed gains and risk evaluation for path +selection, which presents a robust algorithm for high efficiency and safety +consideration. Secondly, with an output path from LSGM, the motion planner +reconsiders environmental conditions to decide constraints states for the final +planning stage, among which the lane-probing state is designed for actively +attaining spatial and speed advantage. Thirdly, under the Frenet-based model +predictive control (MPC) framework with final constraints state and selected +path, the safety-critical motion planner employs decoupled discrete control +barrier functions (DCBFs) and linearized discrete-time high-order control +barrier functions (DHOCBFs) to model the constraints associated with different +driving behaviors, making the optimal optimization problem convex. Finally, we +extensively validate our system using scenarios from a randomly synthetic +dataset, demonstrating its capability to achieve speed benefits and assure +safety simultaneously. + +
+
+
+
+
+
+
+ + + + + + diff --git a/index.js b/index.js new file mode 100644 index 0000000..69f5da7 --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`